From 707f64c516567f9cfe9a3c8412c758a693b6508f Mon Sep 17 00:00:00 2001 From: 4pr0n Date: Fri, 14 Nov 2014 06:03:13 -0800 Subject: [PATCH] 1.0.91 - Cleaning up ChanRipper, fix to work with anonib --- pom.xml | 2 +- .../ripme/ripper/rippers/ChanRipper.java | 59 ++++++++----------- .../com/rarchives/ripme/ui/UpdateUtils.java | 2 +- 3 files changed, 26 insertions(+), 37 deletions(-) diff --git a/pom.xml b/pom.xml index a66e15c8..f02a83fb 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.rarchives.ripme ripme jar - 1.0.90 + 1.0.91 ripme http://rip.rarchives.com diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java index c76e1b7f..e4f27096 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java @@ -18,18 +18,12 @@ import java.util.Arrays; public class ChanRipper extends AbstractHTMLRipper { - //ArrayList explicit_domains = new ArrayList(); public static List explicit_domains = Arrays.asList( - //Tested (main boards) - //Untested (main boards) - new ChanSite(Arrays.asList("anon-ib.com")), - new ChanSite(Arrays.asList("boards.4chan.org"),Arrays.asList("4cdn.org")), - //Tested (archives) - new ChanSite(Arrays.asList("archive.moe"),Arrays.asList("data.archive.moe")), //4chan archive (successor of foolz archive) Archives: [ a / biz / c / co / diy / gd / i / int / jp / m / mlp / out / po / q / s4s / sci / sp / tg / tv / v / vg / vp / vr / wsg ] - //Untested (archives)new ChanSite(Arrays.asList("anon-ib.com")), - new ChanSite(Arrays.asList("4archive.org"),Arrays.asList("imgur.com")), //4chan archive (on demand) - new ChanSite(Arrays.asList("archive.4plebs.org"),Arrays.asList("img.4plebs.org")), //4chan archive Archives: [ adv / f / hr / o / pol / s4s / tg / trv / tv / x ] Boards: [ plebs ] - new ChanSite(Arrays.asList("fgts.jp"),Arrays.asList("dat.fgts.jp")) //4chan archive Archives: [ asp / cm / h / hc / hm / n / p / r / s / soc / y ] + new ChanSite(Arrays.asList("boards.4chan.org"), Arrays.asList("4cdn.org")), + new ChanSite(Arrays.asList("archive.moe"), Arrays.asList("data.archive.moe")), + new ChanSite(Arrays.asList("4archive.org"), Arrays.asList("imgur.com")), + new ChanSite(Arrays.asList("archive.4plebs.org"), Arrays.asList("img.4plebs.org")), + new ChanSite(Arrays.asList("fgts.jp"), Arrays.asList("dat.fgts.jp")) ); public static List url_piece_blacklist = Arrays.asList( "=http", @@ -44,15 +38,13 @@ public class ChanRipper extends AbstractHTMLRipper { public ChanRipper(URL url) throws IOException { super(url); for (ChanSite _chanSite : explicit_domains) { - for (String host : _chanSite.domains) { - if (url.getHost().equals(host)) { - chanSite = _chanSite; - generalChanSite = false; - } + if (_chanSite.domains.contains(url.getHost())) { + chanSite = _chanSite; + generalChanSite = false; } } - if(chanSite==null){ - chanSite = new ChanSite(Arrays.asList("url.getHost()")); + if (chanSite == null) { + chanSite = new ChanSite(Arrays.asList(url.getHost())); } } @@ -70,17 +62,13 @@ public class ChanRipper extends AbstractHTMLRipper { @Override public boolean canRip(URL url) { - //explicit_domains testing for (ChanSite _chanSite : explicit_domains) { - for (String host : _chanSite.domains) { - if (url.getHost().equals(host)) { - return true; - } - } + if (_chanSite.domains.contains(url.getHost())) { + return true; + } } - //It'll fail further down the road. - return url.toExternalForm().contains("/res/") // Most chans - || url.toExternalForm().contains("/thread/"); // 4chan, archive.moe + return url.toExternalForm().contains("/res/") // Most chans + || url.toExternalForm().contains("/thread/"); // 4chan, archive.moe } /** * For example the achrives are all known. (Check 4chan-x) @@ -92,7 +80,7 @@ public class ChanRipper extends AbstractHTMLRipper { Pattern p; Matcher m; String u = url.toExternalForm(); - if (u.contains("/thread/")||u.contains("/res/")) { + if (u.contains("/thread/") || u.contains("/res/")) { p = Pattern.compile("^.*\\.[a-z]{1,3}/[a-zA-Z0-9]+/(thread|res)/([0-9]+)(\\.html|\\.php)?.*$"); m = p.matcher(u); if (m.matches()) { @@ -125,27 +113,28 @@ public class ChanRipper extends AbstractHTMLRipper { if (!link.hasAttr("href")) { continue; } - String href = link.attr("href"); - + String href = link.attr("href").trim(); + //Check all blacklist items - for(String blacklist_item : url_piece_blacklist){ - if (href.contains(blacklist_item)){ + for (String blacklist_item : url_piece_blacklist) { + if (href.contains(blacklist_item)) { logger.debug("Skipping link that contains '"+blacklist_item+"': " + href); continue elementloop; } } Boolean self_hosted = false; - if(!generalChanSite){ - for(String cdnDomain : chanSite.cdnDomains){ + if (!generalChanSite) { + for (String cdnDomain : chanSite.cdnDomains) { if (href.contains(cdnDomain)){ self_hosted = true; } } } + if(self_hosted||generalChanSite){ p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE); m = p.matcher(href); - if (m.matches()) { + if (m.matches()) { if (href.startsWith("//")) { href = "http:" + href; } diff --git a/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java b/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java index 733c64a4..b49e6c90 100644 --- a/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java +++ b/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java @@ -21,7 +21,7 @@ import com.rarchives.ripme.utils.Utils; public class UpdateUtils { private static final Logger logger = Logger.getLogger(UpdateUtils.class); - private static final String DEFAULT_VERSION = "1.0.90"; + private static final String DEFAULT_VERSION = "1.0.91"; private static final String updateJsonURL = "http://rarchives.com/ripme.json"; private static final String updateJarURL = "http://rarchives.com/ripme.jar"; private static final String mainFileName = "ripme.jar";