From 972c1dc75fd45bb0b786715765f1587c104194e9 Mon Sep 17 00:00:00 2001 From: Erwin de Haan Date: Mon, 8 Sep 2014 00:36:08 +0200 Subject: [PATCH] Made ChanRipper more universal. Added a nice way to add extra chan sites. This makes sure that the files are the correct ones (self_hosted). Generic sites still work. Check http://www.allchans.org/ sometime. And 4chan-x for the list of archives. The "Can't rip this url" error now gives the message. Added ChanSite Helper class. Updates ChanRipperTest urls. testVineboxAlbums is still failing. --- .../ripme/ripper/rippers/ChanRipper.java | 141 ++++++++++++------ .../rippers/ripperhelpers/ChanSite.java | 35 +++++ .../com/rarchives/ripme/ui/MainWindow.java | 2 +- .../tst/ripper/rippers/ChanRipperTest.java | 7 +- 4 files changed, 135 insertions(+), 50 deletions(-) create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/ripperhelpers/ChanSite.java diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java index 6f16d8c4..c76e1b7f 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java @@ -12,12 +12,48 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite; import com.rarchives.ripme.utils.Http; +import java.util.Arrays; public class ChanRipper extends AbstractHTMLRipper { - + + //ArrayList explicit_domains = new ArrayList(); + public static List explicit_domains = Arrays.asList( + //Tested (main boards) + //Untested (main boards) + new ChanSite(Arrays.asList("anon-ib.com")), + new ChanSite(Arrays.asList("boards.4chan.org"),Arrays.asList("4cdn.org")), + //Tested (archives) + new ChanSite(Arrays.asList("archive.moe"),Arrays.asList("data.archive.moe")), //4chan archive (successor of foolz archive) Archives: [ a / biz / c / co / diy / gd / i / int / jp / m / mlp / out / po / q / s4s / sci / sp / tg / tv / v / vg / vp / vr / wsg ] + //Untested (archives)new ChanSite(Arrays.asList("anon-ib.com")), + new ChanSite(Arrays.asList("4archive.org"),Arrays.asList("imgur.com")), //4chan archive (on demand) + new ChanSite(Arrays.asList("archive.4plebs.org"),Arrays.asList("img.4plebs.org")), //4chan archive Archives: [ adv / f / hr / o / pol / s4s / tg / trv / tv / x ] Boards: [ plebs ] + new ChanSite(Arrays.asList("fgts.jp"),Arrays.asList("dat.fgts.jp")) //4chan archive Archives: [ asp / cm / h / hc / hm / n / p / r / s / soc / y ] + ); + public static List url_piece_blacklist = Arrays.asList( + "=http", + "http://imgops.com/", + "iqdb.org", + "saucenao.com" + ); + + public ChanSite chanSite; + public Boolean generalChanSite = true; + public ChanRipper(URL url) throws IOException { super(url); + for (ChanSite _chanSite : explicit_domains) { + for (String host : _chanSite.domains) { + if (url.getHost().equals(host)) { + chanSite = _chanSite; + generalChanSite = false; + } + } + } + if(chanSite==null){ + chanSite = new ChanSite(Arrays.asList("url.getHost()")); + } } @Override @@ -33,39 +69,40 @@ public class ChanRipper extends AbstractHTMLRipper { } @Override - public boolean canRip(URL url) { - // TODO Whitelist? - if (url.getHost().equals("anon-ib.com")) { - return true; + public boolean canRip(URL url) { + //explicit_domains testing + for (ChanSite _chanSite : explicit_domains) { + for (String host : _chanSite.domains) { + if (url.getHost().equals(host)) { + return true; + } + } } - return url.getHost().contains("chan") && - ( url.toExternalForm().contains("/res/") // Most chans - || url.toExternalForm().contains("/thread/")); // 4chan + //It'll fail further down the road. + return url.toExternalForm().contains("/res/") // Most chans + || url.toExternalForm().contains("/thread/"); // 4chan, archive.moe } - + /** + * For example the achrives are all known. (Check 4chan-x) + * Should be based on the software the specific chan uses. + * FoolFuuka uses the same (url) layout as 4chan + * */ @Override public String getGID(URL url) throws MalformedURLException { Pattern p; Matcher m; - String u = url.toExternalForm(); - if (u.contains("/res/")) { - p = Pattern.compile("^.*(chan|anon-ib).*\\.[a-z]{2,3}/[a-zA-Z0-9/]+/res/([0-9]+)(\\.html|\\.php)?.*$"); + String u = url.toExternalForm(); + if (u.contains("/thread/")||u.contains("/res/")) { + p = Pattern.compile("^.*\\.[a-z]{1,3}/[a-zA-Z0-9]+/(thread|res)/([0-9]+)(\\.html|\\.php)?.*$"); m = p.matcher(u); if (m.matches()) { return m.group(2); } } - else if (u.contains("/thread/")) { - p = Pattern.compile("^.*chan.*\\.[a-z]{2,3}/[a-zA-Z0-9]+/thread/([0-9]+)(\\.html|\\.php)?.*$"); - m = p.matcher(u); - if (m.matches()) { - return m.group(1); - } - } throw new MalformedURLException( "Expected *chan URL formats: " - + "*chan.com/@/res/####.html" + + ".*/@/(res|thread)/####.html" + " Got: " + u); } @@ -83,37 +120,48 @@ public class ChanRipper extends AbstractHTMLRipper { public List getURLsFromPage(Document page) { List imageURLs = new ArrayList(); Pattern p; Matcher m; + elementloop: for (Element link : page.select("a")) { if (!link.hasAttr("href")) { continue; } - if (!link.attr("href").contains("/src/") - && !link.attr("href").contains("4cdn.org")) { - logger.debug("Skipping link that does not contain /src/: " + link.attr("href")); - continue; + String href = link.attr("href"); + + //Check all blacklist items + for(String blacklist_item : url_piece_blacklist){ + if (href.contains(blacklist_item)){ + logger.debug("Skipping link that contains '"+blacklist_item+"': " + href); + continue elementloop; + } } - if (link.attr("href").contains("=http") - || link.attr("href").contains("http://imgops.com/")) { - logger.debug("Skipping link that contains '=http' or 'imgops.com': " + link.attr("href")); - continue; + Boolean self_hosted = false; + if(!generalChanSite){ + for(String cdnDomain : chanSite.cdnDomains){ + if (href.contains(cdnDomain)){ + self_hosted = true; + } + } } - p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|webm)$", Pattern.CASE_INSENSITIVE); - m = p.matcher(link.attr("href")); - if (m.matches()) { - String image = link.attr("href"); - if (image.startsWith("//")) { - image = "http:" + image; + if(self_hosted||generalChanSite){ + p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE); + m = p.matcher(href); + if (m.matches()) { + if (href.startsWith("//")) { + href = "http:" + href; + } + if (href.startsWith("/")) { + href = "http://" + this.url.getHost() + href; + } + // Don't download the same URL twice + if (imageURLs.contains(href)) { + logger.debug("Already attempted: " + href); + continue; + } + imageURLs.add(href); } - if (image.startsWith("/")) { - image = "http://" + this.url.getHost() + image; - } - // Don't download the same URL twice - if (imageURLs.contains(image)) { - logger.debug("Already attempted: " + image); - continue; - } - imageURLs.add(image); - } + } else { + //TODO also grab imgur/flickr albums (And all other supported rippers) Maybe add a setting? + } } return imageURLs; } @@ -121,6 +169,5 @@ public class ChanRipper extends AbstractHTMLRipper { @Override public void downloadURL(URL url, int index) { addURLToDownload(url, getPrefix(index)); - } - -} \ No newline at end of file + } +} diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ripperhelpers/ChanSite.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ripperhelpers/ChanSite.java new file mode 100644 index 00000000..f049d2f5 --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ripperhelpers/ChanSite.java @@ -0,0 +1,35 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package com.rarchives.ripme.ripper.rippers.ripperhelpers; + +import java.util.List; + +/** + * + * @author Erwin + */ +public class ChanSite { + //The domains where the threads are hosted. + public List domains; + //The domains where the images are hosted. + public List cdnDomains; + + public ChanSite(List Domains, List CdnDomains){ + if(Domains.isEmpty()) + throw new IllegalArgumentException("Domains"); + if(CdnDomains.isEmpty()) + throw new IllegalArgumentException("CdnDomains"); + domains = Domains; + cdnDomains = CdnDomains; + } + public ChanSite(List Domains){ + if(Domains.isEmpty()) + throw new IllegalArgumentException("Domains"); + domains = Domains; + cdnDomains = Domains; + } +} diff --git a/src/main/java/com/rarchives/ripme/ui/MainWindow.java b/src/main/java/com/rarchives/ripme/ui/MainWindow.java index 6a962e3a..0eebffce 100644 --- a/src/main/java/com/rarchives/ripme/ui/MainWindow.java +++ b/src/main/java/com/rarchives/ripme/ui/MainWindow.java @@ -448,7 +448,7 @@ public class MainWindow implements Runnable, RipStatusHandler { AbstractRipper ripper = AbstractRipper.getRipper(url); statusWithColor(ripper.getHost() + " album detected", Color.GREEN); } catch (Exception e) { - statusWithColor("Can't rip this URL", Color.RED); + statusWithColor("Can't rip this URL: "+e.getMessage(), Color.RED); } } }); diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java index 6f6a77c4..b1f48107 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java @@ -27,18 +27,20 @@ public class ChanRipperTest extends RippersTest { List passURLs = new ArrayList(); // URLs that should work passURLs.add(new URL("http://desuchan.net/v/res/7034.html")); - passURLs.add(new URL("http://boards.4chan.org/r/res/12225949")); + passURLs.add(new URL("http://boards.4chan.org/hr/thread/2214511")); + passURLs.add(new URL("http://fgts.jp/r/thread/12225949/")); passURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php")); passURLs.add(new URL("http://7chan.org/gif/res/23795.html")); passURLs.add(new URL("http://unichan2.org/b/res/518004.html")); passURLs.add(new URL("http://xchan.pw/porn/res/437.html")); + passURLs.add(new URL("http://archive.moe/c/thread/2295132/")); for (URL url : passURLs) { try { ChanRipper ripper = new ChanRipper(url); assert(ripper.canRip(url)); deleteDir(ripper.getWorkingDir()); } catch (Exception e) { - fail("Failed to instantiate ripper for " + url); + fail("Failed to instantiate ripper for " + url + " with message: "+e.toString()); } } } @@ -55,6 +57,7 @@ public class ChanRipperTest extends RippersTest { contentURLs.add(new URL("http://7chan.org/gif/res/23795.html")); contentURLs.add(new URL("http://unichan2.org/b/res/518004.html")); contentURLs.add(new URL("http://xchan.pw/porn/res/437.html")); + contentURLs.add(new URL("http://archive.4plebs.org/hr/thread/2215899/")); for (URL url : contentURLs) { try { ChanRipper ripper = new ChanRipper(url);