Made ChanRipper more universal.

Added a nice way to add extra chan sites. This makes sure that the files
are the correct ones (self-hosted). Generic sites still work. See
http://www.allchans.org/ for more chans, and 4chan-x for the list of archives.
The "Can't rip this url" error now gives the message.
Added ChanSite Helper class.
Updated ChanRipperTest URLs.

testVineboxAlbums is still failing.
This commit is contained in:
Erwin de Haan 2014-09-08 00:36:08 +02:00
parent e46e6733fa
commit 972c1dc75f
4 changed files with 135 additions and 50 deletions

View File

@ -12,12 +12,48 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import com.rarchives.ripme.ripper.AbstractHTMLRipper; import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
import com.rarchives.ripme.utils.Http; import com.rarchives.ripme.utils.Http;
import java.util.Arrays;
public class ChanRipper extends AbstractHTMLRipper { public class ChanRipper extends AbstractHTMLRipper {
//ArrayList<String> explicit_domains = new ArrayList<String>();
/**
 * Chan sites that are known explicitly. Each entry lists the domains that
 * host the threads and, when given, the domains that host the images.
 * The constructor and canRip compare the URL host against these thread
 * domains; a match selects that entry as the active chanSite.
 */
public static List<ChanSite> explicit_domains = Arrays.asList(
//Tested (main boards)
//Untested (main boards)
new ChanSite(Arrays.asList("anon-ib.com")),
new ChanSite(Arrays.asList("boards.4chan.org"),Arrays.asList("4cdn.org")),
//Tested (archives)
new ChanSite(Arrays.asList("archive.moe"),Arrays.asList("data.archive.moe")), //4chan archive (successor of foolz archive) Archives: [ a / biz / c / co / diy / gd / i / int / jp / m / mlp / out / po / q / s4s / sci / sp / tg / tv / v / vg / vp / vr / wsg ]
//Untested (archives)
new ChanSite(Arrays.asList("4archive.org"),Arrays.asList("imgur.com")), //4chan archive (on demand)
new ChanSite(Arrays.asList("archive.4plebs.org"),Arrays.asList("img.4plebs.org")), //4chan archive Archives: [ adv / f / hr / o / pol / s4s / tg / trv / tv / x ] Boards: [ plebs ]
new ChanSite(Arrays.asList("fgts.jp"),Arrays.asList("dat.fgts.jp")) //4chan archive Archives: [ asp / cm / h / hc / hm / n / p / r / s / soc / y ]
);
/**
 * Substrings that disqualify a link: getURLsFromPage skips every anchor
 * whose href contains one of these pieces.
 */
public static List<String> url_piece_blacklist = Arrays.asList(
"=http",
"http://imgops.com/",
"iqdb.org",
"saucenao.com"
);
/** The site definition used by this ripper; assigned in the constructor. */
public ChanSite chanSite;
/**
 * Stays true unless the constructor finds the URL host in explicit_domains.
 * While true, getURLsFromPage does not check links against the CDN domains.
 */
public Boolean generalChanSite = true;
public ChanRipper(URL url) throws IOException { public ChanRipper(URL url) throws IOException {
super(url); super(url);
for (ChanSite _chanSite : explicit_domains) {
for (String host : _chanSite.domains) {
if (url.getHost().equals(host)) {
chanSite = _chanSite;
generalChanSite = false;
}
}
}
if(chanSite==null){
chanSite = new ChanSite(Arrays.asList("url.getHost()"));
}
} }
@Override @Override
@ -34,38 +70,39 @@ public class ChanRipper extends AbstractHTMLRipper {
@Override @Override
public boolean canRip(URL url) { public boolean canRip(URL url) {
// TODO Whitelist? //explicit_domains testing
if (url.getHost().equals("anon-ib.com")) { for (ChanSite _chanSite : explicit_domains) {
return true; for (String host : _chanSite.domains) {
if (url.getHost().equals(host)) {
return true;
}
}
} }
return url.getHost().contains("chan") && //It'll fail further down the road.
( url.toExternalForm().contains("/res/") // Most chans return url.toExternalForm().contains("/res/") // Most chans
|| url.toExternalForm().contains("/thread/")); // 4chan || url.toExternalForm().contains("/thread/"); // 4chan, archive.moe
} }
/**
* For example the archives are all known. (Check 4chan-x)
* Should be based on the software the specific chan uses.
* FoolFuuka uses the same (url) layout as 4chan
* */
@Override @Override
public String getGID(URL url) throws MalformedURLException { public String getGID(URL url) throws MalformedURLException {
Pattern p; Matcher m; Pattern p; Matcher m;
String u = url.toExternalForm(); String u = url.toExternalForm();
if (u.contains("/res/")) { if (u.contains("/thread/")||u.contains("/res/")) {
p = Pattern.compile("^.*(chan|anon-ib).*\\.[a-z]{2,3}/[a-zA-Z0-9/]+/res/([0-9]+)(\\.html|\\.php)?.*$"); p = Pattern.compile("^.*\\.[a-z]{1,3}/[a-zA-Z0-9]+/(thread|res)/([0-9]+)(\\.html|\\.php)?.*$");
m = p.matcher(u); m = p.matcher(u);
if (m.matches()) { if (m.matches()) {
return m.group(2); return m.group(2);
} }
} }
else if (u.contains("/thread/")) {
p = Pattern.compile("^.*chan.*\\.[a-z]{2,3}/[a-zA-Z0-9]+/thread/([0-9]+)(\\.html|\\.php)?.*$");
m = p.matcher(u);
if (m.matches()) {
return m.group(1);
}
}
throw new MalformedURLException( throw new MalformedURLException(
"Expected *chan URL formats: " "Expected *chan URL formats: "
+ "*chan.com/@/res/####.html" + ".*/@/(res|thread)/####.html"
+ " Got: " + u); + " Got: " + u);
} }
@ -83,36 +120,47 @@ public class ChanRipper extends AbstractHTMLRipper {
public List<String> getURLsFromPage(Document page) { public List<String> getURLsFromPage(Document page) {
List<String> imageURLs = new ArrayList<String>(); List<String> imageURLs = new ArrayList<String>();
Pattern p; Matcher m; Pattern p; Matcher m;
elementloop:
for (Element link : page.select("a")) { for (Element link : page.select("a")) {
if (!link.hasAttr("href")) { if (!link.hasAttr("href")) {
continue; continue;
} }
if (!link.attr("href").contains("/src/") String href = link.attr("href");
&& !link.attr("href").contains("4cdn.org")) {
logger.debug("Skipping link that does not contain /src/: " + link.attr("href")); //Check all blacklist items
continue; for(String blacklist_item : url_piece_blacklist){
if (href.contains(blacklist_item)){
logger.debug("Skipping link that contains '"+blacklist_item+"': " + href);
continue elementloop;
}
} }
if (link.attr("href").contains("=http") Boolean self_hosted = false;
|| link.attr("href").contains("http://imgops.com/")) { if(!generalChanSite){
logger.debug("Skipping link that contains '=http' or 'imgops.com': " + link.attr("href")); for(String cdnDomain : chanSite.cdnDomains){
continue; if (href.contains(cdnDomain)){
self_hosted = true;
}
}
} }
p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|webm)$", Pattern.CASE_INSENSITIVE); if(self_hosted||generalChanSite){
m = p.matcher(link.attr("href")); p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE);
if (m.matches()) { m = p.matcher(href);
String image = link.attr("href"); if (m.matches()) {
if (image.startsWith("//")) { if (href.startsWith("//")) {
image = "http:" + image; href = "http:" + href;
}
if (href.startsWith("/")) {
href = "http://" + this.url.getHost() + href;
}
// Don't download the same URL twice
if (imageURLs.contains(href)) {
logger.debug("Already attempted: " + href);
continue;
}
imageURLs.add(href);
} }
if (image.startsWith("/")) { } else {
image = "http://" + this.url.getHost() + image; //TODO also grab imgur/flickr albums (And all other supported rippers) Maybe add a setting?
}
// Don't download the same URL twice
if (imageURLs.contains(image)) {
logger.debug("Already attempted: " + image);
continue;
}
imageURLs.add(image);
} }
} }
return imageURLs; return imageURLs;
@ -122,5 +170,4 @@ public class ChanRipper extends AbstractHTMLRipper {
public void downloadURL(URL url, int index) { public void downloadURL(URL url, int index) {
addURLToDownload(url, getPrefix(index)); addURLToDownload(url, getPrefix(index));
} }
} }

View File

@ -0,0 +1,35 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package com.rarchives.ripme.ripper.rippers.ripperhelpers;
import java.util.List;
/**
 * Describes a single chan site: the domains that serve its threads and the
 * domains that serve its images.
 *
 * @author Erwin
 */
public class ChanSite {
    //The domains where the threads are hosted.
    public List<String> domains;
    //The domains where the images are hosted.
    public List<String> cdnDomains;

    /**
     * Creates a site whose images are hosted on separate (CDN) domains.
     *
     * @param Domains    the domains where the threads are hosted
     * @param CdnDomains the domains where the images are hosted
     * @throws IllegalArgumentException if either list is null or empty
     */
    public ChanSite(List<String> Domains, List<String> CdnDomains) {
        requireNonEmpty(Domains, "Domains");
        requireNonEmpty(CdnDomains, "CdnDomains");
        domains = Domains;
        cdnDomains = CdnDomains;
    }

    /**
     * Creates a site that hosts its images on the same domains as its
     * threads; both fields refer to the given list.
     *
     * @param Domains the domains where threads and images are hosted
     * @throws IllegalArgumentException if the list is null or empty
     */
    public ChanSite(List<String> Domains) {
        this(Domains, Domains);
    }

    // Rejects null as well as empty lists, so callers always get an
    // IllegalArgumentException naming the offending argument.
    private static void requireNonEmpty(List<String> list, String argumentName) {
        if (list == null || list.isEmpty()) {
            throw new IllegalArgumentException(argumentName + " must not be null or empty");
        }
    }
}

View File

@ -448,7 +448,7 @@ public class MainWindow implements Runnable, RipStatusHandler {
AbstractRipper ripper = AbstractRipper.getRipper(url); AbstractRipper ripper = AbstractRipper.getRipper(url);
statusWithColor(ripper.getHost() + " album detected", Color.GREEN); statusWithColor(ripper.getHost() + " album detected", Color.GREEN);
} catch (Exception e) { } catch (Exception e) {
statusWithColor("Can't rip this URL", Color.RED); statusWithColor("Can't rip this URL: "+e.getMessage(), Color.RED);
} }
} }
}); });

View File

@ -27,18 +27,20 @@ public class ChanRipperTest extends RippersTest {
List<URL> passURLs = new ArrayList<URL>(); List<URL> passURLs = new ArrayList<URL>();
// URLs that should work // URLs that should work
passURLs.add(new URL("http://desuchan.net/v/res/7034.html")); passURLs.add(new URL("http://desuchan.net/v/res/7034.html"));
passURLs.add(new URL("http://boards.4chan.org/r/res/12225949")); passURLs.add(new URL("http://boards.4chan.org/hr/thread/2214511"));
passURLs.add(new URL("http://fgts.jp/r/thread/12225949/"));
passURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php")); passURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php"));
passURLs.add(new URL("http://7chan.org/gif/res/23795.html")); passURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
passURLs.add(new URL("http://unichan2.org/b/res/518004.html")); passURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
passURLs.add(new URL("http://xchan.pw/porn/res/437.html")); passURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
passURLs.add(new URL("http://archive.moe/c/thread/2295132/"));
for (URL url : passURLs) { for (URL url : passURLs) {
try { try {
ChanRipper ripper = new ChanRipper(url); ChanRipper ripper = new ChanRipper(url);
assert(ripper.canRip(url)); assert(ripper.canRip(url));
deleteDir(ripper.getWorkingDir()); deleteDir(ripper.getWorkingDir());
} catch (Exception e) { } catch (Exception e) {
fail("Failed to instantiate ripper for " + url); fail("Failed to instantiate ripper for " + url + " with message: "+e.toString());
} }
} }
} }
@ -55,6 +57,7 @@ public class ChanRipperTest extends RippersTest {
contentURLs.add(new URL("http://7chan.org/gif/res/23795.html")); contentURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
contentURLs.add(new URL("http://unichan2.org/b/res/518004.html")); contentURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
contentURLs.add(new URL("http://xchan.pw/porn/res/437.html")); contentURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
contentURLs.add(new URL("http://archive.4plebs.org/hr/thread/2215899/"));
for (URL url : contentURLs) { for (URL url : contentURLs) {
try { try {
ChanRipper ripper = new ChanRipper(url); ChanRipper ripper = new ChanRipper(url);