Made ChanRipper more universal.

Added a simple way to register extra chan sites. This makes sure that only
files hosted on the site's own image domains (self_hosted) are downloaded.
Generic sites still work. Check http://www.allchans.org/ sometime, and 4chan-x
for the list of archives.
The "Can't rip this URL" error now includes the exception message.
Added ChanSite Helper class.
Updated ChanRipperTest URLs.

testVineboxAlbums is still failing.
This commit is contained in:
Erwin de Haan 2014-09-08 00:36:08 +02:00
parent e46e6733fa
commit 972c1dc75f
4 changed files with 135 additions and 50 deletions

View File

@ -12,12 +12,48 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
import com.rarchives.ripme.utils.Http;
import java.util.Arrays;
public class ChanRipper extends AbstractHTMLRipper {
/**
 * Chan sites this ripper knows explicitly. Each entry lists the domains that
 * host the threads and, when the two-argument ChanSite constructor is used, the
 * separate domains that host the images; the one-argument form reuses the
 * thread domains for both. The constructor and canRip compare the URL host
 * against the thread domains of these entries.
 */
public static List<ChanSite> explicit_domains = Arrays.asList(
//Tested (main boards)
//Untested (main boards)
new ChanSite(Arrays.asList("anon-ib.com")),
new ChanSite(Arrays.asList("boards.4chan.org"),Arrays.asList("4cdn.org")),
//Tested (archives)
new ChanSite(Arrays.asList("archive.moe"),Arrays.asList("data.archive.moe")), //4chan archive (successor of foolz archive) Archives: [ a / biz / c / co / diy / gd / i / int / jp / m / mlp / out / po / q / s4s / sci / sp / tg / tv / v / vg / vp / vr / wsg ]
//Untested (archives)
new ChanSite(Arrays.asList("4archive.org"),Arrays.asList("imgur.com")), //4chan archive (on demand)
new ChanSite(Arrays.asList("archive.4plebs.org"),Arrays.asList("img.4plebs.org")), //4chan archive Archives: [ adv / f / hr / o / pol / s4s / tg / trv / tv / x ] Boards: [ plebs ]
new ChanSite(Arrays.asList("fgts.jp"),Arrays.asList("dat.fgts.jp")) //4chan archive Archives: [ asp / cm / h / hc / hm / n / p / r / s / soc / y ]
);
/**
 * Substrings that disqualify a link: getURLsFromPage skips every href that
 * contains one of these pieces.
 */
public static List<String> url_piece_blacklist = Arrays.asList(
"=http",
"http://imgops.com/",
"iqdb.org",
"saucenao.com"
);
/** Site definition selected by the constructor for the URL being ripped. */
public ChanSite chanSite;
/**
 * True unless the URL host matched an entry of explicit_domains. Only when this
 * is false does getURLsFromPage require a link to contain one of the site's
 * cdnDomains before it is considered.
 */
public Boolean generalChanSite = true;
public ChanRipper(URL url) throws IOException {
super(url);
for (ChanSite _chanSite : explicit_domains) {
for (String host : _chanSite.domains) {
if (url.getHost().equals(host)) {
chanSite = _chanSite;
generalChanSite = false;
}
}
}
if(chanSite==null){
chanSite = new ChanSite(Arrays.asList("url.getHost()"));
}
}
@Override
@ -34,38 +70,39 @@ public class ChanRipper extends AbstractHTMLRipper {
/**
 * Reports whether this ripper accepts the given URL: a host listed in
 * explicit_domains is accepted outright, and the remaining checks look for
 * "/res/" or "/thread/" in the URL.
 *
 * NOTE(review): the lines below look like the removed and the added version of
 * this method interleaved without +/- markers (the braces do not balance and
 * the "/res/" / "/thread/" test appears twice) - confirm against the real file
 * before relying on the exact control flow.
 *
 * @param url the URL to test
 * @return true if the URL is accepted
 */
@Override
public boolean canRip(URL url) {
// TODO Whitelist?
if (url.getHost().equals("anon-ib.com")) {
//explicit_domains testing
for (ChanSite _chanSite : explicit_domains) {
for (String host : _chanSite.domains) {
if (url.getHost().equals(host)) {
return true;
}
return url.getHost().contains("chan") &&
( url.toExternalForm().contains("/res/") // Most chans
|| url.toExternalForm().contains("/thread/")); // 4chan
}
}
//It'll fail further down the road.
return url.toExternalForm().contains("/res/") // Most chans
|| url.toExternalForm().contains("/thread/"); // 4chan, archive.moe
}
/**
 * Extracts the thread id - the run of digits that follows "/res/" or
 * "/thread/" - from a thread URL.
 *
 * For example the archives are all known. (Check 4chan-x)
 * Should be based on the software the specific chan uses.
 * FoolFuuka uses the same (url) layout as 4chan
 *
 * NOTE(review): the body below looks like the removed and the added version of
 * this method interleaved without +/- markers (the braces do not balance, and
 * both the separate "/res/" and "/thread/" patterns and the combined
 * "(thread|res)" pattern are present) - confirm against the real file.
 *
 * @param url the thread URL
 * @return the digits captured from the URL by the matching pattern
 * @throws MalformedURLException if the URL matches none of the patterns
 */
@Override
public String getGID(URL url) throws MalformedURLException {
Pattern p; Matcher m;
String u = url.toExternalForm();
if (u.contains("/res/")) {
p = Pattern.compile("^.*(chan|anon-ib).*\\.[a-z]{2,3}/[a-zA-Z0-9/]+/res/([0-9]+)(\\.html|\\.php)?.*$");
if (u.contains("/thread/")||u.contains("/res/")) {
p = Pattern.compile("^.*\\.[a-z]{1,3}/[a-zA-Z0-9]+/(thread|res)/([0-9]+)(\\.html|\\.php)?.*$");
m = p.matcher(u);
if (m.matches()) {
// Group 2 holds the digits in both patterns that carry two leading groups.
return m.group(2);
}
}
else if (u.contains("/thread/")) {
p = Pattern.compile("^.*chan.*\\.[a-z]{2,3}/[a-zA-Z0-9]+/thread/([0-9]+)(\\.html|\\.php)?.*$");
m = p.matcher(u);
if (m.matches()) {
return m.group(1);
}
}
throw new MalformedURLException(
"Expected *chan URL formats: "
+ "*chan.com/@/res/####.html"
+ ".*/@/(res|thread)/####.html"
+ " Got: " + u);
}
@ -83,36 +120,47 @@ public class ChanRipper extends AbstractHTMLRipper {
public List<String> getURLsFromPage(Document page) {
List<String> imageURLs = new ArrayList<String>();
Pattern p; Matcher m;
elementloop:
for (Element link : page.select("a")) {
if (!link.hasAttr("href")) {
continue;
}
if (!link.attr("href").contains("/src/")
&& !link.attr("href").contains("4cdn.org")) {
logger.debug("Skipping link that does not contain /src/: " + link.attr("href"));
continue;
String href = link.attr("href");
//Check all blacklist items
for(String blacklist_item : url_piece_blacklist){
if (href.contains(blacklist_item)){
logger.debug("Skipping link that contains '"+blacklist_item+"': " + href);
continue elementloop;
}
if (link.attr("href").contains("=http")
|| link.attr("href").contains("http://imgops.com/")) {
logger.debug("Skipping link that contains '=http' or 'imgops.com': " + link.attr("href"));
continue;
}
p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|webm)$", Pattern.CASE_INSENSITIVE);
m = p.matcher(link.attr("href"));
Boolean self_hosted = false;
if(!generalChanSite){
for(String cdnDomain : chanSite.cdnDomains){
if (href.contains(cdnDomain)){
self_hosted = true;
}
}
}
if(self_hosted||generalChanSite){
p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE);
m = p.matcher(href);
if (m.matches()) {
String image = link.attr("href");
if (image.startsWith("//")) {
image = "http:" + image;
if (href.startsWith("//")) {
href = "http:" + href;
}
if (image.startsWith("/")) {
image = "http://" + this.url.getHost() + image;
if (href.startsWith("/")) {
href = "http://" + this.url.getHost() + href;
}
// Don't download the same URL twice
if (imageURLs.contains(image)) {
logger.debug("Already attempted: " + image);
if (imageURLs.contains(href)) {
logger.debug("Already attempted: " + href);
continue;
}
imageURLs.add(image);
imageURLs.add(href);
}
} else {
//TODO also grab imgur/flickr albums (And all other supported rippers) Maybe add a setting?
}
}
return imageURLs;
@ -122,5 +170,4 @@ public class ChanRipper extends AbstractHTMLRipper {
/**
 * Hands the URL to addURLToDownload together with the prefix that getPrefix
 * returns for the given index.
 *
 * @param url   the URL to pass on
 * @param index the index passed to getPrefix
 */
public void downloadURL(URL url, int index) {
    this.addURLToDownload(
            url,
            this.getPrefix(index));
}
}

View File

@ -0,0 +1,35 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package com.rarchives.ripme.ripper.rippers.ripperhelpers;
import java.util.List;
/**
 * Describes one chan site: the domains that serve its threads and the domains
 * that serve its images.
 *
 * @author Erwin
 */
public class ChanSite {
    /** The domains where the threads are hosted. */
    public List<String> domains;
    /** The domains where the images are hosted. */
    public List<String> cdnDomains;

    /**
     * Creates a site whose images are served from domains other than the
     * thread domains.
     *
     * @param Domains    the domains where the threads are hosted
     * @param CdnDomains the domains where the images are hosted
     * @throws IllegalArgumentException if either list is null or empty
     */
    public ChanSite(List<String> Domains, List<String> CdnDomains) {
        // Reject null explicitly so the caller gets a descriptive
        // IllegalArgumentException instead of a bare NullPointerException.
        if (Domains == null || Domains.isEmpty()) {
            throw new IllegalArgumentException("Domains must contain at least one domain");
        }
        if (CdnDomains == null || CdnDomains.isEmpty()) {
            throw new IllegalArgumentException("CdnDomains must contain at least one domain");
        }
        domains = Domains;
        cdnDomains = CdnDomains;
    }

    /**
     * Creates a site that serves its images from the same domains as its
     * threads.
     *
     * @param Domains the domains where both threads and images are hosted
     * @throws IllegalArgumentException if the list is null or empty
     */
    public ChanSite(List<String> Domains) {
        // Delegating keeps the validation in one place; the Domains check runs
        // first, so an empty or null list is still reported as "Domains".
        this(Domains, Domains);
    }
}

View File

@ -448,7 +448,7 @@ public class MainWindow implements Runnable, RipStatusHandler {
AbstractRipper ripper = AbstractRipper.getRipper(url);
statusWithColor(ripper.getHost() + " album detected", Color.GREEN);
} catch (Exception e) {
statusWithColor("Can't rip this URL", Color.RED);
statusWithColor("Can't rip this URL: "+e.getMessage(), Color.RED);
}
}
});

View File

@ -27,18 +27,20 @@ public class ChanRipperTest extends RippersTest {
List<URL> passURLs = new ArrayList<URL>();
// URLs that should work
passURLs.add(new URL("http://desuchan.net/v/res/7034.html"));
passURLs.add(new URL("http://boards.4chan.org/r/res/12225949"));
passURLs.add(new URL("http://boards.4chan.org/hr/thread/2214511"));
passURLs.add(new URL("http://fgts.jp/r/thread/12225949/"));
passURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php"));
passURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
passURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
passURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
passURLs.add(new URL("http://archive.moe/c/thread/2295132/"));
for (URL url : passURLs) {
try {
ChanRipper ripper = new ChanRipper(url);
assert(ripper.canRip(url));
deleteDir(ripper.getWorkingDir());
} catch (Exception e) {
fail("Failed to instantiate ripper for " + url);
fail("Failed to instantiate ripper for " + url + " with message: "+e.toString());
}
}
}
@ -55,6 +57,7 @@ public class ChanRipperTest extends RippersTest {
contentURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
contentURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
contentURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
contentURLs.add(new URL("http://archive.4plebs.org/hr/thread/2215899/"));
for (URL url : contentURLs) {
try {
ChanRipper ripper = new ChanRipper(url);