Made ChanRipper more universal.
Added a nice way to add extra chan sites. This makes sure that the files are the correct ones (self_hosted). Generic sites still work. Check http://www.allchans.org/ sometime. And 4chan-x for the list of archives. The "Can't rip this url" error now gives the message. Added ChanSite Helper class. Updates ChanRipperTest urls. testVineboxAlbums is still failing.
This commit is contained in:
parent
e46e6733fa
commit
972c1dc75f
@ -12,12 +12,48 @@ import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
|
||||
import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
|
||||
import com.rarchives.ripme.utils.Http;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class ChanRipper extends AbstractHTMLRipper {
|
||||
|
||||
|
||||
//ArrayList<String> explicit_domains = new ArrayList<String>();
// Chans whose layout is known. Each entry maps the thread-hosting domain(s)
// to the CDN domain(s) the images are served from (see ChanSite).
public static List<ChanSite> explicit_domains = Arrays.asList(
    //Tested (main boards)
    //Untested (main boards)
    new ChanSite(Arrays.asList("anon-ib.com")),
    new ChanSite(Arrays.asList("boards.4chan.org"),Arrays.asList("4cdn.org")),
    //Tested (archives)
    new ChanSite(Arrays.asList("archive.moe"),Arrays.asList("data.archive.moe")), //4chan archive (successor of foolz archive) Archives: [ a / biz / c / co / diy / gd / i / int / jp / m / mlp / out / po / q / s4s / sci / sp / tg / tv / v / vg / vp / vr / wsg ]
    //Untested (archives)new ChanSite(Arrays.asList("anon-ib.com")),
    new ChanSite(Arrays.asList("4archive.org"),Arrays.asList("imgur.com")), //4chan archive (on demand)
    new ChanSite(Arrays.asList("archive.4plebs.org"),Arrays.asList("img.4plebs.org")), //4chan archive Archives: [ adv / f / hr / o / pol / s4s / tg / trv / tv / x ] Boards: [ plebs ]
    new ChanSite(Arrays.asList("fgts.jp"),Arrays.asList("dat.fgts.jp")) //4chan archive Archives: [ asp / cm / h / hc / hm / n / p / r / s / soc / y ]
);
// URL fragments that mark a link as a redirect/search helper rather than an
// image; any href containing one of these is skipped in getURLsFromPage.
public static List<String> url_piece_blacklist = Arrays.asList(
    "=http",
    "http://imgops.com/",
    "iqdb.org",
    "saucenao.com"
);

// The site matched for the URL being ripped (set in the constructor).
public ChanSite chanSite;
// True when the URL did not match any explicit_domains entry and a
// generic same-domain ChanSite fallback is in use.
public Boolean generalChanSite = true;
|
||||
/**
 * Creates a ripper for the given chan thread URL.
 * Matches the URL's host against {@link #explicit_domains}; if a known site
 * is found its CDN domain list is used, otherwise a generic ChanSite is
 * built from the URL's own host.
 *
 * @param url thread URL to rip
 * @throws IOException propagated from the superclass constructor
 */
public ChanRipper(URL url) throws IOException {
    super(url);
    for (ChanSite _chanSite : explicit_domains) {
        for (String host : _chanSite.domains) {
            if (url.getHost().equals(host)) {
                chanSite = _chanSite;
                generalChanSite = false;
            }
        }
    }
    if (chanSite == null) {
        // Generic fallback: treat the URL's own host as both the thread and
        // image domain. (Bug fix: previously the literal string
        // "url.getHost()" was passed instead of calling url.getHost().)
        chanSite = new ChanSite(Arrays.asList(url.getHost()));
    }
}
|
||||
|
||||
@Override
|
||||
@ -33,39 +69,40 @@ public class ChanRipper extends AbstractHTMLRipper {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean canRip(URL url) {
|
||||
// TODO Whitelist?
|
||||
if (url.getHost().equals("anon-ib.com")) {
|
||||
return true;
|
||||
public boolean canRip(URL url) {
|
||||
//explicit_domains testing
|
||||
for (ChanSite _chanSite : explicit_domains) {
|
||||
for (String host : _chanSite.domains) {
|
||||
if (url.getHost().equals(host)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return url.getHost().contains("chan") &&
|
||||
( url.toExternalForm().contains("/res/") // Most chans
|
||||
|| url.toExternalForm().contains("/thread/")); // 4chan
|
||||
//It'll fail further down the road.
|
||||
return url.toExternalForm().contains("/res/") // Most chans
|
||||
|| url.toExternalForm().contains("/thread/"); // 4chan, archive.moe
|
||||
}
|
||||
|
||||
/**
|
||||
* For example the achrives are all known. (Check 4chan-x)
|
||||
* Should be based on the software the specific chan uses.
|
||||
* FoolFuuka uses the same (url) layout as 4chan
|
||||
* */
|
||||
@Override
|
||||
public String getGID(URL url) throws MalformedURLException {
|
||||
Pattern p; Matcher m;
|
||||
|
||||
String u = url.toExternalForm();
|
||||
if (u.contains("/res/")) {
|
||||
p = Pattern.compile("^.*(chan|anon-ib).*\\.[a-z]{2,3}/[a-zA-Z0-9/]+/res/([0-9]+)(\\.html|\\.php)?.*$");
|
||||
String u = url.toExternalForm();
|
||||
if (u.contains("/thread/")||u.contains("/res/")) {
|
||||
p = Pattern.compile("^.*\\.[a-z]{1,3}/[a-zA-Z0-9]+/(thread|res)/([0-9]+)(\\.html|\\.php)?.*$");
|
||||
m = p.matcher(u);
|
||||
if (m.matches()) {
|
||||
return m.group(2);
|
||||
}
|
||||
}
|
||||
else if (u.contains("/thread/")) {
|
||||
p = Pattern.compile("^.*chan.*\\.[a-z]{2,3}/[a-zA-Z0-9]+/thread/([0-9]+)(\\.html|\\.php)?.*$");
|
||||
m = p.matcher(u);
|
||||
if (m.matches()) {
|
||||
return m.group(1);
|
||||
}
|
||||
}
|
||||
|
||||
throw new MalformedURLException(
|
||||
"Expected *chan URL formats: "
|
||||
+ "*chan.com/@/res/####.html"
|
||||
+ ".*/@/(res|thread)/####.html"
|
||||
+ " Got: " + u);
|
||||
}
|
||||
|
||||
@ -83,37 +120,48 @@ public class ChanRipper extends AbstractHTMLRipper {
|
||||
public List<String> getURLsFromPage(Document page) {
|
||||
List<String> imageURLs = new ArrayList<String>();
|
||||
Pattern p; Matcher m;
|
||||
elementloop:
|
||||
for (Element link : page.select("a")) {
|
||||
if (!link.hasAttr("href")) {
|
||||
continue;
|
||||
}
|
||||
if (!link.attr("href").contains("/src/")
|
||||
&& !link.attr("href").contains("4cdn.org")) {
|
||||
logger.debug("Skipping link that does not contain /src/: " + link.attr("href"));
|
||||
continue;
|
||||
String href = link.attr("href");
|
||||
|
||||
//Check all blacklist items
|
||||
for(String blacklist_item : url_piece_blacklist){
|
||||
if (href.contains(blacklist_item)){
|
||||
logger.debug("Skipping link that contains '"+blacklist_item+"': " + href);
|
||||
continue elementloop;
|
||||
}
|
||||
}
|
||||
if (link.attr("href").contains("=http")
|
||||
|| link.attr("href").contains("http://imgops.com/")) {
|
||||
logger.debug("Skipping link that contains '=http' or 'imgops.com': " + link.attr("href"));
|
||||
continue;
|
||||
Boolean self_hosted = false;
|
||||
if(!generalChanSite){
|
||||
for(String cdnDomain : chanSite.cdnDomains){
|
||||
if (href.contains(cdnDomain)){
|
||||
self_hosted = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|webm)$", Pattern.CASE_INSENSITIVE);
|
||||
m = p.matcher(link.attr("href"));
|
||||
if (m.matches()) {
|
||||
String image = link.attr("href");
|
||||
if (image.startsWith("//")) {
|
||||
image = "http:" + image;
|
||||
if(self_hosted||generalChanSite){
|
||||
p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE);
|
||||
m = p.matcher(href);
|
||||
if (m.matches()) {
|
||||
if (href.startsWith("//")) {
|
||||
href = "http:" + href;
|
||||
}
|
||||
if (href.startsWith("/")) {
|
||||
href = "http://" + this.url.getHost() + href;
|
||||
}
|
||||
// Don't download the same URL twice
|
||||
if (imageURLs.contains(href)) {
|
||||
logger.debug("Already attempted: " + href);
|
||||
continue;
|
||||
}
|
||||
imageURLs.add(href);
|
||||
}
|
||||
if (image.startsWith("/")) {
|
||||
image = "http://" + this.url.getHost() + image;
|
||||
}
|
||||
// Don't download the same URL twice
|
||||
if (imageURLs.contains(image)) {
|
||||
logger.debug("Already attempted: " + image);
|
||||
continue;
|
||||
}
|
||||
imageURLs.add(image);
|
||||
}
|
||||
} else {
|
||||
//TODO also grab imgur/flickr albums (And all other supported rippers) Maybe add a setting?
|
||||
}
|
||||
}
|
||||
return imageURLs;
|
||||
}
|
||||
@ -121,6 +169,5 @@ public class ChanRipper extends AbstractHTMLRipper {
|
||||
// Queues a single image URL for download, prefixed by its position index.
@Override
public void downloadURL(URL url, int index) {
    addURLToDownload(url, getPrefix(index));
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* To change this license header, choose License Headers in Project Properties.
|
||||
* To change this template file, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package com.rarchives.ripme.ripper.rippers.ripperhelpers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Describes a chan site: the domain(s) where threads are hosted and the
 * CDN domain(s) the images are served from. Used by ChanRipper to decide
 * whether a linked image is self-hosted by a known site.
 *
 * @author Erwin
 */
public class ChanSite {
    //The domains where the threads are hosted.
    public final List<String> domains;
    //The domains where the images are hosted.
    public final List<String> cdnDomains;

    /**
     * @param Domains    thread-hosting domains; must be non-empty
     * @param CdnDomains image-hosting domains; must be non-empty
     * @throws IllegalArgumentException if either list is empty
     */
    public ChanSite(List<String> Domains, List<String> CdnDomains) {
        if (Domains.isEmpty()) {
            throw new IllegalArgumentException("Domains");
        }
        if (CdnDomains.isEmpty()) {
            throw new IllegalArgumentException("CdnDomains");
        }
        domains = Domains;
        cdnDomains = CdnDomains;
    }

    /**
     * Convenience constructor for sites that serve images from the same
     * domains that host the threads.
     *
     * @param Domains thread- and image-hosting domains; must be non-empty
     * @throws IllegalArgumentException if the list is empty
     */
    public ChanSite(List<String> Domains) {
        this(Domains, Domains);
    }
}
|
@ -448,7 +448,7 @@ public class MainWindow implements Runnable, RipStatusHandler {
|
||||
AbstractRipper ripper = AbstractRipper.getRipper(url);
|
||||
statusWithColor(ripper.getHost() + " album detected", Color.GREEN);
|
||||
} catch (Exception e) {
|
||||
statusWithColor("Can't rip this URL", Color.RED);
|
||||
statusWithColor("Can't rip this URL: "+e.getMessage(), Color.RED);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
@ -27,18 +27,20 @@ public class ChanRipperTest extends RippersTest {
|
||||
List<URL> passURLs = new ArrayList<URL>();
|
||||
// URLs that should work
|
||||
passURLs.add(new URL("http://desuchan.net/v/res/7034.html"));
|
||||
passURLs.add(new URL("http://boards.4chan.org/r/res/12225949"));
|
||||
passURLs.add(new URL("http://boards.4chan.org/hr/thread/2214511"));
|
||||
passURLs.add(new URL("http://fgts.jp/r/thread/12225949/"));
|
||||
passURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php"));
|
||||
passURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
|
||||
passURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
|
||||
passURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
|
||||
passURLs.add(new URL("http://archive.moe/c/thread/2295132/"));
|
||||
for (URL url : passURLs) {
|
||||
try {
|
||||
ChanRipper ripper = new ChanRipper(url);
|
||||
assert(ripper.canRip(url));
|
||||
deleteDir(ripper.getWorkingDir());
|
||||
} catch (Exception e) {
|
||||
fail("Failed to instantiate ripper for " + url);
|
||||
fail("Failed to instantiate ripper for " + url + " with message: "+e.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -55,6 +57,7 @@ public class ChanRipperTest extends RippersTest {
|
||||
contentURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
|
||||
contentURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
|
||||
contentURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
|
||||
contentURLs.add(new URL("http://archive.4plebs.org/hr/thread/2215899/"));
|
||||
for (URL url : contentURLs) {
|
||||
try {
|
||||
ChanRipper ripper = new ChanRipper(url);
|
||||
|
Loading…
Reference in New Issue
Block a user