Made ChanRipper more universal.
Added a simple way to register additional chan sites, which makes sure that only files hosted by the site itself (self_hosted) are downloaded. Generic sites still work. See http://www.allchans.org/ for more chans, and 4chan-x for the list of archives. The "Can't rip this URL" error now includes the exception message. Added the ChanSite helper class. Updated the ChanRipperTest URLs. testVineboxAlbums is still failing.
This commit is contained in:
parent
e46e6733fa
commit
972c1dc75f
@ -12,12 +12,48 @@ import org.jsoup.nodes.Document;
|
|||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
|
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
|
||||||
|
import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
|
||||||
import com.rarchives.ripme.utils.Http;
|
import com.rarchives.ripme.utils.Http;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
public class ChanRipper extends AbstractHTMLRipper {
|
public class ChanRipper extends AbstractHTMLRipper {
|
||||||
|
|
||||||
|
//ArrayList<String> explicit_domains = new ArrayList<String>();
|
||||||
|
public static List<ChanSite> explicit_domains = Arrays.asList(
|
||||||
|
//Tested (main boards)
|
||||||
|
//Untested (main boards)
|
||||||
|
new ChanSite(Arrays.asList("anon-ib.com")),
|
||||||
|
new ChanSite(Arrays.asList("boards.4chan.org"),Arrays.asList("4cdn.org")),
|
||||||
|
//Tested (archives)
|
||||||
|
new ChanSite(Arrays.asList("archive.moe"),Arrays.asList("data.archive.moe")), //4chan archive (successor of foolz archive) Archives: [ a / biz / c / co / diy / gd / i / int / jp / m / mlp / out / po / q / s4s / sci / sp / tg / tv / v / vg / vp / vr / wsg ]
|
||||||
|
//Untested (archives)new ChanSite(Arrays.asList("anon-ib.com")),
|
||||||
|
new ChanSite(Arrays.asList("4archive.org"),Arrays.asList("imgur.com")), //4chan archive (on demand)
|
||||||
|
new ChanSite(Arrays.asList("archive.4plebs.org"),Arrays.asList("img.4plebs.org")), //4chan archive Archives: [ adv / f / hr / o / pol / s4s / tg / trv / tv / x ] Boards: [ plebs ]
|
||||||
|
new ChanSite(Arrays.asList("fgts.jp"),Arrays.asList("dat.fgts.jp")) //4chan archive Archives: [ asp / cm / h / hc / hm / n / p / r / s / soc / y ]
|
||||||
|
);
|
||||||
|
public static List<String> url_piece_blacklist = Arrays.asList(
|
||||||
|
"=http",
|
||||||
|
"http://imgops.com/",
|
||||||
|
"iqdb.org",
|
||||||
|
"saucenao.com"
|
||||||
|
);
|
||||||
|
|
||||||
|
public ChanSite chanSite;
|
||||||
|
public Boolean generalChanSite = true;
|
||||||
|
|
||||||
public ChanRipper(URL url) throws IOException {
|
public ChanRipper(URL url) throws IOException {
|
||||||
super(url);
|
super(url);
|
||||||
|
for (ChanSite _chanSite : explicit_domains) {
|
||||||
|
for (String host : _chanSite.domains) {
|
||||||
|
if (url.getHost().equals(host)) {
|
||||||
|
chanSite = _chanSite;
|
||||||
|
generalChanSite = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(chanSite==null){
|
||||||
|
chanSite = new ChanSite(Arrays.asList("url.getHost()"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -34,38 +70,39 @@ public class ChanRipper extends AbstractHTMLRipper {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean canRip(URL url) {
|
public boolean canRip(URL url) {
|
||||||
// TODO Whitelist?
|
//explicit_domains testing
|
||||||
if (url.getHost().equals("anon-ib.com")) {
|
for (ChanSite _chanSite : explicit_domains) {
|
||||||
return true;
|
for (String host : _chanSite.domains) {
|
||||||
|
if (url.getHost().equals(host)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return url.getHost().contains("chan") &&
|
//It'll fail further down the road.
|
||||||
( url.toExternalForm().contains("/res/") // Most chans
|
return url.toExternalForm().contains("/res/") // Most chans
|
||||||
|| url.toExternalForm().contains("/thread/")); // 4chan
|
|| url.toExternalForm().contains("/thread/"); // 4chan, archive.moe
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* For example the achrives are all known. (Check 4chan-x)
|
||||||
|
* Should be based on the software the specific chan uses.
|
||||||
|
* FoolFuuka uses the same (url) layout as 4chan
|
||||||
|
* */
|
||||||
@Override
|
@Override
|
||||||
public String getGID(URL url) throws MalformedURLException {
|
public String getGID(URL url) throws MalformedURLException {
|
||||||
Pattern p; Matcher m;
|
Pattern p; Matcher m;
|
||||||
|
|
||||||
String u = url.toExternalForm();
|
String u = url.toExternalForm();
|
||||||
if (u.contains("/res/")) {
|
if (u.contains("/thread/")||u.contains("/res/")) {
|
||||||
p = Pattern.compile("^.*(chan|anon-ib).*\\.[a-z]{2,3}/[a-zA-Z0-9/]+/res/([0-9]+)(\\.html|\\.php)?.*$");
|
p = Pattern.compile("^.*\\.[a-z]{1,3}/[a-zA-Z0-9]+/(thread|res)/([0-9]+)(\\.html|\\.php)?.*$");
|
||||||
m = p.matcher(u);
|
m = p.matcher(u);
|
||||||
if (m.matches()) {
|
if (m.matches()) {
|
||||||
return m.group(2);
|
return m.group(2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (u.contains("/thread/")) {
|
|
||||||
p = Pattern.compile("^.*chan.*\\.[a-z]{2,3}/[a-zA-Z0-9]+/thread/([0-9]+)(\\.html|\\.php)?.*$");
|
|
||||||
m = p.matcher(u);
|
|
||||||
if (m.matches()) {
|
|
||||||
return m.group(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
throw new MalformedURLException(
|
throw new MalformedURLException(
|
||||||
"Expected *chan URL formats: "
|
"Expected *chan URL formats: "
|
||||||
+ "*chan.com/@/res/####.html"
|
+ ".*/@/(res|thread)/####.html"
|
||||||
+ " Got: " + u);
|
+ " Got: " + u);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -83,36 +120,47 @@ public class ChanRipper extends AbstractHTMLRipper {
|
|||||||
public List<String> getURLsFromPage(Document page) {
|
public List<String> getURLsFromPage(Document page) {
|
||||||
List<String> imageURLs = new ArrayList<String>();
|
List<String> imageURLs = new ArrayList<String>();
|
||||||
Pattern p; Matcher m;
|
Pattern p; Matcher m;
|
||||||
|
elementloop:
|
||||||
for (Element link : page.select("a")) {
|
for (Element link : page.select("a")) {
|
||||||
if (!link.hasAttr("href")) {
|
if (!link.hasAttr("href")) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!link.attr("href").contains("/src/")
|
String href = link.attr("href");
|
||||||
&& !link.attr("href").contains("4cdn.org")) {
|
|
||||||
logger.debug("Skipping link that does not contain /src/: " + link.attr("href"));
|
//Check all blacklist items
|
||||||
continue;
|
for(String blacklist_item : url_piece_blacklist){
|
||||||
|
if (href.contains(blacklist_item)){
|
||||||
|
logger.debug("Skipping link that contains '"+blacklist_item+"': " + href);
|
||||||
|
continue elementloop;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (link.attr("href").contains("=http")
|
Boolean self_hosted = false;
|
||||||
|| link.attr("href").contains("http://imgops.com/")) {
|
if(!generalChanSite){
|
||||||
logger.debug("Skipping link that contains '=http' or 'imgops.com': " + link.attr("href"));
|
for(String cdnDomain : chanSite.cdnDomains){
|
||||||
continue;
|
if (href.contains(cdnDomain)){
|
||||||
|
self_hosted = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|webm)$", Pattern.CASE_INSENSITIVE);
|
if(self_hosted||generalChanSite){
|
||||||
m = p.matcher(link.attr("href"));
|
p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE);
|
||||||
if (m.matches()) {
|
m = p.matcher(href);
|
||||||
String image = link.attr("href");
|
if (m.matches()) {
|
||||||
if (image.startsWith("//")) {
|
if (href.startsWith("//")) {
|
||||||
image = "http:" + image;
|
href = "http:" + href;
|
||||||
|
}
|
||||||
|
if (href.startsWith("/")) {
|
||||||
|
href = "http://" + this.url.getHost() + href;
|
||||||
|
}
|
||||||
|
// Don't download the same URL twice
|
||||||
|
if (imageURLs.contains(href)) {
|
||||||
|
logger.debug("Already attempted: " + href);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
imageURLs.add(href);
|
||||||
}
|
}
|
||||||
if (image.startsWith("/")) {
|
} else {
|
||||||
image = "http://" + this.url.getHost() + image;
|
//TODO also grab imgur/flickr albums (And all other supported rippers) Maybe add a setting?
|
||||||
}
|
|
||||||
// Don't download the same URL twice
|
|
||||||
if (imageURLs.contains(image)) {
|
|
||||||
logger.debug("Already attempted: " + image);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
imageURLs.add(image);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return imageURLs;
|
return imageURLs;
|
||||||
@ -122,5 +170,4 @@ public class ChanRipper extends AbstractHTMLRipper {
|
|||||||
public void downloadURL(URL url, int index) {
|
public void downloadURL(URL url, int index) {
|
||||||
addURLToDownload(url, getPrefix(index));
|
addURLToDownload(url, getPrefix(index));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -0,0 +1,35 @@
|
|||||||
|
/*
|
||||||
|
* To change this license header, choose License Headers in Project Properties.
|
||||||
|
* To change this template file, choose Tools | Templates
|
||||||
|
* and open the template in the editor.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.rarchives.ripme.ripper.rippers.ripperhelpers;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Describes one chan site: the domains that serve its threads and the
 * domains that serve its image files.
 *
 * Both lists are copied on construction and stored as unmodifiable lists,
 * so later changes to the caller's lists do not affect this object.
 *
 * @author Erwin
 */
public class ChanSite {
    //The domains where the threads are hosted.
    public List<String> domains;
    //The domains where the images are hosted.
    public List<String> cdnDomains;

    /**
     * Creates a site whose images are served from dedicated domains.
     *
     * @param domains    domains where the threads are hosted; must not be null or empty
     * @param cdnDomains domains where the images are hosted; must not be null or empty
     * @throws NullPointerException     if either list is null
     * @throws IllegalArgumentException if either list is empty
     */
    public ChanSite(List<String> domains, List<String> cdnDomains) {
        this.domains = snapshot(domains, "domains");
        this.cdnDomains = snapshot(cdnDomains, "cdnDomains");
    }

    /**
     * Creates a site that serves its images from the same domains as its threads.
     *
     * @param domains domains where both threads and images are hosted; must not be null or empty
     * @throws NullPointerException     if the list is null
     * @throws IllegalArgumentException if the list is empty
     */
    public ChanSite(List<String> domains) {
        this(domains, domains);
    }

    /** Validates a constructor argument and returns an unmodifiable copy of it. */
    private static List<String> snapshot(List<String> source, String argumentName) {
        if (source == null) {
            throw new NullPointerException(argumentName + " must not be null");
        }
        if (source.isEmpty()) {
            throw new IllegalArgumentException(argumentName + " must not be empty");
        }
        // Fully qualified on purpose: the file only imports java.util.List.
        return java.util.Collections.unmodifiableList(new java.util.ArrayList<String>(source));
    }
}
|
@ -448,7 +448,7 @@ public class MainWindow implements Runnable, RipStatusHandler {
|
|||||||
AbstractRipper ripper = AbstractRipper.getRipper(url);
|
AbstractRipper ripper = AbstractRipper.getRipper(url);
|
||||||
statusWithColor(ripper.getHost() + " album detected", Color.GREEN);
|
statusWithColor(ripper.getHost() + " album detected", Color.GREEN);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
statusWithColor("Can't rip this URL", Color.RED);
|
statusWithColor("Can't rip this URL: "+e.getMessage(), Color.RED);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -27,18 +27,20 @@ public class ChanRipperTest extends RippersTest {
|
|||||||
List<URL> passURLs = new ArrayList<URL>();
|
List<URL> passURLs = new ArrayList<URL>();
|
||||||
// URLs that should work
|
// URLs that should work
|
||||||
passURLs.add(new URL("http://desuchan.net/v/res/7034.html"));
|
passURLs.add(new URL("http://desuchan.net/v/res/7034.html"));
|
||||||
passURLs.add(new URL("http://boards.4chan.org/r/res/12225949"));
|
passURLs.add(new URL("http://boards.4chan.org/hr/thread/2214511"));
|
||||||
|
passURLs.add(new URL("http://fgts.jp/r/thread/12225949/"));
|
||||||
passURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php"));
|
passURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php"));
|
||||||
passURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
|
passURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
|
||||||
passURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
|
passURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
|
||||||
passURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
|
passURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
|
||||||
|
passURLs.add(new URL("http://archive.moe/c/thread/2295132/"));
|
||||||
for (URL url : passURLs) {
|
for (URL url : passURLs) {
|
||||||
try {
|
try {
|
||||||
ChanRipper ripper = new ChanRipper(url);
|
ChanRipper ripper = new ChanRipper(url);
|
||||||
assert(ripper.canRip(url));
|
assert(ripper.canRip(url));
|
||||||
deleteDir(ripper.getWorkingDir());
|
deleteDir(ripper.getWorkingDir());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
fail("Failed to instantiate ripper for " + url);
|
fail("Failed to instantiate ripper for " + url + " with message: "+e.toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -55,6 +57,7 @@ public class ChanRipperTest extends RippersTest {
|
|||||||
contentURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
|
contentURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
|
||||||
contentURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
|
contentURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
|
||||||
contentURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
|
contentURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
|
||||||
|
contentURLs.add(new URL("http://archive.4plebs.org/hr/thread/2215899/"));
|
||||||
for (URL url : contentURLs) {
|
for (URL url : contentURLs) {
|
||||||
try {
|
try {
|
||||||
ChanRipper ripper = new ChanRipper(url);
|
ChanRipper ripper = new ChanRipper(url);
|
||||||
|
Loading…
Reference in New Issue
Block a user