Merge pull request #993 from cyian-1756/chanRipperCleanUP
Cleaned up chan ripper and removed dead chans
Commit: 00cab660a4
ChanRipper.java
@@ -17,10 +17,16 @@ import org.jsoup.nodes.Element;
 
 public class ChanRipper extends AbstractHTMLRipper {
 
     private static List<ChanSite> explicit_domains = Arrays.asList(
-            new ChanSite(Arrays.asList("boards.4chan.org"), Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
-            new ChanSite(Arrays.asList("4archive.org"), Arrays.asList("imgur.com")),
-            new ChanSite(Arrays.asList("archive.4plebs.org"), Arrays.asList("img.4plebs.org")),
-            new ChanSite(Arrays.asList("yuki.la"), Arrays.asList("55chan.org"))
+            new ChanSite("boards.4chan.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
+            new ChanSite("4archive.org", "imgur.com"),
+            new ChanSite("archive.4plebs.org", "img.4plebs.org"),
+            new ChanSite("yuki.la", "ii.yuki.la"),
+            new ChanSite("55chan.org"),
+            new ChanSite("desuchan.net"),
+            new ChanSite("boards.420chan.org"),
+            new ChanSite("7chan.org"),
+            new ChanSite("desuarchive.org", "desu-usergeneratedcontent.xyz"),
+            new ChanSite("8ch.net", "media.8ch.net")
     );
 
     private static List<String> url_piece_blacklist = Arrays.asList(
@@ -85,27 +91,6 @@ public class ChanRipper extends AbstractHTMLRipper {
         }
     }
 
-        if (url.toExternalForm().contains("desuchan.net") && url.toExternalForm().contains("/res/")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("boards.420chan.org") && url.toExternalForm().contains("/res/")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("7chan.org") && url.toExternalForm().contains("/res/")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("xchan.pw") && url.toExternalForm().contains("/board/")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("desuarchive.org")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("8ch.net") && url.toExternalForm().contains("/res/")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("55chan.org") && url.toExternalForm().contains("/res/")) {
-            return true;
-        }
         return false;
     }
 
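With the dead sites dropped and the live ones folded into explicit_domains above, the per-site URL sniffing is no longer needed. A minimal sketch of the table-driven check that replaces this if-chain (a hypothetical helper for illustration only; the actual lookup logic lives elsewhere in ChanRipper):

    // Hypothetical sketch -- assumes ChanSite exposes its domains list,
    // which the constructors in this diff populate via Arrays.asList().
    private static boolean isExplicitDomain(URL url) {
        String host = url.getHost();
        for (ChanSite chanSite : explicit_domains) {
            for (String domain : chanSite.domains) {
                if (host.equals(domain)) {
                    return true;
                }
            }
        }
        return false;
    }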
@@ -209,7 +194,7 @@ public class ChanRipper extends AbstractHTMLRipper {
         }
 
         if (self_hosted || generalChanSite) {
-            p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE);
+            p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm|mp4)$", Pattern.CASE_INSENSITIVE);
             m = p.matcher(href);
             if (m.matches()) {
                 if (href.startsWith("//")) {
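The only change in this hunk widens the media pattern to pick up .mp4 links. A quick standalone check of the new pattern (the sample hrefs here are illustrative, not taken from the diff):

    import java.util.regex.Pattern;

    public class MediaPatternDemo {
        public static void main(String[] args) {
            Pattern p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm|mp4)$",
                    Pattern.CASE_INSENSITIVE);
            System.out.println(p.matcher("//media.8ch.net/file_store/clip.mp4").matches()); // true (newly accepted)
            System.out.println(p.matcher("//is2.4chan.org/hr/1534523456.jpg").matches());   // true
            System.out.println(p.matcher("/hr/thread/3015701").matches());                  // false (no media extension)
        }
    }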
ChanSite.java
@@ -1,5 +1,6 @@
 package com.rarchives.ripme.ripper.rippers.ripperhelpers;
 
+import java.util.Arrays;
 import java.util.List;
 
 public class ChanSite {
@@ -19,6 +20,36 @@ public class ChanSite {
         cdnDomains = CdnDomains;
     }
 
+    public ChanSite(String Domain, List<String> CdnDomains) {
+        if (Domain.isEmpty()) {
+            throw new IllegalArgumentException("Domains");
+        }
+        if (CdnDomains.isEmpty()) {
+            throw new IllegalArgumentException("CdnDomains");
+        }
+        domains = Arrays.asList(Domain);
+        cdnDomains = CdnDomains;
+    }
+
+    public ChanSite(String Domain, String CdnDomain) {
+        if (Domain.isEmpty()) {
+            throw new IllegalArgumentException("Domains");
+        }
+        if (CdnDomain.isEmpty()) {
+            throw new IllegalArgumentException("CdnDomains");
+        }
+        domains = Arrays.asList(Domain);
+        cdnDomains = Arrays.asList(CdnDomain);
+    }
+
+    public ChanSite(String Domain) {
+        if (Domain.isEmpty()) {
+            throw new IllegalArgumentException("Domains");
+        }
+        domains = Arrays.asList(Domain);
+        cdnDomains = Arrays.asList(Domain);
+    }
+
     public ChanSite(List<String> Domains) {
         if (Domains.isEmpty()) {
             throw new IllegalArgumentException("Domains");
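These overloads are what let the explicit_domains table in ChanRipper.java shed its Arrays.asList() noise. A short usage sketch of the construction forms, matching what the constructors in this diff assign:

    // Domain plus a list of CDN domains:
    new ChanSite("boards.4chan.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org"));
    // Domain plus a single CDN domain:
    new ChanSite("archive.4plebs.org", "img.4plebs.org");
    // Domain only -- the site hosts its own media, so cdnDomains defaults to the domain itself:
    new ChanSite("55chan.org");
    // The original list-based form is still available:
    new ChanSite(Arrays.asList("boards.4chan.org"));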
ChanRipperTest.java
@@ -1,11 +1,14 @@
 package com.rarchives.ripme.tst.ripper.rippers;
 
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;
 
 import com.rarchives.ripme.ripper.rippers.ChanRipper;
+import com.rarchives.ripme.utils.Http;
+import org.jsoup.nodes.Document;
 
 public class ChanRipperTest extends RippersTest {
 
@@ -29,7 +32,6 @@ public class ChanRipperTest extends RippersTest {
         passURLs.add(new URL("https://boards.4chan.org/hr/thread/3015701"));
         passURLs.add(new URL("https://boards.420chan.org/420/res/232066.php"));
         passURLs.add(new URL("http://7chan.org/gif/res/25873.html"));
-        passURLs.add(new URL("https://xchan.pw/board/porn/thread/874116/"));
         for (URL url : passURLs) {
             ChanRipper ripper = new ChanRipper(url);
             ripper.setup();
@@ -42,24 +44,26 @@ public class ChanRipperTest extends RippersTest {
 
     public void testChanRipper() throws IOException {
         List<URL> contentURLs = new ArrayList<>();
-        // URLs that should return more than 1 image
-        //contentURLs.add(new URL("http://desuchan.net/v/res/7034.html"));
-        //contentURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php"));
-        //contentURLs.add(new URL("http://archive.4plebs.org/s4s/thread/3005257/"));
-        //contentURLs.add(new URL("http://drawchan.net/dc/dw/res/114910.html"));
-
-        // Most *chans have volatile threads & can't be trusted for integration testing.
-
-        //contentURLs.add(new URL("http://boards.4chan.org/r/res/12225949"));
-        //contentURLs.add(new URL("http://7chan.org/gif/res/23795.html"));
-        //contentURLs.add(new URL("http://unichan2.org/b/res/518004.html"));
-
-        // xchan has an HTTPS certificaiton error...
-        //contentURLs.add(new URL("http://xchan.pw/porn/res/437.html"));
+        contentURLs.add(new URL(getRandomThreadDesuarchive()));
         for (URL url : contentURLs) {
             ChanRipper ripper = new ChanRipper(url);
-            testRipper(ripper);
+            testChanRipper(ripper);
         }
     }
 
+    /**
+     * @return String returns a url to an active desuarchive.org thread as a string
+     */
+    public String getRandomThreadDesuarchive() {
+        try {
+            Document doc = Http.url(new URL("https://desuarchive.org/wsg/")).get();
+            System.out.println(doc);
+            return doc.select("div.post_data > a").first().attr("href");
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        return null;
+    }
 }
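One caveat worth noting: getRandomThreadDesuarchive() returns null when the fetch fails, and new URL(null) then throws MalformedURLException (presumably why that import was added). A guarded variant, if the desuarchive fetch proves flaky -- hypothetical, not part of this diff:

    String thread = getRandomThreadDesuarchive();
    if (thread != null) {
        contentURLs.add(new URL(thread));
    }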
RippersTest.java
@@ -2,7 +2,9 @@ package com.rarchives.ripme.tst.ripper.rippers;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.List;
 
+import com.rarchives.ripme.ripper.rippers.ChanRipper;
 import junit.framework.TestCase;
 
 import org.apache.log4j.ConsoleAppender;
@@ -52,6 +54,38 @@ public class RippersTest extends TestCase {
         }
     }
 
+    // We have a special test for chan rippers because we can't assume that content will be downloadable, as content
+    // is often removed within mere hours of it being posted. So instead of trying to download any content we just check
+    // that we found links to it
+    void testChanRipper(ChanRipper ripper) {
+        try {
+            // Turn on Debug logging
+            ((ConsoleAppender) Logger.getRootLogger().getAppender("stdout")).setThreshold(Level.DEBUG);
+
+            // Decrease timeout
+            Utils.setConfigInteger("page.timeout", 20 * 1000);
+
+            ripper.setup();
+            ripper.markAsTest();
+            List<String> foundUrls = ripper.getURLsFromPage(ripper.getFirstPage());
+            assertTrue("Failed to find single url on page " + ripper.getURL(), foundUrls.size() >= 1);
+        } catch (IOException e) {
+            if (e.getMessage().contains("Ripping interrupted")) {
+                // We expect some rips to get interrupted
+            } else {
+                e.printStackTrace();
+                fail("Failed to rip " + ripper.getURL() + " : " + e.getMessage());
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail("Failed to rip " + ripper.getURL() + " : " + e.getMessage());
+        } finally {
+            deleteDir(ripper.getWorkingDir());
+        }
+    }
+
     /** File extensions that are safe to delete. */
     private static final String[] SAFE_EXTENSIONS =
             {"png", "jpg", "jpeg", "gif",
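The helper is exercised from ChanRipperTest above; a minimal call site, reusing one of the pass URLs from this diff:

    ChanRipper ripper = new ChanRipper(new URL("https://boards.4chan.org/hr/thread/3015701"));
    testChanRipper(ripper); // asserts that at least one media link was found; nothing is downloaded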