diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
index c580690a..f44aab43 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
@@ -17,10 +17,16 @@ import org.jsoup.nodes.Element;
 
 public class ChanRipper extends AbstractHTMLRipper {
     private static List<ChanSite> explicit_domains = Arrays.asList(
-            new ChanSite(Arrays.asList("boards.4chan.org"), Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
-            new ChanSite(Arrays.asList("4archive.org"), Arrays.asList("imgur.com")),
-            new ChanSite(Arrays.asList("archive.4plebs.org"), Arrays.asList("img.4plebs.org")),
-            new ChanSite(Arrays.asList("yuki.la"), Arrays.asList("55chan.org"))
+            new ChanSite("boards.4chan.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
+            new ChanSite("4archive.org", "imgur.com"),
+            new ChanSite("archive.4plebs.org", "img.4plebs.org"),
+            new ChanSite("yuki.la", "ii.yuki.la"),
+            new ChanSite("55chan.org"),
+            new ChanSite("desuchan.net"),
+            new ChanSite("boards.420chan.org"),
+            new ChanSite("7chan.org"),
+            new ChanSite("desuarchive.org", "desu-usergeneratedcontent.xyz"),
+            new ChanSite("8ch.net", "media.8ch.net")
     );
 
     private static List<String> url_piece_blacklist = Arrays.asList(
@@ -85,27 +91,6 @@ public class ChanRipper extends AbstractHTMLRipper {
             }
         }
 
-        if (url.toExternalForm().contains("desuchan.net") && url.toExternalForm().contains("/res/")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("boards.420chan.org") && url.toExternalForm().contains("/res/")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("7chan.org") && url.toExternalForm().contains("/res/")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("xchan.pw") && url.toExternalForm().contains("/board/")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("desuarchive.org")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("8ch.net") && url.toExternalForm().contains("/res/")) {
-            return true;
-        }
-        if (url.toExternalForm().contains("55chan.org") && url.toExternalForm().contains("/res/")) {
-            return true;
-        }
         return false;
     }
 
@@ -209,7 +194,7 @@ public class ChanRipper extends AbstractHTMLRipper {
             }
 
             if (self_hosted || generalChanSite) {
-                p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE);
+                p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm|mp4)$", Pattern.CASE_INSENSITIVE);
                 m = p.matcher(href);
                 if (m.matches()) {
                     if (href.startsWith("//")) {
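Reviewer's note: the only behavioral change in the last hunk is `mp4` joining the accepted extensions. Below is a minimal, self-contained sketch of how that pattern filters candidate hrefs; the class name and sample links are invented for illustration, only the `Pattern` itself is copied from the patch, and the protocol-relative handling mirrors the `href.startsWith("//")` branch:

```java
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

public class MediaLinkFilter {
    // Same pattern as the patched ChanRipper, now including mp4.
    private static final Pattern MEDIA = Pattern.compile(
            "^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm|mp4)$",
            Pattern.CASE_INSENSITIVE);

    public static void main(String[] args) {
        List<String> hrefs = Arrays.asList(
                "//is2.4chan.org/wsg/1510000000000.webm",  // matched before and after
                "//is2.4chan.org/wsg/1510000000001.mp4",   // matched only after this change
                "/wsg/thread/1900000#p1900001");           // not a media link, skipped
        for (String href : hrefs) {
            if (MEDIA.matcher(href).matches()) {
                // Protocol-relative links get a scheme prepended, as in the ripper.
                String url = href.startsWith("//") ? "http:" + href : href;
                System.out.println("would rip: " + url);
            }
        }
    }
}
```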
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ripperhelpers/ChanSite.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ripperhelpers/ChanSite.java
index 4b4dd87f..a427123f 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ripperhelpers/ChanSite.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ripperhelpers/ChanSite.java
@@ -1,5 +1,6 @@
 package com.rarchives.ripme.ripper.rippers.ripperhelpers;
 
+import java.util.Arrays;
 import java.util.List;
 
 public class ChanSite {
@@ -19,6 +20,36 @@ public class ChanSite {
         cdnDomains = CdnDomains;
     }
 
+    public ChanSite(String Domain, List<String> CdnDomains) {
+        if (Domain.isEmpty()) {
+            throw new IllegalArgumentException("Domains");
+        }
+        if (CdnDomains.isEmpty()) {
+            throw new IllegalArgumentException("CdnDomains");
+        }
+        domains = Arrays.asList(Domain);
+        cdnDomains = CdnDomains;
+    }
+
+    public ChanSite(String Domain, String CdnDomain) {
+        if (Domain.isEmpty()) {
+            throw new IllegalArgumentException("Domains");
+        }
+        if (CdnDomain.isEmpty()) {
+            throw new IllegalArgumentException("CdnDomains");
+        }
+        domains = Arrays.asList(Domain);
+        cdnDomains = Arrays.asList(CdnDomain);
+    }
+
+    public ChanSite(String Domain) {
+        if (Domain.isEmpty()) {
+            throw new IllegalArgumentException("Domains");
+        }
+        domains = Arrays.asList(Domain);
+        cdnDomains = Arrays.asList(Domain);
+    }
+
     public ChanSite(List<String> Domains) {
         if (Domains.isEmpty()) {
             throw new IllegalArgumentException("Domains");
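For context, here is how the new overloads read at a call site next to the pre-existing list-based constructor. This is an illustrative scratch class, not part of the patch; it only assumes `ChanSite` is on the classpath:

```java
import java.util.Arrays;

import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;

public class ChanSiteExamples {
    public static void main(String[] args) {
        // One domain, several CDN hosts:
        ChanSite fourChan = new ChanSite("boards.4chan.org",
                Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org"));
        // One domain, one CDN host:
        ChanSite plebs = new ChanSite("archive.4plebs.org", "img.4plebs.org");
        // Self-hosted site: the one-argument overload reuses the domain as its own CDN.
        ChanSite sevenChan = new ChanSite("7chan.org");
        // The same 4plebs entry written against the old list-only constructor:
        ChanSite plebsVerbose = new ChanSite(Arrays.asList("archive.4plebs.org"),
                Arrays.asList("img.4plebs.org"));
    }
}
```

The overloads cut the `Arrays.asList` noise out of `explicit_domains` without changing what `ChanSite` stores.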
- //contentURLs.add(new URL("http://xchan.pw/porn/res/437.html")); + contentURLs.add(new URL(getRandomThreadDesuarchive())); for (URL url : contentURLs) { ChanRipper ripper = new ChanRipper(url); - testRipper(ripper); + testChanRipper(ripper); } } + /** + * + * @return String returns a url to a active desuarchive.org tread as a string + */ + public String getRandomThreadDesuarchive() { + try { + Document doc = Http.url(new URL("https://desuarchive.org/wsg/")).get(); + System.out.println(doc); + return doc.select("div.post_data > a").first().attr("href"); + } catch (IOException e) { + e.printStackTrace(); + } + return null; + } + } diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java index 46e5f4b5..a32cc752 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java @@ -2,7 +2,9 @@ package com.rarchives.ripme.tst.ripper.rippers; import java.io.File; import java.io.IOException; +import java.util.List; +import com.rarchives.ripme.ripper.rippers.ChanRipper; import junit.framework.TestCase; import org.apache.log4j.ConsoleAppender; @@ -52,6 +54,38 @@ public class RippersTest extends TestCase { } } + // We have a special test for chan rippers because we can't assume that content will be downloadable, as content + // is often removed within mere hours of it being posted. So instead of trying to download any content we just check + // that we found links to it + void testChanRipper(ChanRipper ripper) { + try { + // Turn on Debug logging + ((ConsoleAppender)Logger.getRootLogger().getAppender("stdout")).setThreshold(Level.DEBUG); + + // Decrease timeout + Utils.setConfigInteger("page.timeout", 20 * 1000); + + ripper.setup(); + ripper.markAsTest(); + List foundUrls = ripper.getURLsFromPage(ripper.getFirstPage()); + assertTrue("Failed to find single url on page " + ripper.getURL(), foundUrls.size() >= 1); + } catch (IOException e) { + if (e.getMessage().contains("Ripping interrupted")) { + // We expect some rips to get interrupted + } + else { + e.printStackTrace(); + fail("Failed to rip " + ripper.getURL() + " : " + e.getMessage()); + } + } catch (Exception e) { + e.printStackTrace(); + fail("Failed to rip " + ripper.getURL() + " : " + e.getMessage()); + } + finally { + deleteDir(ripper.getWorkingDir()); + } + } + /** File extensions that are safe to delete. */ private static final String[] SAFE_EXTENSIONS = {"png", "jpg", "jpeg", "gif",