Added motherless support, various unit test changes

The Motherless ripper gets images and videos. Unit tests now have a flag to avoid downloading content on every execution.
2014-03-06 23:41:49 -08:00 · 2014-03-06 23:41:49 -08:00 · 76d27fd199
commit 76d27fd199
parent 4a47cc650e
8 changed files with 171 additions and 1 deletions
--- a/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java
+++ b/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java
@ -14,8 +14,16 @@ public class DownloadThreadPool {
    private ExecutorService threadPool = null;

    public DownloadThreadPool() {
+        initialize("Main");
+    }
+    
+    public DownloadThreadPool(String threadPoolName) {
+        initialize(threadPoolName);
+    }
+
+    private void initialize(String threadPoolName) {
        int threads = Utils.getConfigInteger("threads.size", 10);
-        logger.debug("Initializing thread pool with " + threads + " threads");
+        logger.debug("Initializing " + threadPoolName + " thread pool with " + threads + " threads");
        threadPool = Executors.newFixedThreadPool(threads);
    }

--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java
@ -143,6 +143,9 @@ public class ImgurRipper extends AbstractRipper {
            }
        }

+        // TODO If album is empty, use this to check for cached images:
+        // http://i.rarchives.com/search.cgi?cache=http://imgur.com/a/albumID
+        // At the least, get the thumbnails.
        logger.info("[!] Falling back to elemental retrieval method");

        // Fall back to parsing HTML elements
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/MotherlessRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/MotherlessRipper.java
@ -0,0 +1,111 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import com.rarchives.ripme.ripper.AbstractRipper;
+import com.rarchives.ripme.ripper.DownloadThreadPool;
+
+/**
+ * Ripper for motherless.com galleries.
+ * <p>
+ * The gallery page is scanned for thumbnail links. Each linked page is then
+ * loaded on a dedicated thread pool, its {@code __fileurl} value is extracted,
+ * and that URL is handed to {@code addURLToDownload}.
+ */
+public class MotherlessRipper extends AbstractRipper {
+
+    private static final String DOMAIN = "motherless.com",
+                                HOST   = "motherless";
+    private static final Logger logger = Logger.getLogger(MotherlessRipper.class);
+    private static final String USER_AGENT =
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:27.0) Gecko/20100101 Firefox/27.0";
+
+    // Compiled once: Pattern instances are immutable and may be shared between threads.
+    private static final Pattern GALLERY_URL_PATTERN =
+            Pattern.compile("^https?://(www\\.)?motherless\\.com/G([A-Z0-9]{6,8}).*$");
+    private static final Pattern FILE_URL_PATTERN =
+            Pattern.compile("^.*__fileurl = '([^']{1,})';.*$", Pattern.DOTALL);
+
+    // Pool used for fetching the individual image/video pages (not the files themselves).
+    private final DownloadThreadPool motherlessThreadPool;
+
+    /**
+     * @param url gallery URL to rip
+     * @throws IOException propagated from the {@code AbstractRipper} constructor
+     */
+    public MotherlessRipper(URL url) throws IOException {
+        super(url);
+        // Name the pool so its initialization log line identifies this ripper.
+        motherlessThreadPool = new DownloadThreadPool(HOST);
+    }
+
+    /**
+     * @param url candidate URL
+     * @return true if the URL's host is motherless.com or one of its subdomains
+     */
+    @Override
+    public boolean canRip(URL url) {
+        String host = url.getHost();
+        // Require an exact match or a real subdomain so that look-alike hosts
+        // such as "notmotherless.com" are rejected.
+        return host.equals(DOMAIN) || host.endsWith("." + DOMAIN);
+    }
+
+    /**
+     * @return the short host identifier for this ripper
+     */
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    /**
+     * Reduces a gallery URL to its canonical {@code http://motherless.com/G<id>} form.
+     *
+     * @param url gallery URL in any accepted form
+     * @return the canonical gallery URL
+     * @throws MalformedURLException if no gallery ID can be extracted from {@code url}
+     */
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        String gid = getGID(url);
+        URL newURL = new URL("http://" + DOMAIN + "/G" + gid);
+        logger.debug("Sanitized URL from " + url + " to " + newURL);
+        return newURL;
+    }
+
+    /**
+     * Extracts the gallery ID (the 6-8 uppercase alphanumeric characters after "/G").
+     *
+     * @param url gallery URL
+     * @return the gallery ID, without the leading "G"
+     * @throws MalformedURLException if {@code url} is not a motherless.com gallery URL
+     */
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        Matcher m = GALLERY_URL_PATTERN.matcher(url.toExternalForm());
+        if (!m.matches()) {
+            throw new MalformedURLException("Expected URL format: http://motherless.com/GXXXXXXXX");
+        }
+        // Group 1 is the optional "www." prefix; group 2 is the gallery ID.
+        return m.group(2);
+    }
+
+    /**
+     * Loads the gallery page and schedules one page-parsing thread per thumbnail link,
+     * then blocks until the scheduled work has finished.
+     *
+     * @throws IOException if the gallery page cannot be retrieved
+     */
+    @Override
+    public void rip() throws IOException {
+        int index = 0;
+        logger.info("[ ] Retrieving " + this.url.toExternalForm());
+        Document doc = Jsoup.connect(this.url.toExternalForm())
+                            .userAgent(USER_AGENT)
+                            .get();
+        // Resolve hrefs against the site root so that site-relative and absolute
+        // links both produce a well-formed URL (plain concatenation breaks on the latter).
+        URL base = new URL("http://" + DOMAIN + "/");
+        for (Element thumb : doc.select("div.thumb a.img-container")) {
+            String href = thumb.attr("href");
+            if (href.isEmpty()) {
+                // No link target on this thumbnail; nothing to fetch.
+                continue;
+            }
+            URL pageURL;
+            try {
+                pageURL = new URL(base, href);
+            } catch (MalformedURLException e) {
+                logger.warn("[!] Skipping unparsable thumbnail link '" + href + "'", e);
+                continue;
+            }
+            index += 1;
+            // Create thread for finding image at "pageURL" page
+            motherlessThreadPool.addThread(new MotherlessImageThread(pageURL, index));
+        }
+        // First wait for the page-parsing threads of this ripper's own pool...
+        motherlessThreadPool.waitForThreads();
+        // ...then for the inherited wait (presumably the queued downloads; confirm in AbstractRipper).
+        waitForThreads();
+    }
+
+    /**
+     * Helper class to find and download images found on "image" pages
+     */
+    private class MotherlessImageThread extends Thread {
+        private final URL url;
+        private final int index;
+
+        /**
+         * @param url   page that embeds a single image or video
+         * @param index 1-based position in the gallery, used as the file name prefix
+         */
+        public MotherlessImageThread(URL url, int index) {
+            super();
+            this.url = url;
+            this.index = index;
+        }
+
+        /**
+         * Fetches the page, extracts the {@code __fileurl} value from its HTML and
+         * passes it to {@code addURLToDownload}. Failures are logged, not rethrown.
+         */
+        @Override
+        public void run() {
+            try {
+                Document doc = Jsoup.connect(this.url.toExternalForm())
+                                    .userAgent(USER_AGENT)
+                                    .get();
+                Matcher m = FILE_URL_PATTERN.matcher(doc.outerHtml());
+                if (m.matches()) {
+                    String file = m.group(1);
+                    // Zero-padded index prefix, e.g. "007_"
+                    addURLToDownload(new URL(file), String.format("%03d_", index));
+                } else {
+                    logger.warn("[!] could not find '__fileurl' at " + url);
+                }
+            } catch (IOException e) {
+                logger.error("[!] Exception while loading/parsing " + this.url, e);
+            }
+        }
+    }
+
+}
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java
@ -55,6 +55,9 @@ public class ImgurRipperTest extends RippersTest {
    }

    public void testImgurAlbums() throws IOException {
+        if (!DOWNLOAD_CONTENT) {
+            return;
+        }
        List<URL> contentURLs = new ArrayList<URL>();
        // URLs that should return more than 1 image
        contentURLs.add(new URL("http://imgur.com/a/hqJIu")); // Vertical layout
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java
@ -10,6 +10,9 @@ import com.rarchives.ripme.ripper.rippers.InstagramRipper;
 public class InstagramRipperTest extends RippersTest {
    
    public void testInstagramAlbums() throws IOException {
+        if (!DOWNLOAD_CONTENT) {
+            return;
+        }
        List<URL> contentURLs = new ArrayList<URL>();
        contentURLs.add(new URL("http://instagram.com/feelgoodincc#"));
        for (URL url : contentURLs) {
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/MotherlessRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/MotherlessRipperTest.java
@ -0,0 +1,36 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.MotherlessRipper;
+
+/**
+ * Tests for {@link MotherlessRipper}. The test body is skipped unless
+ * {@code DOWNLOAD_CONTENT} is enabled.
+ */
+public class MotherlessRipperTest extends RippersTest {
+    
+    /**
+     * Rips one image album and one video album and checks that each leaves
+     * more than one file in the ripper's working directory.
+     */
+    public void testMotherlessAlbums() throws IOException {
+        if (!DOWNLOAD_CONTENT) {
+            return;
+        }
+        List<URL> contentURLs = new ArrayList<URL>();
+
+        // Image album
+        contentURLs.add(new URL("http://motherless.com/G4DAA18D"));
+        // Video album
+        contentURLs.add(new URL("http://motherless.com/GFD0F537"));
+
+        for (URL url : contentURLs) {
+            try {
+                MotherlessRipper ripper = new MotherlessRipper(url);
+                ripper.rip();
+                // Use JUnit's assertTrue: the Java "assert" keyword is a no-op
+                // unless the JVM is started with -ea, so it could never fail the test.
+                assertTrue("Expected more than one file ripped from " + url,
+                        ripper.getWorkingDir().listFiles().length > 1);
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                e.printStackTrace();
+                fail("Error while ripping URL " + url + ": " + e.getMessage());
+            }
+        }
+    }
+
+}
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java
@ -6,6 +6,9 @@ import junit.framework.TestCase;

 public class RippersTest extends TestCase {

+    // Class-wide flag for avoiding downloading content with every unit test; set to true to run the ripper tests
+    public static final boolean DOWNLOAD_CONTENT = false;
+
    public void testNothing() {
        // Avoid complaints about no test cases in this file.
        assert(true);
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java
@ -10,6 +10,9 @@ import com.rarchives.ripme.ripper.rippers.TwitterRipper;
 public class TwitterRipperTest extends RippersTest {

    public void testTwitterAlbums() throws IOException {
+        if (!DOWNLOAD_CONTENT) {
+            return;
+        }
        List<URL> contentURLs = new ArrayList<URL>();
        //contentURLs.add(new URL("https://twitter.com/danngamber01/media"));
        contentURLs.add(new URL("https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd"));