Added instagram ripper, integration tests.

Also fixed parts of the imgur ripper.
2014-03-03 00:44:07 -08:00 · 2014-03-03 00:44:07 -08:00 · e2bb412d9f
commit e2bb412d9f
parent c5c55055c2
7 changed files with 296 additions and 9 deletions
--- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
@ -178,7 +178,6 @@ public abstract class AbstractRipper
    }

    private void checkIfComplete() {
-        System.err.println("Pending: " + itemsPending.size() + ", Completed: " + itemsCompleted.size() + ", Errored: " + itemsErrored.size());
        if (!completed && itemsPending.size() == 0) {
            completed = true;
            logger.info("Rip completed!");
@ -193,6 +192,10 @@ public abstract class AbstractRipper
    public URL getURL() {
        return url;
    }
+    
+    /**
+     * Returns the directory this ripper stores its files in.
+     *
+     * @return the current value of the {@code workingDir} field
+     */
+    public File getWorkingDir() {
+        return workingDir;
+    }

    public void setWorkingDir(URL url) throws IOException {
        String path = Utils.getWorkingDirectory().getCanonicalPath();
@ -224,6 +227,7 @@ public abstract class AbstractRipper
                return ripper;
            } catch (Exception e) {
                // Incompatible rippers *will* throw exceptions during instantiation.
+                logger.error("Excepion while instantiating: " + constructor.getClass().getName(), e);
            }
        }
        throw new Exception("No compatible ripper found");
@ -245,7 +249,9 @@ public abstract class AbstractRipper
        URL classURL = urls.nextElement();
        for (File f : new File(classURL.toURI()).listFiles()) {
            String className = f.getName();
-            if (!className.endsWith(".class") || className.contains("$")) {
+            if (!className.endsWith(".class")
+                    || className.contains("$")
+                    || className.endsWith("Test.class")) {
                // Ignore non-class or nested classes.
                continue;
            }
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java
@ -22,9 +22,9 @@ public class ImgurRipper extends AbstractRipper {
    private static final String DOMAIN = "imgur.com",
                                HOST   = "imgur";
    private static final Logger logger = Logger.getLogger(ImgurRipper.class);
-    
+
    private final int SLEEP_BETWEEN_ALBUMS;
-    
+
    static enum ALBUM_TYPE {
        ALBUM,
        USER,
@ -61,6 +61,8 @@ public class ImgurRipper extends AbstractRipper {
        if (u.indexOf('#') >= 0) {
            u = u.substring(0,  u.indexOf('#'));
        }
+        // Rewrite mobile (m.) and image-host (i.) links to the main site.
+        // replaceAll() is required: String.replace() matches its argument
+        // literally, so these regex patterns would never match anything.
+        u = u.replaceAll("https?://m\\.imgur\\.com", "http://imgur.com");
+        u = u.replaceAll("https?://i\\.imgur\\.com", "http://imgur.com");
        return new URL(u);
    }

@ -204,14 +206,18 @@ public class ImgurRipper extends AbstractRipper {
            this.url = new URL("http://imgur.com/a/" + gid);
            return gid;
        }
-        p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.imgur\\.com/?$");
+        p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/?$");
        m = p.matcher(url.toExternalForm());
        if (m.matches()) {
            // Root imgur account
+            String gid = m.group(1);
+            if (gid.equals("i")) {
+                throw new MalformedURLException("Ripping i.imgur.com links not supported");
+            }
            albumType = ALBUM_TYPE.USER;
-            return m.group(1);
+            return gid;
        }
-        p = Pattern.compile("^https?://([a-zA-Z0-9\\-])\\.imgur\\.com/([a-zA-Z0-9])?$");
+        p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/([a-zA-Z0-9])?$");
        m = p.matcher(url.toExternalForm());
        if (m.matches()) {
            // Imgur account album
@ -223,9 +229,16 @@ public class ImgurRipper extends AbstractRipper {
        if (m.matches()) {
            // Series of imgur images
            albumType = ALBUM_TYPE.SERIES_OF_IMAGES;
-            return m.group(m.groupCount()).replaceAll(",", "-");
+            String gid = m.group(m.groupCount());
+            if (!gid.contains(",")) {
+                throw new MalformedURLException("Imgur image doesn't contain commas");
+            }
+            return gid.replaceAll(",", "-");
        }
        throw new MalformedURLException("Unexpected URL format: " + url.toExternalForm());
    }

+    /**
+     * Returns the kind of imgur URL this ripper was given.
+     *
+     * @return the current value of the {@code albumType} field, which
+     *         {@code getGID} assigns while parsing the URL
+     */
+    public ALBUM_TYPE getAlbumType() {
+        return albumType;
+    }
 }
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
@ -0,0 +1,140 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import com.rarchives.ripme.ripper.AbstractRipper;
+
+public class InstagramRipper extends AbstractRipper {
+
+    private static final String DOMAIN = "instagram.com",
+                                HOST   = "instagram";
+    private static final Logger logger = Logger.getLogger(ImagearnRipper.class);
+
+    public InstagramRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    @Override
+    public boolean canRip(URL url) {
+        return url.getHost().endsWith(DOMAIN);
+    }
+
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        Pattern p = Pattern.compile("^https?://instagram\\.com/p/([a-zA-Z0-9]{1,}).*$");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            // Link to photo, not the user account
+            try {
+                url = getUserPageFromImage(url);
+            } catch (Exception e) {
+                logger.error("[!] Failed to get user page from " + url, e);
+                throw new MalformedURLException("Failed to retrieve user page from " + url);
+            }
+        }
+        p = Pattern.compile("^.*instagram.com/([a-zA-Z0-9]{3,}).*$");
+        m = p.matcher(url.toExternalForm());
+        if (!m.matches()) {
+            throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url);
+        }
+        return new URL("http://statigr.am/" + m.group(1));
+    }
+    
+    private URL getUserPageFromImage(URL url) throws IOException {
+        Document doc = Jsoup.connect(url.toExternalForm()).get();
+        for (Element element : doc.select("meta[property='og:description']")) {
+            String content = element.attr("content");
+            if (content.endsWith("'s photo on Instagram")) {
+                return new URL("http://statigr.am/" + content.substring(0, content.indexOf("'")));
+            }
+        }
+        throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url);
+    }
+    
+    private String getUserID(URL url) throws IOException {
+        logger.info("   Retrieving " + url);
+        Document doc = Jsoup.connect(this.url.toExternalForm()).get();
+        for (Element element : doc.select("input[id=user_public]")) {
+            return element.attr("value");
+        }
+        throw new IOException("Unable to find userID at " + this.url);
+    }
+
+    @Override
+    public void rip() throws IOException {
+        int index = 0;
+        String userID = getUserID(this.url);
+        String baseURL = "http://statigr.am/controller_nl.php?action=getPhotoUserPublic&user_id=" + userID;
+        String params = "";
+        while (true) {
+            String url = baseURL + params;
+            logger.info("    Retrieving " + url);
+            String jsonString = Jsoup.connect(url).ignoreContentType(true).execute().body();
+            JSONObject json = new JSONObject(jsonString);
+            JSONArray datas = json.getJSONArray("data");
+            String nextMaxID = "";
+            if (datas.length() == 0) {
+                break;
+            }
+            for (int i = 0; i < datas.length(); i++) {
+                JSONObject data = (JSONObject) datas.get(i);
+                if (data.has("id")) {
+                    nextMaxID = data.getString("id");
+                }
+                if (data.has("videos")) {
+                    index += 1;
+                    String video = data.getJSONObject("videos").getJSONObject("standard_resolution").getString("url");
+                    addURLToDownload(new URL(video), String.format("%03d_", index));
+                } else if (data.has("images")) {
+                    index += 1;
+                    String image = data.getJSONObject("images").getJSONObject("standard_resolution").getString("url");
+                    // addURLToDownload(new URL(image), String.format("%03d_", index));
+                    addURLToDownload(new URL(image));
+                }
+            }
+            JSONObject pagination = json.getJSONObject("pagination");
+            if (nextMaxID.equals("")) {
+                if (!pagination.has("next_max_id")) {
+                    break;
+                } else {
+                    nextMaxID = pagination.getString("next_max_id");
+                }
+            }
+            params = "&max_id=" + nextMaxID;
+            try {
+                Thread.sleep(3000);
+            } catch (InterruptedException e) {
+                logger.error("[!] Interrupted while waiting to load next album:", e);
+                break;
+            }
+        }
+        waitForThreads();
+    }
+
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        Pattern p = Pattern.compile("^https?://statigr.am/([a-zA-Z0-9]{3,}).*$");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            return m.group(1);
+        }
+        throw new MalformedURLException("Unable to find user in " + url);
+    }
+
+}
--- a/src/test/java/com/rarchives/ripme/tst/AppTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/AppTest.java
@ -1,4 +1,4 @@
-package com.rarchives.ripme;
+package com.rarchives.ripme.tst;

 import junit.framework.Test;
 import junit.framework.TestCase;
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java
@ -0,0 +1,78 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.ImgurRipper;
+
+public class ImgurRipperTest extends RippersTest {
+
+    public void testImgurURLFailures() throws IOException {
+        List<URL> failURLs = new ArrayList<URL>();
+        // Imgur urls that should not work
+        failURLs.add(new URL("http://imgur.com"));
+        failURLs.add(new URL("http://imgur.com/"));
+        failURLs.add(new URL("http://i.imgur.com"));
+        failURLs.add(new URL("http://i.imgur.com/"));
+        failURLs.add(new URL("http://imgur.com/image"));
+        failURLs.add(new URL("http://imgur.com/image.jpg"));
+        failURLs.add(new URL("http://i.imgur.com/image.jpg"));
+        for (URL url : failURLs) {
+            try {
+                new ImgurRipper(url);
+                fail("Instantiated ripper for URL that should not work: " + url);
+            } catch (Exception e) {
+                // Expected
+                continue;
+            }
+        }
+    }
+
+    public void testImgurURLPasses() throws IOException {
+        List<URL> passURLs    = new ArrayList<URL>();
+        // Imgur URLs that should work
+        passURLs.add(new URL("http://imgur.com/a/XPd4F"));
+        passURLs.add(new URL("http://imgur.com/a/XPd4F/"));
+        passURLs.add(new URL("http://imgur.com/a/WxG6f/all"));
+        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
+        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
+        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
+        passURLs.add(new URL("http://imgur.com/YOdjht3,x5VxH9G,5juXjJ2"));
+        passURLs.add(new URL("http://markedone911.imgur.com"));
+        passURLs.add(new URL("http://markedone911.imgur.com/"));
+
+        for (URL url : passURLs) {
+            try {
+                ImgurRipper ripper = new ImgurRipper(url);
+                assertTrue(ripper.canRip(url));
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                fail("Failed to instantiate ripper for " + url);
+            }
+        }
+    }
+
+    public void testImgurAlbums() throws IOException {
+        List<URL> contentURLs = new ArrayList<URL>();
+        // URLs that should return more than 1 image
+        contentURLs.add(new URL("http://imgur.com/a/hqJIu")); // Vertical layout
+        contentURLs.add(new URL("http://imgur.com/a/dS9OQ#0")); // Horizontal layout
+        contentURLs.add(new URL("http://imgur.com/a/YpsW9#0")); // Grid layout
+        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
+        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
+        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
+        for (URL url : contentURLs) {
+            try {
+                ImgurRipper ripper = new ImgurRipper(url);
+                ripper.rip();
+                assert(ripper.getWorkingDir().listFiles().length > 1);
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                fail("Error while ripping URL " + url + ": " + e.getMessage());
+            }
+        }
+    }
+
+}
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java
@ -0,0 +1,28 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.InstagramRipper;
+
+
+/**
+ * Live download test for {@link InstagramRipper}.
+ */
+public class InstagramRipperTest extends RippersTest {
+
+    /** Ripping each account must leave more than one file in the working directory. */
+    public void testInstagramAlbums() throws IOException {
+        List<URL> contentURLs = new ArrayList<URL>();
+        contentURLs.add(new URL("http://instagram.com/feelgoodincc#"));
+        for (URL url : contentURLs) {
+            try {
+                InstagramRipper ripper = new InstagramRipper(url);
+                ripper.rip();
+                // JUnit assertions instead of the 'assert' keyword, which is
+                // skipped unless the JVM is started with -ea.
+                assertNotNull("Working directory is not listable for " + url,
+                        ripper.getWorkingDir().listFiles());
+                assertTrue("Expected more than one file for " + url,
+                        ripper.getWorkingDir().listFiles().length > 1);
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                fail("Error while ripping URL " + url + ": " + e.getMessage());
+            }
+        }
+    }
+
+}
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java
@ -0,0 +1,22 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.File;
+
+import junit.framework.TestCase;
+
+public class RippersTest extends TestCase {
+
+    protected void deleteDir(File dir) {
+        return;
+        /*
+        for (File f : dir.listFiles()) {
+            if (f.isDirectory()) {
+                deleteDir(f);
+            }
+            f.delete();
+        }
+        dir.delete();
+        //*/
+    }
+
+}