Added instagram ripper, integration tests.

Also fixed parts of the imgur ripper.
This commit is contained in:
4pr0n 2014-03-03 00:44:07 -08:00
parent c5c55055c2
commit e2bb412d9f
7 changed files with 296 additions and 9 deletions

View File

@ -178,7 +178,6 @@ public abstract class AbstractRipper
}
private void checkIfComplete() {
System.err.println("Pending: " + itemsPending.size() + ", Completed: " + itemsCompleted.size() + ", Errored: " + itemsErrored.size());
if (!completed && itemsPending.size() == 0) {
completed = true;
logger.info("Rip completed!");
@ -193,6 +192,10 @@ public abstract class AbstractRipper
/**
 * Returns the URL stored in this ripper's {@code url} field.
 *
 * @return the current value of {@code url}
 */
public URL getURL() {
return url;
}
/**
 * Returns the directory stored in this ripper's {@code workingDir} field.
 *
 * @return the current value of {@code workingDir}
 */
public File getWorkingDir() {
return workingDir;
}
public void setWorkingDir(URL url) throws IOException {
String path = Utils.getWorkingDirectory().getCanonicalPath();
@ -224,6 +227,7 @@ public abstract class AbstractRipper
return ripper;
} catch (Exception e) {
// Incompatible rippers *will* throw exceptions during instantiation.
logger.error("Excepion while instantiating: " + constructor.getClass().getName(), e);
}
}
throw new Exception("No compatible ripper found");
@ -245,7 +249,9 @@ public abstract class AbstractRipper
URL classURL = urls.nextElement();
for (File f : new File(classURL.toURI()).listFiles()) {
String className = f.getName();
if (!className.endsWith(".class") || className.contains("$")) {
if (!className.endsWith(".class")
|| className.contains("$")
|| className.endsWith("Test.class")) {
// Ignore non-class or nested classes.
continue;
}

View File

@ -22,9 +22,9 @@ public class ImgurRipper extends AbstractRipper {
private static final String DOMAIN = "imgur.com",
HOST = "imgur";
private static final Logger logger = Logger.getLogger(ImgurRipper.class);
private final int SLEEP_BETWEEN_ALBUMS;
static enum ALBUM_TYPE {
ALBUM,
USER,
@ -61,6 +61,8 @@ public class ImgurRipper extends AbstractRipper {
if (u.indexOf('#') >= 0) {
u = u.substring(0, u.indexOf('#'));
}
u = u.replace("https?://m\\.imgur\\.com", "http://imgur.com");
u = u.replace("https?://i\\.imgur\\.com", "http://imgur.com");
return new URL(u);
}
@ -204,14 +206,18 @@ public class ImgurRipper extends AbstractRipper {
this.url = new URL("http://imgur.com/a/" + gid);
return gid;
}
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.imgur\\.com/?$");
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/?$");
m = p.matcher(url.toExternalForm());
if (m.matches()) {
// Root imgur account
String gid = m.group(1);
if (gid.equals("i")) {
throw new MalformedURLException("Ripping i.imgur.com links not supported");
}
albumType = ALBUM_TYPE.USER;
return m.group(1);
return gid;
}
p = Pattern.compile("^https?://([a-zA-Z0-9\\-])\\.imgur\\.com/([a-zA-Z0-9])?$");
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/([a-zA-Z0-9])?$");
m = p.matcher(url.toExternalForm());
if (m.matches()) {
// Imgur account album
@ -223,9 +229,16 @@ public class ImgurRipper extends AbstractRipper {
if (m.matches()) {
// Series of imgur images
albumType = ALBUM_TYPE.SERIES_OF_IMAGES;
return m.group(m.groupCount()).replaceAll(",", "-");
String gid = m.group(m.groupCount());
if (!gid.contains(",")) {
throw new MalformedURLException("Imgur image doesn't contain commas");
}
return gid.replaceAll(",", "-");
}
throw new MalformedURLException("Unexpected URL format: " + url.toExternalForm());
}
/**
 * Returns the album type stored in this ripper's {@code albumType} field.
 *
 * @return the current value of {@code albumType}
 */
public ALBUM_TYPE getAlbumType() {
return albumType;
}
}

View File

@ -0,0 +1,140 @@
package com.rarchives.ripme.ripper.rippers;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.rarchives.ripme.ripper.AbstractRipper;
/**
 * Rips the photos and videos of a single Instagram user.
 * <p>
 * Media is not read from instagram.com itself: user URLs are rewritten to the
 * matching statigr.am page, and the media list is paged through statigr.am's
 * JSON controller.
 */
public class InstagramRipper extends AbstractRipper {

    private static final String DOMAIN = "instagram.com",
                                HOST   = "instagram";

    // Log under this class's own name so output is attributed to the right ripper.
    private static final Logger logger = Logger.getLogger(InstagramRipper.class);

    /** Link to a single photo page (instagram.com/p/&lt;code&gt;). */
    private static final Pattern PHOTO_URL =
            Pattern.compile("^https?://instagram\\.com/p/([a-zA-Z0-9]{1,}).*$");

    /**
     * Link to a user page (instagram.com/&lt;username&gt;).
     * NOTE(review): the character class stops at the first '_' or '.', so such
     * usernames are truncated -- confirm whether that is intended.
     */
    private static final Pattern USER_URL =
            Pattern.compile("^.*instagram\\.com/([a-zA-Z0-9]{3,}).*$");

    /** Sanitized statigr.am user page, as produced by {@link #sanitizeURL(URL)}. */
    private static final Pattern STATIGRAM_URL =
            Pattern.compile("^https?://statigr\\.am/([a-zA-Z0-9]{3,}).*$");

    /** Tail of the og:description text on a photo page; the username precedes it. */
    private static final String PHOTO_DESCRIPTION_SUFFIX = "'s photo on Instagram";

    /**
     * Creates a ripper for the given URL.
     *
     * @param url Instagram user or photo URL
     * @throws IOException if the superclass constructor fails
     */
    public InstagramRipper(URL url) throws IOException {
        super(url);
    }

    /**
     * Accepts any URL whose host ends with {@code instagram.com}.
     *
     * @param url candidate URL
     * @return {@code true} if the host ends with the Instagram domain
     */
    @Override
    public boolean canRip(URL url) {
        return url.getHost().endsWith(DOMAIN);
    }

    /**
     * Converts an Instagram URL into the statigr.am page of the owning user.
     * <p>
     * A photo link is first resolved to its owner's instagram.com user page
     * (this fetches the photo page); the username is then extracted and the
     * statigr.am URL is built from it.
     *
     * @param url Instagram user or photo URL
     * @return {@code http://statigr.am/<username>}
     * @throws MalformedURLException if the photo page cannot be resolved to a
     *         user, or the URL carries no username
     */
    @Override
    public URL sanitizeURL(URL url) throws MalformedURLException {
        if (PHOTO_URL.matcher(url.toExternalForm()).matches()) {
            // Link to photo, not the user account
            try {
                url = getUserPageFromImage(url);
            } catch (Exception e) {
                logger.error("[!] Failed to get user page from " + url, e);
                throw new MalformedURLException("Failed to retrieve user page from " + url);
            }
        }
        Matcher m = USER_URL.matcher(url.toExternalForm());
        if (!m.matches()) {
            throw new MalformedURLException(
                    "Expected username in URL (instagram.com/username) and not " + url);
        }
        return new URL("http://statigr.am/" + m.group(1));
    }

    /**
     * Fetches a photo page and derives the owner's instagram.com user page from
     * its og:description meta tag.
     * <p>
     * The result is an instagram.com URL (not a statigr.am one) so that
     * {@link #sanitizeURL(URL)} can run it through the same username pattern
     * as a directly supplied user URL.
     *
     * @param url photo page URL
     * @return {@code http://instagram.com/<username>}
     * @throws IOException if the page cannot be fetched, or (as a
     *         {@link MalformedURLException}) no matching meta tag is found
     */
    private URL getUserPageFromImage(URL url) throws IOException {
        Document doc = Jsoup.connect(url.toExternalForm()).get();
        for (Element element : doc.select("meta[property='og:description']")) {
            String content = element.attr("content");
            if (content.endsWith(PHOTO_DESCRIPTION_SUFFIX)) {
                String username = content.substring(
                        0, content.length() - PHOTO_DESCRIPTION_SUFFIX.length());
                return new URL("http://" + DOMAIN + "/" + username);
            }
        }
        throw new MalformedURLException("Unable to find photo owner's username at " + url);
    }

    /**
     * Reads the numeric user ID from the {@code user_public} input field of a
     * statigr.am user page.
     *
     * @param url statigr.am user page to fetch
     * @return value of the {@code user_public} input
     * @throws IOException if the page cannot be fetched or has no such input
     */
    private String getUserID(URL url) throws IOException {
        logger.info(" Retrieving " + url);
        Document doc = Jsoup.connect(url.toExternalForm()).get();
        Element userField = doc.select("input[id=user_public]").first();
        if (userField == null) {
            throw new IOException("Unable to find userID at " + url);
        }
        return userField.attr("value");
    }

    /**
     * Pages through the user's media list and queues every video and image for
     * download, then waits for the download threads.
     * <p>
     * Paging stops on an empty page, when no continuation ID is available, or
     * when the thread is interrupted during the pause between pages.
     *
     * @throws IOException if the user page or a media page cannot be fetched
     */
    @Override
    public void rip() throws IOException {
        int index = 0;
        String userID = getUserID(this.url);
        String baseURL = "http://statigr.am/controller_nl.php?action=getPhotoUserPublic&user_id=" + userID;
        String params = "";
        while (true) {
            String pageURL = baseURL + params;
            logger.info(" Retrieving " + pageURL);
            String jsonString = Jsoup.connect(pageURL).ignoreContentType(true).execute().body();
            JSONObject json = new JSONObject(jsonString);
            JSONArray datas = json.getJSONArray("data");
            if (datas.length() == 0) {
                break;
            }
            // The ID of the last item on this page is the cursor for the next page.
            String nextMaxID = "";
            for (int i = 0; i < datas.length(); i++) {
                JSONObject data = (JSONObject) datas.get(i);
                if (data.has("id")) {
                    nextMaxID = data.getString("id");
                }
                if (data.has("videos")) {
                    index += 1;
                    String video = data.getJSONObject("videos").getJSONObject("standard_resolution").getString("url");
                    addURLToDownload(new URL(video), String.format("%03d_", index));
                } else if (data.has("images")) {
                    index += 1;
                    String image = data.getJSONObject("images").getJSONObject("standard_resolution").getString("url");
                    // NOTE(review): images are queued without the numeric prefix that
                    // videos get; kept as-is -- confirm this asymmetry is intended.
                    addURLToDownload(new URL(image));
                }
            }
            JSONObject pagination = json.getJSONObject("pagination");
            if (nextMaxID.equals("")) {
                // No item carried an ID; fall back to the cursor in the pagination object.
                if (!pagination.has("next_max_id")) {
                    break;
                }
                nextMaxID = pagination.getString("next_max_id");
            }
            params = "&max_id=" + nextMaxID;
            try {
                Thread.sleep(3000);
            } catch (InterruptedException e) {
                logger.error("[!] Interrupted while waiting to load next album:", e);
                // Restore the flag so code further up the stack can see the interruption.
                Thread.currentThread().interrupt();
                break;
            }
        }
        waitForThreads();
    }

    /**
     * @return the short host name used for this ripper ({@code "instagram"})
     */
    @Override
    public String getHost() {
        return HOST;
    }

    /**
     * Extracts the username from a sanitized statigr.am URL.
     *
     * @param url URL of the form {@code http://statigr.am/<username>}
     * @return the username
     * @throws MalformedURLException if the URL is not a statigr.am user URL
     */
    @Override
    public String getGID(URL url) throws MalformedURLException {
        Matcher m = STATIGRAM_URL.matcher(url.toExternalForm());
        if (m.matches()) {
            return m.group(1);
        }
        throw new MalformedURLException("Unable to find user in " + url);
    }
}

View File

@ -1,4 +1,4 @@
package com.rarchives.ripme;
package com.rarchives.ripme.tst;
import junit.framework.Test;
import junit.framework.TestCase;

View File

@ -0,0 +1,78 @@
package com.rarchives.ripme.tst.ripper.rippers;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import com.rarchives.ripme.ripper.rippers.ImgurRipper;
/**
 * Tests for {@link ImgurRipper}: which URLs it accepts and rejects, and
 * whether ripping known albums leaves files in the working directory.
 */
public class ImgurRipperTest extends RippersTest {

    /**
     * Constructing a ripper for a URL that is not an album, image series or
     * user page must throw.
     */
    public void testImgurURLFailures() throws IOException {
        List<URL> failURLs = new ArrayList<URL>();
        // Imgur urls that should not work
        failURLs.add(new URL("http://imgur.com"));
        failURLs.add(new URL("http://imgur.com/"));
        failURLs.add(new URL("http://i.imgur.com"));
        failURLs.add(new URL("http://i.imgur.com/"));
        failURLs.add(new URL("http://imgur.com/image"));
        failURLs.add(new URL("http://imgur.com/image.jpg"));
        failURLs.add(new URL("http://i.imgur.com/image.jpg"));
        for (URL url : failURLs) {
            try {
                new ImgurRipper(url);
                // fail() throws an Error, so the catch below does not swallow it.
                fail("Instantiated ripper for URL that should not work: " + url);
            } catch (Exception e) {
                // Expected
                continue;
            }
        }
    }

    /**
     * Constructing a ripper for a supported URL must succeed, and the ripper
     * must report that it can rip that URL.
     */
    public void testImgurURLPasses() throws IOException {
        List<URL> passURLs = new ArrayList<URL>();
        // Imgur URLs that should work
        passURLs.add(new URL("http://imgur.com/a/XPd4F"));
        passURLs.add(new URL("http://imgur.com/a/XPd4F/"));
        passURLs.add(new URL("http://imgur.com/a/WxG6f/all"));
        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
        passURLs.add(new URL("http://imgur.com/YOdjht3,x5VxH9G,5juXjJ2"));
        passURLs.add(new URL("http://markedone911.imgur.com"));
        passURLs.add(new URL("http://markedone911.imgur.com/"));
        for (URL url : passURLs) {
            try {
                ImgurRipper ripper = new ImgurRipper(url);
                assertTrue("canRip rejected " + url, ripper.canRip(url));
                deleteDir(ripper.getWorkingDir());
            } catch (Exception e) {
                fail("Failed to instantiate ripper for " + url + ": " + e.getMessage());
            }
        }
    }

    /**
     * Ripping each album must leave more than one file in the working
     * directory.
     * NOTE(review): presumably this performs real downloads and needs network
     * access -- confirm before running in CI.
     */
    public void testImgurAlbums() throws IOException {
        List<URL> contentURLs = new ArrayList<URL>();
        // URLs that should return more than 1 image
        contentURLs.add(new URL("http://imgur.com/a/hqJIu"));   // Vertical layout
        contentURLs.add(new URL("http://imgur.com/a/dS9OQ#0")); // Horizontal layout
        contentURLs.add(new URL("http://imgur.com/a/YpsW9#0")); // Grid layout
        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
        for (URL url : contentURLs) {
            try {
                ImgurRipper ripper = new ImgurRipper(url);
                ripper.rip();
                // assertTrue, not the assert keyword: the keyword is skipped
                // unless the JVM is started with -ea.
                assertTrue("Expected more than one file ripped from " + url,
                        ripper.getWorkingDir().listFiles().length > 1);
                deleteDir(ripper.getWorkingDir());
            } catch (Exception e) {
                fail("Error while ripping URL " + url + ": " + e.getMessage());
            }
        }
    }
}

View File

@ -0,0 +1,28 @@
package com.rarchives.ripme.tst.ripper.rippers;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import com.rarchives.ripme.ripper.rippers.InstagramRipper;
/**
 * Tests for {@link InstagramRipper}.
 */
public class InstagramRipperTest extends RippersTest {

    /**
     * Ripping each user page must leave more than one file in the working
     * directory.
     * NOTE(review): presumably this performs real downloads and needs network
     * access -- confirm before running in CI.
     */
    public void testInstagramAlbums() throws IOException {
        List<URL> contentURLs = new ArrayList<URL>();
        contentURLs.add(new URL("http://instagram.com/feelgoodincc#"));
        for (URL url : contentURLs) {
            try {
                InstagramRipper ripper = new InstagramRipper(url);
                ripper.rip();
                // assertTrue, not the assert keyword: the keyword is skipped
                // unless the JVM is started with -ea.
                assertTrue("Expected more than one file ripped from " + url,
                        ripper.getWorkingDir().listFiles().length > 1);
                deleteDir(ripper.getWorkingDir());
            } catch (Exception e) {
                fail("Error while ripping URL " + url + ": " + e.getMessage());
            }
        }
    }
}

View File

@ -0,0 +1,22 @@
package com.rarchives.ripme.tst.ripper.rippers;
import java.io.File;
import junit.framework.TestCase;
/**
 * Common base class for the ripper test cases; supplies the shared
 * {@link #deleteDir(File)} hook.
 */
public class RippersTest extends TestCase {

    /**
     * Hook that tests call with a ripper's working directory once they are
     * done with it.
     * <p>
     * This is currently a no-op: the recursive delete is switched off, so
     * {@code dir} and everything inside it stay on disk.
     *
     * @param dir directory handed over by the test; left untouched
     */
    protected void deleteDir(File dir) {
        // No-op.
    }
}