Added instagram ripper, integration tests.
Also fixed parts of the imgur ripper.
This commit is contained in:
parent
c5c55055c2
commit
e2bb412d9f
@ -178,7 +178,6 @@ public abstract class AbstractRipper
|
||||
}
|
||||
|
||||
private void checkIfComplete() {
|
||||
System.err.println("Pending: " + itemsPending.size() + ", Completed: " + itemsCompleted.size() + ", Errored: " + itemsErrored.size());
|
||||
if (!completed && itemsPending.size() == 0) {
|
||||
completed = true;
|
||||
logger.info("Rip completed!");
|
||||
@ -193,6 +192,10 @@ public abstract class AbstractRipper
|
||||
/**
 * Returns the current value of this ripper's {@code url} field.
 *
 * @return the {@code url} field, unchanged
 */
public URL getURL() {
|
||||
return url;
|
||||
}
|
||||
|
||||
/**
 * Returns the current value of this ripper's {@code workingDir} field.
 *
 * @return the {@code workingDir} field, unchanged
 */
public File getWorkingDir() {
|
||||
return workingDir;
|
||||
}
|
||||
|
||||
public void setWorkingDir(URL url) throws IOException {
|
||||
String path = Utils.getWorkingDirectory().getCanonicalPath();
|
||||
@ -224,6 +227,7 @@ public abstract class AbstractRipper
|
||||
return ripper;
|
||||
} catch (Exception e) {
|
||||
// Incompatible rippers *will* throw exceptions during instantiation.
|
||||
logger.error("Excepion while instantiating: " + constructor.getClass().getName(), e);
|
||||
}
|
||||
}
|
||||
throw new Exception("No compatible ripper found");
|
||||
@ -245,7 +249,9 @@ public abstract class AbstractRipper
|
||||
URL classURL = urls.nextElement();
|
||||
for (File f : new File(classURL.toURI()).listFiles()) {
|
||||
String className = f.getName();
|
||||
if (!className.endsWith(".class") || className.contains("$")) {
|
||||
if (!className.endsWith(".class")
|
||||
|| className.contains("$")
|
||||
|| className.endsWith("Test.class")) {
|
||||
// Ignore non-class or nested classes.
|
||||
continue;
|
||||
}
|
||||
|
@ -22,9 +22,9 @@ public class ImgurRipper extends AbstractRipper {
|
||||
private static final String DOMAIN = "imgur.com",
|
||||
HOST = "imgur";
|
||||
private static final Logger logger = Logger.getLogger(ImgurRipper.class);
|
||||
|
||||
|
||||
private final int SLEEP_BETWEEN_ALBUMS;
|
||||
|
||||
|
||||
static enum ALBUM_TYPE {
|
||||
ALBUM,
|
||||
USER,
|
||||
@ -61,6 +61,8 @@ public class ImgurRipper extends AbstractRipper {
|
||||
if (u.indexOf('#') >= 0) {
|
||||
u = u.substring(0, u.indexOf('#'));
|
||||
}
|
||||
u = u.replace("https?://m\\.imgur\\.com", "http://imgur.com");
|
||||
u = u.replace("https?://i\\.imgur\\.com", "http://imgur.com");
|
||||
return new URL(u);
|
||||
}
|
||||
|
||||
@ -204,14 +206,18 @@ public class ImgurRipper extends AbstractRipper {
|
||||
this.url = new URL("http://imgur.com/a/" + gid);
|
||||
return gid;
|
||||
}
|
||||
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.imgur\\.com/?$");
|
||||
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/?$");
|
||||
m = p.matcher(url.toExternalForm());
|
||||
if (m.matches()) {
|
||||
// Root imgur account
|
||||
String gid = m.group(1);
|
||||
if (gid.equals("i")) {
|
||||
throw new MalformedURLException("Ripping i.imgur.com links not supported");
|
||||
}
|
||||
albumType = ALBUM_TYPE.USER;
|
||||
return m.group(1);
|
||||
return gid;
|
||||
}
|
||||
p = Pattern.compile("^https?://([a-zA-Z0-9\\-])\\.imgur\\.com/([a-zA-Z0-9])?$");
|
||||
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/([a-zA-Z0-9])?$");
|
||||
m = p.matcher(url.toExternalForm());
|
||||
if (m.matches()) {
|
||||
// Imgur account album
|
||||
@ -223,9 +229,16 @@ public class ImgurRipper extends AbstractRipper {
|
||||
if (m.matches()) {
|
||||
// Series of imgur images
|
||||
albumType = ALBUM_TYPE.SERIES_OF_IMAGES;
|
||||
return m.group(m.groupCount()).replaceAll(",", "-");
|
||||
String gid = m.group(m.groupCount());
|
||||
if (!gid.contains(",")) {
|
||||
throw new MalformedURLException("Imgur image doesn't contain commas");
|
||||
}
|
||||
return gid.replaceAll(",", "-");
|
||||
}
|
||||
throw new MalformedURLException("Unexpected URL format: " + url.toExternalForm());
|
||||
}
|
||||
|
||||
/**
 * Returns the current value of this ripper's {@code albumType} field.
 *
 * @return the {@code albumType} field, unchanged
 */
public ALBUM_TYPE getAlbumType() {
|
||||
return albumType;
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,140 @@
|
||||
package com.rarchives.ripme.ripper.rippers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONObject;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import com.rarchives.ripme.ripper.AbstractRipper;
|
||||
|
||||
public class InstagramRipper extends AbstractRipper {
|
||||
|
||||
private static final String DOMAIN = "instagram.com",
|
||||
HOST = "instagram";
|
||||
private static final Logger logger = Logger.getLogger(ImagearnRipper.class);
|
||||
|
||||
public InstagramRipper(URL url) throws IOException {
|
||||
super(url);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean canRip(URL url) {
|
||||
return url.getHost().endsWith(DOMAIN);
|
||||
}
|
||||
|
||||
@Override
|
||||
public URL sanitizeURL(URL url) throws MalformedURLException {
|
||||
Pattern p = Pattern.compile("^https?://instagram\\.com/p/([a-zA-Z0-9]{1,}).*$");
|
||||
Matcher m = p.matcher(url.toExternalForm());
|
||||
if (m.matches()) {
|
||||
// Link to photo, not the user account
|
||||
try {
|
||||
url = getUserPageFromImage(url);
|
||||
} catch (Exception e) {
|
||||
logger.error("[!] Failed to get user page from " + url, e);
|
||||
throw new MalformedURLException("Failed to retrieve user page from " + url);
|
||||
}
|
||||
}
|
||||
p = Pattern.compile("^.*instagram.com/([a-zA-Z0-9]{3,}).*$");
|
||||
m = p.matcher(url.toExternalForm());
|
||||
if (!m.matches()) {
|
||||
throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url);
|
||||
}
|
||||
return new URL("http://statigr.am/" + m.group(1));
|
||||
}
|
||||
|
||||
private URL getUserPageFromImage(URL url) throws IOException {
|
||||
Document doc = Jsoup.connect(url.toExternalForm()).get();
|
||||
for (Element element : doc.select("meta[property='og:description']")) {
|
||||
String content = element.attr("content");
|
||||
if (content.endsWith("'s photo on Instagram")) {
|
||||
return new URL("http://statigr.am/" + content.substring(0, content.indexOf("'")));
|
||||
}
|
||||
}
|
||||
throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url);
|
||||
}
|
||||
|
||||
private String getUserID(URL url) throws IOException {
|
||||
logger.info(" Retrieving " + url);
|
||||
Document doc = Jsoup.connect(this.url.toExternalForm()).get();
|
||||
for (Element element : doc.select("input[id=user_public]")) {
|
||||
return element.attr("value");
|
||||
}
|
||||
throw new IOException("Unable to find userID at " + this.url);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void rip() throws IOException {
|
||||
int index = 0;
|
||||
String userID = getUserID(this.url);
|
||||
String baseURL = "http://statigr.am/controller_nl.php?action=getPhotoUserPublic&user_id=" + userID;
|
||||
String params = "";
|
||||
while (true) {
|
||||
String url = baseURL + params;
|
||||
logger.info(" Retrieving " + url);
|
||||
String jsonString = Jsoup.connect(url).ignoreContentType(true).execute().body();
|
||||
JSONObject json = new JSONObject(jsonString);
|
||||
JSONArray datas = json.getJSONArray("data");
|
||||
String nextMaxID = "";
|
||||
if (datas.length() == 0) {
|
||||
break;
|
||||
}
|
||||
for (int i = 0; i < datas.length(); i++) {
|
||||
JSONObject data = (JSONObject) datas.get(i);
|
||||
if (data.has("id")) {
|
||||
nextMaxID = data.getString("id");
|
||||
}
|
||||
if (data.has("videos")) {
|
||||
index += 1;
|
||||
String video = data.getJSONObject("videos").getJSONObject("standard_resolution").getString("url");
|
||||
addURLToDownload(new URL(video), String.format("%03d_", index));
|
||||
} else if (data.has("images")) {
|
||||
index += 1;
|
||||
String image = data.getJSONObject("images").getJSONObject("standard_resolution").getString("url");
|
||||
// addURLToDownload(new URL(image), String.format("%03d_", index));
|
||||
addURLToDownload(new URL(image));
|
||||
}
|
||||
}
|
||||
JSONObject pagination = json.getJSONObject("pagination");
|
||||
if (nextMaxID.equals("")) {
|
||||
if (!pagination.has("next_max_id")) {
|
||||
break;
|
||||
} else {
|
||||
nextMaxID = pagination.getString("next_max_id");
|
||||
}
|
||||
}
|
||||
params = "&max_id=" + nextMaxID;
|
||||
try {
|
||||
Thread.sleep(3000);
|
||||
} catch (InterruptedException e) {
|
||||
logger.error("[!] Interrupted while waiting to load next album:", e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
waitForThreads();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getHost() {
|
||||
return HOST;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getGID(URL url) throws MalformedURLException {
|
||||
Pattern p = Pattern.compile("^https?://statigr.am/([a-zA-Z0-9]{3,}).*$");
|
||||
Matcher m = p.matcher(url.toExternalForm());
|
||||
if (m.matches()) {
|
||||
return m.group(1);
|
||||
}
|
||||
throw new MalformedURLException("Unable to find user in " + url);
|
||||
}
|
||||
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package com.rarchives.ripme;
|
||||
package com.rarchives.ripme.tst;
|
||||
|
||||
import junit.framework.Test;
|
||||
import junit.framework.TestCase;
|
@ -0,0 +1,78 @@
|
||||
package com.rarchives.ripme.tst.ripper.rippers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.rarchives.ripme.ripper.rippers.ImgurRipper;
|
||||
|
||||
public class ImgurRipperTest extends RippersTest {
|
||||
|
||||
public void testImgurURLFailures() throws IOException {
|
||||
List<URL> failURLs = new ArrayList<URL>();
|
||||
// Imgur urls that should not work
|
||||
failURLs.add(new URL("http://imgur.com"));
|
||||
failURLs.add(new URL("http://imgur.com/"));
|
||||
failURLs.add(new URL("http://i.imgur.com"));
|
||||
failURLs.add(new URL("http://i.imgur.com/"));
|
||||
failURLs.add(new URL("http://imgur.com/image"));
|
||||
failURLs.add(new URL("http://imgur.com/image.jpg"));
|
||||
failURLs.add(new URL("http://i.imgur.com/image.jpg"));
|
||||
for (URL url : failURLs) {
|
||||
try {
|
||||
new ImgurRipper(url);
|
||||
fail("Instantiated ripper for URL that should not work: " + url);
|
||||
} catch (Exception e) {
|
||||
// Expected
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testImgurURLPasses() throws IOException {
|
||||
List<URL> passURLs = new ArrayList<URL>();
|
||||
// Imgur URLs that should work
|
||||
passURLs.add(new URL("http://imgur.com/a/XPd4F"));
|
||||
passURLs.add(new URL("http://imgur.com/a/XPd4F/"));
|
||||
passURLs.add(new URL("http://imgur.com/a/WxG6f/all"));
|
||||
passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
|
||||
passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
|
||||
passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
|
||||
passURLs.add(new URL("http://imgur.com/YOdjht3,x5VxH9G,5juXjJ2"));
|
||||
passURLs.add(new URL("http://markedone911.imgur.com"));
|
||||
passURLs.add(new URL("http://markedone911.imgur.com/"));
|
||||
|
||||
for (URL url : passURLs) {
|
||||
try {
|
||||
ImgurRipper ripper = new ImgurRipper(url);
|
||||
assertTrue(ripper.canRip(url));
|
||||
deleteDir(ripper.getWorkingDir());
|
||||
} catch (Exception e) {
|
||||
fail("Failed to instantiate ripper for " + url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testImgurAlbums() throws IOException {
|
||||
List<URL> contentURLs = new ArrayList<URL>();
|
||||
// URLs that should return more than 1 image
|
||||
contentURLs.add(new URL("http://imgur.com/a/hqJIu")); // Vertical layout
|
||||
contentURLs.add(new URL("http://imgur.com/a/dS9OQ#0")); // Horizontal layout
|
||||
contentURLs.add(new URL("http://imgur.com/a/YpsW9#0")); // Grid layout
|
||||
contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
|
||||
contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
|
||||
contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
|
||||
for (URL url : contentURLs) {
|
||||
try {
|
||||
ImgurRipper ripper = new ImgurRipper(url);
|
||||
ripper.rip();
|
||||
assert(ripper.getWorkingDir().listFiles().length > 1);
|
||||
deleteDir(ripper.getWorkingDir());
|
||||
} catch (Exception e) {
|
||||
fail("Error while ripping URL " + url + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
package com.rarchives.ripme.tst.ripper.rippers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.rarchives.ripme.ripper.rippers.InstagramRipper;
|
||||
|
||||
|
||||
public class InstagramRipperTest extends RippersTest {
|
||||
|
||||
public void testInstagramAlbums() throws IOException {
|
||||
List<URL> contentURLs = new ArrayList<URL>();
|
||||
contentURLs.add(new URL("http://instagram.com/feelgoodincc#"));
|
||||
for (URL url : contentURLs) {
|
||||
try {
|
||||
InstagramRipper ripper = new InstagramRipper(url);
|
||||
ripper.rip();
|
||||
assert(ripper.getWorkingDir().listFiles().length > 1);
|
||||
deleteDir(ripper.getWorkingDir());
|
||||
} catch (Exception e) {
|
||||
fail("Error while ripping URL " + url + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,22 @@
|
||||
package com.rarchives.ripme.tst.ripper.rippers;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class RippersTest extends TestCase {
|
||||
|
||||
protected void deleteDir(File dir) {
|
||||
return;
|
||||
/*
|
||||
for (File f : dir.listFiles()) {
|
||||
if (f.isDirectory()) {
|
||||
deleteDir(f);
|
||||
}
|
||||
f.delete();
|
||||
}
|
||||
dir.delete();
|
||||
//*/
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user