Added instagram ripper, integration tests.
Also fixed parts of the imgur ripper.
This commit is contained in:
parent
c5c55055c2
commit
e2bb412d9f
@ -178,7 +178,6 @@ public abstract class AbstractRipper
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void checkIfComplete() {
|
private void checkIfComplete() {
|
||||||
System.err.println("Pending: " + itemsPending.size() + ", Completed: " + itemsCompleted.size() + ", Errored: " + itemsErrored.size());
|
|
||||||
if (!completed && itemsPending.size() == 0) {
|
if (!completed && itemsPending.size() == 0) {
|
||||||
completed = true;
|
completed = true;
|
||||||
logger.info("Rip completed!");
|
logger.info("Rip completed!");
|
||||||
@ -194,6 +193,10 @@ public abstract class AbstractRipper
|
|||||||
return url;
|
return url;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public File getWorkingDir() {
|
||||||
|
return workingDir;
|
||||||
|
}
|
||||||
|
|
||||||
public void setWorkingDir(URL url) throws IOException {
|
public void setWorkingDir(URL url) throws IOException {
|
||||||
String path = Utils.getWorkingDirectory().getCanonicalPath();
|
String path = Utils.getWorkingDirectory().getCanonicalPath();
|
||||||
if (!path.endsWith(File.separator)) {
|
if (!path.endsWith(File.separator)) {
|
||||||
@ -224,6 +227,7 @@ public abstract class AbstractRipper
|
|||||||
return ripper;
|
return ripper;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// Incompatible rippers *will* throw exceptions during instantiation.
|
// Incompatible rippers *will* throw exceptions during instantiation.
|
||||||
|
logger.error("Excepion while instantiating: " + constructor.getClass().getName(), e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
throw new Exception("No compatible ripper found");
|
throw new Exception("No compatible ripper found");
|
||||||
@ -245,7 +249,9 @@ public abstract class AbstractRipper
|
|||||||
URL classURL = urls.nextElement();
|
URL classURL = urls.nextElement();
|
||||||
for (File f : new File(classURL.toURI()).listFiles()) {
|
for (File f : new File(classURL.toURI()).listFiles()) {
|
||||||
String className = f.getName();
|
String className = f.getName();
|
||||||
if (!className.endsWith(".class") || className.contains("$")) {
|
if (!className.endsWith(".class")
|
||||||
|
|| className.contains("$")
|
||||||
|
|| className.endsWith("Test.class")) {
|
||||||
// Ignore non-class or nested classes.
|
// Ignore non-class or nested classes.
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -61,6 +61,8 @@ public class ImgurRipper extends AbstractRipper {
|
|||||||
if (u.indexOf('#') >= 0) {
|
if (u.indexOf('#') >= 0) {
|
||||||
u = u.substring(0, u.indexOf('#'));
|
u = u.substring(0, u.indexOf('#'));
|
||||||
}
|
}
|
||||||
|
u = u.replace("https?://m\\.imgur\\.com", "http://imgur.com");
|
||||||
|
u = u.replace("https?://i\\.imgur\\.com", "http://imgur.com");
|
||||||
return new URL(u);
|
return new URL(u);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -204,14 +206,18 @@ public class ImgurRipper extends AbstractRipper {
|
|||||||
this.url = new URL("http://imgur.com/a/" + gid);
|
this.url = new URL("http://imgur.com/a/" + gid);
|
||||||
return gid;
|
return gid;
|
||||||
}
|
}
|
||||||
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.imgur\\.com/?$");
|
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/?$");
|
||||||
m = p.matcher(url.toExternalForm());
|
m = p.matcher(url.toExternalForm());
|
||||||
if (m.matches()) {
|
if (m.matches()) {
|
||||||
// Root imgur account
|
// Root imgur account
|
||||||
albumType = ALBUM_TYPE.USER;
|
String gid = m.group(1);
|
||||||
return m.group(1);
|
if (gid.equals("i")) {
|
||||||
|
throw new MalformedURLException("Ripping i.imgur.com links not supported");
|
||||||
}
|
}
|
||||||
p = Pattern.compile("^https?://([a-zA-Z0-9\\-])\\.imgur\\.com/([a-zA-Z0-9])?$");
|
albumType = ALBUM_TYPE.USER;
|
||||||
|
return gid;
|
||||||
|
}
|
||||||
|
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/([a-zA-Z0-9])?$");
|
||||||
m = p.matcher(url.toExternalForm());
|
m = p.matcher(url.toExternalForm());
|
||||||
if (m.matches()) {
|
if (m.matches()) {
|
||||||
// Imgur account album
|
// Imgur account album
|
||||||
@ -223,9 +229,16 @@ public class ImgurRipper extends AbstractRipper {
|
|||||||
if (m.matches()) {
|
if (m.matches()) {
|
||||||
// Series of imgur images
|
// Series of imgur images
|
||||||
albumType = ALBUM_TYPE.SERIES_OF_IMAGES;
|
albumType = ALBUM_TYPE.SERIES_OF_IMAGES;
|
||||||
return m.group(m.groupCount()).replaceAll(",", "-");
|
String gid = m.group(m.groupCount());
|
||||||
|
if (!gid.contains(",")) {
|
||||||
|
throw new MalformedURLException("Imgur image doesn't contain commas");
|
||||||
|
}
|
||||||
|
return gid.replaceAll(",", "-");
|
||||||
}
|
}
|
||||||
throw new MalformedURLException("Unexpected URL format: " + url.toExternalForm());
|
throw new MalformedURLException("Unexpected URL format: " + url.toExternalForm());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public ALBUM_TYPE getAlbumType() {
|
||||||
|
return albumType;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,140 @@
|
|||||||
|
package com.rarchives.ripme.ripper.rippers;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.json.JSONArray;
|
||||||
|
import org.json.JSONObject;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
|
import com.rarchives.ripme.ripper.AbstractRipper;
|
||||||
|
|
||||||
|
public class InstagramRipper extends AbstractRipper {
|
||||||
|
|
||||||
|
private static final String DOMAIN = "instagram.com",
|
||||||
|
HOST = "instagram";
|
||||||
|
private static final Logger logger = Logger.getLogger(ImagearnRipper.class);
|
||||||
|
|
||||||
|
public InstagramRipper(URL url) throws IOException {
|
||||||
|
super(url);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean canRip(URL url) {
|
||||||
|
return url.getHost().endsWith(DOMAIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public URL sanitizeURL(URL url) throws MalformedURLException {
|
||||||
|
Pattern p = Pattern.compile("^https?://instagram\\.com/p/([a-zA-Z0-9]{1,}).*$");
|
||||||
|
Matcher m = p.matcher(url.toExternalForm());
|
||||||
|
if (m.matches()) {
|
||||||
|
// Link to photo, not the user account
|
||||||
|
try {
|
||||||
|
url = getUserPageFromImage(url);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("[!] Failed to get user page from " + url, e);
|
||||||
|
throw new MalformedURLException("Failed to retrieve user page from " + url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p = Pattern.compile("^.*instagram.com/([a-zA-Z0-9]{3,}).*$");
|
||||||
|
m = p.matcher(url.toExternalForm());
|
||||||
|
if (!m.matches()) {
|
||||||
|
throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url);
|
||||||
|
}
|
||||||
|
return new URL("http://statigr.am/" + m.group(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
private URL getUserPageFromImage(URL url) throws IOException {
|
||||||
|
Document doc = Jsoup.connect(url.toExternalForm()).get();
|
||||||
|
for (Element element : doc.select("meta[property='og:description']")) {
|
||||||
|
String content = element.attr("content");
|
||||||
|
if (content.endsWith("'s photo on Instagram")) {
|
||||||
|
return new URL("http://statigr.am/" + content.substring(0, content.indexOf("'")));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getUserID(URL url) throws IOException {
|
||||||
|
logger.info(" Retrieving " + url);
|
||||||
|
Document doc = Jsoup.connect(this.url.toExternalForm()).get();
|
||||||
|
for (Element element : doc.select("input[id=user_public]")) {
|
||||||
|
return element.attr("value");
|
||||||
|
}
|
||||||
|
throw new IOException("Unable to find userID at " + this.url);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void rip() throws IOException {
|
||||||
|
int index = 0;
|
||||||
|
String userID = getUserID(this.url);
|
||||||
|
String baseURL = "http://statigr.am/controller_nl.php?action=getPhotoUserPublic&user_id=" + userID;
|
||||||
|
String params = "";
|
||||||
|
while (true) {
|
||||||
|
String url = baseURL + params;
|
||||||
|
logger.info(" Retrieving " + url);
|
||||||
|
String jsonString = Jsoup.connect(url).ignoreContentType(true).execute().body();
|
||||||
|
JSONObject json = new JSONObject(jsonString);
|
||||||
|
JSONArray datas = json.getJSONArray("data");
|
||||||
|
String nextMaxID = "";
|
||||||
|
if (datas.length() == 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < datas.length(); i++) {
|
||||||
|
JSONObject data = (JSONObject) datas.get(i);
|
||||||
|
if (data.has("id")) {
|
||||||
|
nextMaxID = data.getString("id");
|
||||||
|
}
|
||||||
|
if (data.has("videos")) {
|
||||||
|
index += 1;
|
||||||
|
String video = data.getJSONObject("videos").getJSONObject("standard_resolution").getString("url");
|
||||||
|
addURLToDownload(new URL(video), String.format("%03d_", index));
|
||||||
|
} else if (data.has("images")) {
|
||||||
|
index += 1;
|
||||||
|
String image = data.getJSONObject("images").getJSONObject("standard_resolution").getString("url");
|
||||||
|
// addURLToDownload(new URL(image), String.format("%03d_", index));
|
||||||
|
addURLToDownload(new URL(image));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
JSONObject pagination = json.getJSONObject("pagination");
|
||||||
|
if (nextMaxID.equals("")) {
|
||||||
|
if (!pagination.has("next_max_id")) {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
nextMaxID = pagination.getString("next_max_id");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
params = "&max_id=" + nextMaxID;
|
||||||
|
try {
|
||||||
|
Thread.sleep(3000);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
logger.error("[!] Interrupted while waiting to load next album:", e);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
waitForThreads();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getHost() {
|
||||||
|
return HOST;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getGID(URL url) throws MalformedURLException {
|
||||||
|
Pattern p = Pattern.compile("^https?://statigr.am/([a-zA-Z0-9]{3,}).*$");
|
||||||
|
Matcher m = p.matcher(url.toExternalForm());
|
||||||
|
if (m.matches()) {
|
||||||
|
return m.group(1);
|
||||||
|
}
|
||||||
|
throw new MalformedURLException("Unable to find user in " + url);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,4 +1,4 @@
|
|||||||
package com.rarchives.ripme;
|
package com.rarchives.ripme.tst;
|
||||||
|
|
||||||
import junit.framework.Test;
|
import junit.framework.Test;
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
@ -0,0 +1,78 @@
|
|||||||
|
package com.rarchives.ripme.tst.ripper.rippers;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.rarchives.ripme.ripper.rippers.ImgurRipper;
|
||||||
|
|
||||||
|
public class ImgurRipperTest extends RippersTest {
|
||||||
|
|
||||||
|
public void testImgurURLFailures() throws IOException {
|
||||||
|
List<URL> failURLs = new ArrayList<URL>();
|
||||||
|
// Imgur urls that should not work
|
||||||
|
failURLs.add(new URL("http://imgur.com"));
|
||||||
|
failURLs.add(new URL("http://imgur.com/"));
|
||||||
|
failURLs.add(new URL("http://i.imgur.com"));
|
||||||
|
failURLs.add(new URL("http://i.imgur.com/"));
|
||||||
|
failURLs.add(new URL("http://imgur.com/image"));
|
||||||
|
failURLs.add(new URL("http://imgur.com/image.jpg"));
|
||||||
|
failURLs.add(new URL("http://i.imgur.com/image.jpg"));
|
||||||
|
for (URL url : failURLs) {
|
||||||
|
try {
|
||||||
|
new ImgurRipper(url);
|
||||||
|
fail("Instantiated ripper for URL that should not work: " + url);
|
||||||
|
} catch (Exception e) {
|
||||||
|
// Expected
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testImgurURLPasses() throws IOException {
|
||||||
|
List<URL> passURLs = new ArrayList<URL>();
|
||||||
|
// Imgur URLs that should work
|
||||||
|
passURLs.add(new URL("http://imgur.com/a/XPd4F"));
|
||||||
|
passURLs.add(new URL("http://imgur.com/a/XPd4F/"));
|
||||||
|
passURLs.add(new URL("http://imgur.com/a/WxG6f/all"));
|
||||||
|
passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
|
||||||
|
passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
|
||||||
|
passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
|
||||||
|
passURLs.add(new URL("http://imgur.com/YOdjht3,x5VxH9G,5juXjJ2"));
|
||||||
|
passURLs.add(new URL("http://markedone911.imgur.com"));
|
||||||
|
passURLs.add(new URL("http://markedone911.imgur.com/"));
|
||||||
|
|
||||||
|
for (URL url : passURLs) {
|
||||||
|
try {
|
||||||
|
ImgurRipper ripper = new ImgurRipper(url);
|
||||||
|
assertTrue(ripper.canRip(url));
|
||||||
|
deleteDir(ripper.getWorkingDir());
|
||||||
|
} catch (Exception e) {
|
||||||
|
fail("Failed to instantiate ripper for " + url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testImgurAlbums() throws IOException {
|
||||||
|
List<URL> contentURLs = new ArrayList<URL>();
|
||||||
|
// URLs that should return more than 1 image
|
||||||
|
contentURLs.add(new URL("http://imgur.com/a/hqJIu")); // Vertical layout
|
||||||
|
contentURLs.add(new URL("http://imgur.com/a/dS9OQ#0")); // Horizontal layout
|
||||||
|
contentURLs.add(new URL("http://imgur.com/a/YpsW9#0")); // Grid layout
|
||||||
|
contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
|
||||||
|
contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
|
||||||
|
contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
|
||||||
|
for (URL url : contentURLs) {
|
||||||
|
try {
|
||||||
|
ImgurRipper ripper = new ImgurRipper(url);
|
||||||
|
ripper.rip();
|
||||||
|
assert(ripper.getWorkingDir().listFiles().length > 1);
|
||||||
|
deleteDir(ripper.getWorkingDir());
|
||||||
|
} catch (Exception e) {
|
||||||
|
fail("Error while ripping URL " + url + ": " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,28 @@
|
|||||||
|
package com.rarchives.ripme.tst.ripper.rippers;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.rarchives.ripme.ripper.rippers.InstagramRipper;
|
||||||
|
|
||||||
|
|
||||||
|
public class InstagramRipperTest extends RippersTest {
|
||||||
|
|
||||||
|
public void testInstagramAlbums() throws IOException {
|
||||||
|
List<URL> contentURLs = new ArrayList<URL>();
|
||||||
|
contentURLs.add(new URL("http://instagram.com/feelgoodincc#"));
|
||||||
|
for (URL url : contentURLs) {
|
||||||
|
try {
|
||||||
|
InstagramRipper ripper = new InstagramRipper(url);
|
||||||
|
ripper.rip();
|
||||||
|
assert(ripper.getWorkingDir().listFiles().length > 1);
|
||||||
|
deleteDir(ripper.getWorkingDir());
|
||||||
|
} catch (Exception e) {
|
||||||
|
fail("Error while ripping URL " + url + ": " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,22 @@
|
|||||||
|
package com.rarchives.ripme.tst.ripper.rippers;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
public class RippersTest extends TestCase {
|
||||||
|
|
||||||
|
protected void deleteDir(File dir) {
|
||||||
|
return;
|
||||||
|
/*
|
||||||
|
for (File f : dir.listFiles()) {
|
||||||
|
if (f.isDirectory()) {
|
||||||
|
deleteDir(f);
|
||||||
|
}
|
||||||
|
f.delete();
|
||||||
|
}
|
||||||
|
dir.delete();
|
||||||
|
//*/
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user