From e49cab1254a6f387b65d52d9497b6e48bbcf98bc Mon Sep 17 00:00:00 2001 From: Mads Date: Thu, 13 Mar 2014 20:13:01 +0100 Subject: [PATCH] Fixed RedditRipper (Removed GoneWildRipper). RedditRipper retries download on timeout (should be a global setting) --- .../ripme/ripper/rippers/GonewildRipper.java | 112 ------------------ .../ripme/ripper/rippers/RedditRipper.java | 26 +++- .../java/com/rarchives/ripme/utils/Utils.java | 22 ++++ .../ripper/rippers/GonewildRipperTest.java | 31 ----- 4 files changed, 42 insertions(+), 149 deletions(-) delete mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/GonewildRipper.java delete mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/GonewildRipperTest.java diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/GonewildRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/GonewildRipper.java deleted file mode 100644 index 402db0f2..00000000 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/GonewildRipper.java +++ /dev/null @@ -1,112 +0,0 @@ -package com.rarchives.ripme.ripper.rippers; - -import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.log4j.Logger; -import org.json.JSONArray; -import org.json.JSONObject; -import org.jsoup.Jsoup; - -import com.rarchives.ripme.ripper.AbstractRipper; -import com.rarchives.ripme.utils.Utils; - -public class GonewildRipper extends AbstractRipper { - - private static final String HOST = "gonewild"; - private static final Logger logger = Logger.getLogger(GonewildRipper.class); - private static final int SLEEP_TIME = 1000; - - private static String API_DOMAIN; - private String username; - - public GonewildRipper(URL url) throws IOException { - super(url); - API_DOMAIN = Utils.getConfigString("gw.api", "gonewild"); - } - - @Override - public boolean canRip(URL url) { - return getUsernameMatcher(url).matches(); - } - - private Matcher getUsernameMatcher(URL url) { - Pattern p = Pattern.compile("^https?://[a-z]{0,3}\\.?reddit\\.com/(u|user)/([a-zA-Z0-9\\-]{3,})/?.*$"); - return p.matcher(url.toExternalForm()); - } - - @Override - public URL sanitizeURL(URL url) throws MalformedURLException { - return url; - } - - @Override - public void rip() throws IOException { - int start = 0, - count = 50; - String baseGwURL = "http://" + API_DOMAIN + ".rarchives.com/api.cgi" - + "?method=get_user" - + "&user=" + username - + "&count=" + count; - String gwURL, jsonString, imagePath; - JSONArray posts, images; - JSONObject json, post, image; - while (true) { - logger.info(" Retrieving posts by " + username); - gwURL = baseGwURL - + "&start=" + start; - start += count; - jsonString = Jsoup.connect(gwURL) - .ignoreContentType(true) - .execute() - .body(); - json = new JSONObject(jsonString); - if (json.has("error")) { - logger.error("Error while retrieving user posts:" + json.getString("error")); - break; - } - posts = json.getJSONArray("posts"); - if (posts.length() == 0) { - break; // No more posts to get - } - for (int i = 0; i < posts.length(); i++) { - post = (JSONObject) posts.get(i); - images = post.getJSONArray("images"); - for (int j = 0; j < images.length(); j++) { - image = (JSONObject) images.get(j); - imagePath = image.getString("path"); - if (imagePath.startsWith("..")) { - imagePath = imagePath.substring(2); - } - imagePath = "http://" + API_DOMAIN + ".rarchives.com" + imagePath; - logger.info(" Found file: " + imagePath); - addURLToDownload(new URL(imagePath)); - } - } - try { - Thread.sleep(SLEEP_TIME); - } catch (InterruptedException e) { - logger.error("[!] Interrupted while waiting to load more posts", e); - break; - } - } - waitForThreads(); - } - - @Override - public String getHost() { - return HOST; - } - - @Override - public String getGID(URL url) throws MalformedURLException { - Matcher m = getUsernameMatcher(url); - if (m.matches()) { - this.username = m.group(m.groupCount()); - } - return username; - } -} diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java index f8011ad2..a6c03458 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java @@ -16,6 +16,8 @@ import org.jsoup.nodes.Document; import com.rarchives.ripme.ripper.AbstractRipper; import com.rarchives.ripme.utils.RipUtils; +import com.rarchives.ripme.utils.Utils; +import java.net.SocketTimeoutException; public class RedditRipper extends AbstractRipper { @@ -26,7 +28,7 @@ public class RedditRipper extends AbstractRipper { private static final String HOST = "reddit"; private static final String DOMAIN = "reddit.com"; - private static final Logger logger = Logger.getLogger(GonewildRipper.class); + private static final Logger logger = Logger.getLogger(RedditRipper.class); private static final int SLEEP_TIME = 2000; //private static final String USER_AGENT = "ripme by /u/4_pr0n github.com/4pr0n/ripme"; @@ -67,6 +69,8 @@ public class RedditRipper extends AbstractRipper { waitForThreads(); } + + private URL getAndParseAndReturnNext(URL url) throws IOException { JSONArray jsonArray = getJsonArrayFromURL(url), children; JSONObject json, data; @@ -85,7 +89,7 @@ public class RedditRipper extends AbstractRipper { parseJsonChild(children.getJSONObject(j)); } if (data.has("after") && !data.isNull("after")) { - String nextURLString = url.toExternalForm(); + String nextURLString = Utils.stripURLParameter(url.toExternalForm(), "after"); if (nextURLString.contains("?")) { nextURLString = nextURLString.concat("&after=" + data.getString("after")); } @@ -111,11 +115,21 @@ public class RedditRipper extends AbstractRipper { } lastRequestTime = System.currentTimeMillis(); + int attempts = 0; + Document doc = null; logger.info(" Retrieving " + url); - Document doc= Jsoup.connect(url.toExternalForm()) - .ignoreContentType(true) - .userAgent(USER_AGENT) - .get(); + while(doc == null && attempts++ < 3) { + try { + doc= Jsoup.connect(url.toExternalForm()) + .ignoreContentType(true) + .userAgent(USER_AGENT) + .get(); + } catch(SocketTimeoutException ex) { + if(attempts >= 3) throw ex; + logger.warn(String.format("[!] Connection timed out (attempt %d)", attempts)); + } + } + String jsonString = doc.body().html().replaceAll(""", "\""); Object jsonObj = new JSONTokener(jsonString).nextValue(); diff --git a/src/main/java/com/rarchives/ripme/utils/Utils.java b/src/main/java/com/rarchives/ripme/utils/Utils.java index 8124c981..98039c26 100644 --- a/src/main/java/com/rarchives/ripme/utils/Utils.java +++ b/src/main/java/com/rarchives/ripme/utils/Utils.java @@ -83,6 +83,28 @@ public class Utils { } return prettySaveAs; } + + public static String stripURLParameter(String url, String parameter) { + int paramIndex = url.indexOf("?" + parameter); + boolean wasFirstParam = true; + if(paramIndex < 0) { + wasFirstParam = false; + paramIndex = url.indexOf("&" + parameter); + } + + if(paramIndex > 0) { + int nextParam = url.indexOf("&", paramIndex+1); + if(nextParam != -1) { + String c = "&"; + if(wasFirstParam) c = "?"; + url = url.substring(0, paramIndex) + c + url.substring(nextParam+1, url.length()); + } else { + url = url.substring(0, paramIndex); + } + } + + return url; + } /** * Removes the current working directory from a given filename diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/GonewildRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/GonewildRipperTest.java deleted file mode 100644 index f9f748ae..00000000 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/GonewildRipperTest.java +++ /dev/null @@ -1,31 +0,0 @@ -package com.rarchives.ripme.tst.ripper.rippers; - -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; - -import com.rarchives.ripme.ripper.rippers.GonewildRipper; - -public class GonewildRipperTest extends RippersTest { - - public void testInstagramAlbums() throws IOException { - if (!DOWNLOAD_CONTENT) { - return; - } - List contentURLs = new ArrayList(); - contentURLs.add(new URL("http://reddit.com/u/amle69")); - for (URL url : contentURLs) { - try { - GonewildRipper ripper = new GonewildRipper(url); - ripper.rip(); - assert(ripper.getWorkingDir().listFiles().length > 1); - deleteDir(ripper.getWorkingDir()); - } catch (Exception e) { - e.printStackTrace(); - fail("Error while ripping URL " + url + ": " + e.getMessage()); - } - } - } - -}