From c8cfc57e5bce399befa51de5a0e508065c1c4bf7 Mon Sep 17 00:00:00 2001
From: 4pr0n
Date: Tue, 11 Mar 2014 01:29:59 -0700
Subject: [PATCH] Reddit support

---
 .../ripme/ripper/rippers/RedditRipper.java    | 218 ++++++++++++++++++
 .../com/rarchives/ripme/utils/RipUtils.java   |  50 ++++
 .../tst/ripper/rippers/RedditRipperTest.java  |  34 +++
 3 files changed, 302 insertions(+)
 create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java
 create mode 100644 src/main/java/com/rarchives/ripme/utils/RipUtils.java
 create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/RedditRipperTest.java

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java
new file mode 100644
index 00000000..f8011ad2
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java
@@ -0,0 +1,218 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.json.JSONTokener;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import com.rarchives.ripme.ripper.AbstractRipper;
+import com.rarchives.ripme.utils.RipUtils;
+
+/** Ripper for reddit.com users, subreddits, and individual posts (via the .json API). */
+public class RedditRipper extends AbstractRipper {
+
+    public RedditRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    private static final String HOST = "reddit";
+    private static final String DOMAIN = "reddit.com";
+
+    private static final Logger logger = Logger.getLogger(RedditRipper.class); // was GonewildRipper.class (copy-paste bug)
+    private static final int SLEEP_TIME = 2000; // min. milliseconds between reddit API requests
+
+    //private static final String USER_AGENT = "ripme by /u/4_pr0n github.com/4pr0n/ripme";
+
+    private long lastRequestTime = 0; // timestamp of the previous API request, for rate-limiting
+
+    @Override
+    public boolean canRip(URL url) {
+        return url.getHost().endsWith(DOMAIN);
+    }
+
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        String u = url.toExternalForm();
+        // Normalize '/u/' shorthand to the canonical '/user/' path
+        u = u.replaceAll("reddit\\.com/u/", "reddit.com/user/");
+        return new URL(u);
+    }
+
+    private URL getJsonURL(URL url) throws MalformedURLException {
+        // Append ".json" to the path (before any query string) to hit reddit's JSON API.
+        String result = url.getProtocol() + "://" + url.getHost() + url.getPath() + ".json";
+        if (url.getQuery() != null) {
+            result += "?" + url.getQuery();
+        }
+        return new URL(result);
+    }
+
+    @Override
+    public void rip() throws IOException {
+        URL jsonURL = getJsonURL(this.url);
+        while (true) {
+            jsonURL = getAndParseAndReturnNext(jsonURL); // null when there is no next page
+            if (jsonURL == null) {
+                break;
+            }
+        }
+        waitForThreads();
+    }
+
+    private URL getAndParseAndReturnNext(URL url) throws IOException {
+        JSONArray jsonArray = getJsonArrayFromURL(url), children;
+        JSONObject json, data;
+        URL nextURL = null;
+        for (int i = 0; i < jsonArray.length(); i++) {
+            json = jsonArray.getJSONObject(i);
+            if (!json.has("data")) {
+                continue;
+            }
+            data = json.getJSONObject("data");
+            if (!data.has("children")) {
+                continue;
+            }
+            children = data.getJSONArray("children");
+            for (int j = 0; j < children.length(); j++) {
+                parseJsonChild(children.getJSONObject(j));
+            }
+            if (data.has("after") && !data.isNull("after")) {
+                String nextURLString = url.toExternalForm(); // NOTE(review): may already contain an 'after=' param; repeated pages accumulate them — verify reddit honors the last one
+                if (nextURLString.contains("?")) {
+                    nextURLString = nextURLString.concat("&after=" + data.getString("after"));
+                }
+                else {
+                    nextURLString = nextURLString.concat("?after=" + data.getString("after"));
+                }
+                nextURL = new URL(nextURLString);
+            }
+        }
+        return nextURL;
+    }
+
+    private JSONArray getJsonArrayFromURL(URL url) throws IOException {
+        // Rate-limit: wait until SLEEP_TIME ms have passed since the previous request
+        long timeDiff = System.currentTimeMillis() - lastRequestTime;
+        if (timeDiff < SLEEP_TIME) {
+            try {
+                Thread.sleep(SLEEP_TIME - timeDiff); // was sleep(timeDiff): slept the elapsed time, not the remaining time
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt(); // preserve the interrupt status for callers
+                logger.warn("[!] Interrupted while waiting to load next page", e);
+                return new JSONArray();
+            }
+        }
+        lastRequestTime = System.currentTimeMillis();
+
+        logger.info("    Retrieving " + url);
+        Document doc = Jsoup.connect(url.toExternalForm())
+                            .ignoreContentType(true)
+                            .userAgent(USER_AGENT)
+                            .get();
+        String jsonString = doc.body().html().replaceAll("&quot;", "\""); // jsoup HTML-escapes the raw JSON; undo the quote entities
+
+        Object jsonObj = new JSONTokener(jsonString).nextValue();
+        JSONArray jsonArray = new JSONArray();
+        if (jsonObj instanceof JSONObject) {
+            jsonArray.put( (JSONObject) jsonObj);
+        } else if (jsonObj instanceof JSONArray){
+            jsonArray = (JSONArray) jsonObj;
+        } else {
+            logger.warn("[!] Unable to parse child: " + jsonString);
+        }
+        return jsonArray;
+    }
+
+    private void parseJsonChild(JSONObject child) {
+        String kind = child.getString("kind");
+        JSONObject data = child.getJSONObject("data");
+        if (kind.equals("t1")) {
+            // Comment: scan the body text for URLs
+            handleBody(data.getString("body"), data.getString("id"));
+        }
+        else if (kind.equals("t3")) {
+            // Post (link or self-post)
+            if (data.getBoolean("is_self")) {
+                // TODO Parse self text
+                handleBody(data.getString("selftext"), data.getString("id"));
+            } else {
+                // Get link
+                handleURL(data.getString("url"), data.getString("id"));
+            }
+            if (data.has("replies") && data.get("replies") instanceof JSONObject) { // NOTE(review): reddit puts 'replies' on t1 comments, not t3 posts — confirm this branch placement
+                JSONArray replies = data.getJSONObject("replies")
+                                        .getJSONObject("data")
+                                        .getJSONArray("children");
+                for (int i = 0; i < replies.length(); i++) {
+                    parseJsonChild(replies.getJSONObject(i));
+                }
+            }
+        }
+    }
+
+    // Extract every URL from a block of comment/selftext and hand each to handleURL.
+    public void handleBody(String body, String id) {
+        Pattern p = RipUtils.getURLRegex();
+        Matcher m = p.matcher(body);
+        while (m.find()) {
+            handleURL(m.group(1), id);
+        }
+    }
+
+    // Resolve a linked URL into downloadable file URLs, prefixed with the post/comment id.
+    public void handleURL(String theUrl, String id) {
+        URL originalURL;
+        try {
+            originalURL = new URL(theUrl);
+        } catch (MalformedURLException e) {
+            return; // not a URL; nothing to download
+        }
+
+        List<URL> urls = RipUtils.getFilesFromURL(originalURL);
+        if (urls.size() == 1) {
+            addURLToDownload(urls.get(0), id + "-");
+        } else if (urls.size() > 1) {
+            for (int i = 0; i < urls.size(); i++) {
+                addURLToDownload(urls.get(i), id + String.format("-%03d-", i + 1)); // 1-based, zero-padded index
+            }
+        }
+    }
+
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        // User
+        Pattern p = Pattern.compile("^https?://[a-zA-Z0-9\\.]{0,4}reddit\\.com/(user|u)/([a-zA-Z0-9_\\-]{3,}).*$");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            return "user_" + m.group(m.groupCount());
+        }
+
+        // Post
+        p = Pattern.compile("^https?://[a-zA-Z0-9\\.]{0,4}reddit\\.com/.*comments/([a-zA-Z0-9]{1,8}).*$");
+        m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            return "post_" + m.group(m.groupCount());
+        }
+
+        // Subreddit
+        p = Pattern.compile("^https?://[a-zA-Z0-9\\.]{0,4}reddit\\.com/r/([a-zA-Z0-9_]{1,}).*$");
+        m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            return "sub_" + m.group(m.groupCount());
+        }
+
+        throw new MalformedURLException("Only accepts user pages, subreddits, or post, can't understand " + url);
+    }
+
+}
diff --git a/src/main/java/com/rarchives/ripme/utils/RipUtils.java b/src/main/java/com/rarchives/ripme/utils/RipUtils.java
new file mode 100644
index 00000000..9a055e0e
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/utils/RipUtils.java
@@ -0,0 +1,50 @@
+package com.rarchives.ripme.utils;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+
+import com.rarchives.ripme.ripper.rippers.ImgurRipper;
+
+public class RipUtils {
+    private static final Logger logger = Logger.getLogger(RipUtils.class);
+
+    // Resolve a linked URL into the list of file URLs it contains (album contents or the file itself).
+    public static List<URL> getFilesFromURL(URL url) {
+        List<URL> result = new ArrayList<URL>();
+
+        // Imgur album
+        if (url.getHost().equals("imgur.com") && url.toExternalForm().contains("imgur.com/a/")) {
+            try {
+                return ImgurRipper.getURLsFromAlbum(url);
+            } catch (IOException e) {
+                logger.error("[!] Exception while loading album " + url, e);
+            }
+        }
+
+        // Direct link to image
+        Pattern p = Pattern.compile("(https?://[a-zA-Z0-9\\-\\.]+\\.[a-zA-Z]{2,3}(/\\S*)\\.(jpg|jpeg|gif|png|mp4))");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            try {
+                URL singleURL = new URL(m.group(1));
+                result.add(singleURL);
+                return result;
+            } catch (MalformedURLException e) {
+                logger.error("[!] Not a valid URL: '" + url + "'", e);
+            }
+        }
+
+        logger.error("[!] Unable to rip URL: " + url);
+        return result;
+    }
+
+    // Regex matching http(s) URLs embedded in free text (group 1 is the full URL).
+    public static Pattern getURLRegex() {
+        return Pattern.compile("(https?://[a-zA-Z0-9\\-\\.]+\\.[a-zA-Z]{2,3}(/\\S*))");
+    }
+}
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RedditRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RedditRipperTest.java
new file mode 100644
index 00000000..d478a4b9
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RedditRipperTest.java
@@ -0,0 +1,34 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.RedditRipper;
+
+public class RedditRipperTest extends RippersTest {
+
+    public void testRedditAlbums() throws IOException {
+        if (!DOWNLOAD_CONTENT) { // was 'false && !DOWNLOAD_CONTENT' — dead guard forced the network test to always run
+            return;
+        }
+        List<URL> contentURLs = new ArrayList<URL>();
+        //contentURLs.add(new URL("http://www.reddit.com/r/nsfw_oc"));
+        //contentURLs.add(new URL("http://www.reddit.com/r/nsfw_oc/top?t=all"));
+        //contentURLs.add(new URL("http://www.reddit.com/u/gingerpuss"));
+        contentURLs.add(new URL("http://www.reddit.com/r/UnrealGirls/comments/1ziuhl/in_class_veronique_popa/"));
+        for (URL url : contentURLs) {
+            try {
+                RedditRipper ripper = new RedditRipper(url);
+                ripper.rip();
+                assert(ripper.getWorkingDir().listFiles().length > 1);
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                e.printStackTrace();
+                fail("Error while ripping URL " + url + ": " + e.getMessage());
+            }
+        }
+    }
+
+}