From 04ce12b546fe775101cc880cb072c3c284014626 Mon Sep 17 00:00:00 2001 From: 4pr0n Date: Fri, 28 Feb 2014 03:04:03 -0800 Subject: [PATCH] More imgur ripping support. Better logging. Can rip imgur account pages, series of imgur images (separated by commas). Also gets highest-res imgur images. Logging is less verbose and more human-readable. --- .../ripme/ripper/AbstractRipper.java | 71 ++++++++---- .../ripme/ripper/DownloadFileThread.java | 10 +- .../ripme/ripper/DownloadThreadPool.java | 1 - .../ripme/ripper/rippers/ImgurRipper.java | 106 ++++++++++++++++-- .../java/com/rarchives/ripme/utils/Utils.java | 23 ++++ 5 files changed, 178 insertions(+), 33 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java index d398d187..336f6256 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java @@ -42,28 +42,32 @@ public abstract class AbstractRipper implements RipperInterface { this.threadPool = new DownloadThreadPool(); } + /** + * Queues image to be downloaded and saved. + * Uses filename from URL to decide filename. + * @param url + * URL to download + */ public void addURLToDownload(URL url) { - addURLToDownload(url, ""); + // Use empty prefix and empty subdirectory + addURLToDownload(url, "", ""); } - public void addURLToDownload(URL url, String prefix) { - String saveAs = url.toExternalForm(); - saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1); - if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); } - if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); } - if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); } - File saveFileAs; - try { - saveFileAs = new File(workingDir.getCanonicalPath() + File.separator + prefix + saveAs); - } catch (IOException e) { - logger.error("[!] 
Error creating save file path for URL '" + url + "':", e); - return; - } - logger.debug("Downloading " + url + " to " + saveFileAs); - addURLToDownload(url, saveFileAs); - } /** - * Add image to be downloaded and saved. + * Queues image to be downloaded and saved. + * Uses filename from URL (and 'prefix') to decide filename. + * @param url + * URL to download + * @param prefix + * Text to prepend to saved filename. + */ + public void addURLToDownload(URL url, String prefix) { + // Use empty subdirectory + addURLToDownload(url, prefix, ""); + } + + /** + * Queues image to be downloaded and saved. * @param url * URL of the file * @param saveAs @@ -73,6 +77,35 @@ public abstract class AbstractRipper implements RipperInterface { threadPool.addThread(new DownloadFileThread(url, saveAs)); } + public void addURLToDownload(URL url, String prefix, String subdirectory) { + String saveAs = url.toExternalForm(); + saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1); + if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); } + if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); } + if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); } + File saveFileAs; + try { + if (!subdirectory.equals("")) { + subdirectory = File.separator + subdirectory; + } + saveFileAs = new File( + workingDir.getCanonicalPath() + + subdirectory + + File.separator + + prefix + + saveAs); + } catch (IOException e) { + logger.error("[!] 
Error creating save file path for URL '" + url + "':", e); + return; + } + logger.debug("Downloading " + url + " to " + saveFileAs); + if (!saveFileAs.getParentFile().exists()) { + logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent())); + saveFileAs.getParentFile().mkdirs(); + } + addURLToDownload(url, saveFileAs); + } + public URL getURL() { return url; } @@ -85,7 +118,7 @@ public abstract class AbstractRipper implements RipperInterface { path += getHost() + "_" + getGID(this.url) + File.separator; this.workingDir = new File(path); if (!this.workingDir.exists()) { - logger.info("[+] Creating working directory: " + this.workingDir); + logger.info("[+] Creating directory: " + Utils.removeCWD(this.workingDir)); this.workingDir.mkdirs(); } logger.debug("Set working directory to: " + this.workingDir); diff --git a/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java index 820a12b3..8ab3654f 100644 --- a/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java +++ b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java @@ -17,12 +17,14 @@ public class DownloadFileThread extends Thread { private URL url; private File saveAs; + private String prettySaveAs; private int retries; public DownloadFileThread(URL url, File saveAs) { super(); this.url = url; this.saveAs = saveAs; + this.prettySaveAs = Utils.removeCWD(saveAs); this.retries = Utils.getConfigInteger("download.retries", 1); } @@ -30,10 +32,10 @@ public class DownloadFileThread extends Thread { // Check if file already exists if (saveAs.exists()) { if (Utils.getConfigBoolean("file.overwrite", false)) { - logger.info("[!] File already exists and 'file.overwrite' is true, deleting: " + saveAs); + logger.info("[!] File already exists and 'file.overwrite' is true, deleting: " + prettySaveAs); saveAs.delete(); } else { - logger.info("[!] 
Not downloading " + url + " because file already exists: " + saveAs); + logger.info("[!] Skipping " + url + " -- file already exists: " + prettySaveAs); return; } } @@ -41,7 +43,7 @@ public class DownloadFileThread extends Thread { int tries = 0; // Number of attempts to download do { try { - logger.info("[ ] Downloading file from: " + url + (tries > 0 ? " Retry #" + tries : "")); + logger.info(" Downloading file: " + url + (tries > 0 ? " Retry #" + tries : "")); tries += 1; Response response; response = Jsoup.connect(url.toExternalForm()) @@ -59,7 +61,7 @@ public class DownloadFileThread extends Thread { return; } } while (true); - logger.info("[+] Download completed: " + url); + logger.info("[+] Saved " + url + " as " + this.prettySaveAs); } } \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java b/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java index 1c5a0ac6..e0a93404 100644 --- a/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java +++ b/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java @@ -24,7 +24,6 @@ public class DownloadThreadPool { } public void waitForThreads() { - logger.info("[ ] Waiting for threads to finish..."); threadPool.shutdown(); try { threadPool.awaitTermination(60, TimeUnit.SECONDS); diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java index 2e8a5406..65ebd7a4 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java @@ -7,6 +7,9 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -19,6 +22,8 @@ public class ImgurRipper extends 
AbstractRipper { HOST = "imgur"; private static final Logger logger = Logger.getLogger(ImgurRipper.class); + private final int SLEEP_BETWEEN_ALBUMS; + static enum ALBUM_TYPE { ALBUM, USER, @@ -29,11 +34,12 @@ public class ImgurRipper extends AbstractRipper { public ImgurRipper(URL url) throws IOException { super(url); + SLEEP_BETWEEN_ALBUMS = 1; } - public void processURL(URL url, String prefix) { + public void processURL(URL url, String prefix, String subdirectory) { logger.debug("Found URL: " + url); - addURLToDownload(url, prefix); + addURLToDownload(url, prefix, subdirectory); } public boolean canRip(URL url) { @@ -61,27 +67,83 @@ public class ImgurRipper extends AbstractRipper { public void rip() throws IOException { switch (albumType) { case ALBUM: - this.url = new URL(this.url.toExternalForm() + "/noscript"); + this.url = new URL(this.url.toExternalForm()); // Fall-through case USER_ALBUM: ripAlbum(this.url); break; case SERIES_OF_IMAGES: - // TODO Get all images + ripAlbum(this.url); break; case USER: // TODO Get all albums by user + ripUserAccount(url); break; } threadPool.waitForThreads(); } private void ripAlbum(URL url) throws IOException { + ripAlbum(url, ""); + } + + private void ripAlbum(URL url, String subdirectory) throws IOException { int index = 0; - logger.info("[ ] Retrieving " + url.toExternalForm()); + logger.info(" Retrieving " + url.toExternalForm()); Document doc = Jsoup.connect(url.toExternalForm()).get(); + + // Try to use embedded JSON to retrieve images + Pattern p = Pattern.compile("^.*Imgur\\.Album\\.getInstance\\((.*)\\);.*$", Pattern.DOTALL); + Matcher m = p.matcher(doc.body().html()); + if (m.matches()) { + try { + JSONObject json = new JSONObject(m.group(1)); + JSONArray images = json.getJSONObject("images").getJSONArray("items"); + int imagesLength = images.length(); + for (int i = 0; i < imagesLength; i++) { + JSONObject image = images.getJSONObject(i); + URL imageURL = new URL( + // CDN url is provided elsewhere in the 
document + "http://i.imgur.com/" + + image.get("hash") + + image.get("ext")); + index += 1; + processURL(imageURL, String.format("%03d_", index), subdirectory); + } + return; + } catch (JSONException e) { + logger.debug("Error while parsing JSON at " + url + ", continuing", e); + } + } + p = Pattern.compile("^.*= new ImgurShare\\((.*)\\);.*$", Pattern.DOTALL); + m = p.matcher(doc.body().html()); + if (m.matches()) { + try { + JSONObject json = new JSONObject(m.group(1)); + JSONArray images = json.getJSONArray("hashes"); + int imagesLength = images.length(); + for (int i = 0; i < imagesLength; i++) { + JSONObject image = images.getJSONObject(i); + URL imageURL = new URL( + "http:" + json.get("cdnUrl") + + "/" + + image.get("hash") + + image.get("ext")); + index += 1; + processURL(imageURL, String.format("%03d_", index), subdirectory); + } + return; + } catch (JSONException e) { + logger.debug("Error while parsing JSON at " + url + ", continuing", e); + } + } + + logger.info("[!] Falling back to elemental retrieval method"); + + // Fall back to parsing HTML elements + // NOTE: This does not always get the highest-resolution images! for (Element thumb : doc.select("div.image")) { String image; if (thumb.select("a.zoom").size() > 0) { @@ -95,7 +157,33 @@ public class ImgurRipper extends AbstractRipper { continue; } index += 1; - processURL(new URL(image), String.format("%03d_", index)); + processURL(new URL(image), String.format("%03d_", index), subdirectory); + } + } + + /** + * Rips all albums in an imgur user's account. 
+ * @param url + * URL to imgur user account (http://username.imgur.com) + * @throws IOException + */ + private void ripUserAccount(URL url) throws IOException { + logger.info("[ ] Retrieving " + url.toExternalForm()); + Document doc = Jsoup.connect(url.toExternalForm()).get(); + for (Element album : doc.select("div.cover a")) { + if (!album.hasAttr("href") + || !album.attr("href").contains("imgur.com/a/")) { + continue; + } + String albumID = album.attr("href").substring(album.attr("href").lastIndexOf('/') + 1); + URL albumURL = new URL("http:" + album.attr("href") + "/noscript"); + try { + ripAlbum(albumURL, albumID); + Thread.sleep(SLEEP_BETWEEN_ALBUMS * 1000); + } catch (Exception e) { + logger.error("Error while ripping album: " + e.getMessage(), e); + continue; + } } } @@ -115,12 +203,12 @@ public class ImgurRipper extends AbstractRipper { this.url = new URL("http://imgur.com/a/" + gid); return gid; } - p = Pattern.compile("^https?://([a-zA-Z0-9\\-])\\.imgur\\.com/?$"); + p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.imgur\\.com/?$"); m = p.matcher(url.toExternalForm()); if (m.matches()) { // Root imgur account albumType = ALBUM_TYPE.USER; - return m.group(m.groupCount()); + return m.group(1); } p = Pattern.compile("^https?://([a-zA-Z0-9\\-])\\.imgur\\.com/([a-zA-Z0-9])?$"); m = p.matcher(url.toExternalForm()); @@ -134,7 +222,7 @@ public class ImgurRipper extends AbstractRipper { if (m.matches()) { // Series of imgur images albumType = ALBUM_TYPE.SERIES_OF_IMAGES; - return m.group(); + return m.group(m.groupCount()).replaceAll(",", "-"); } throw new MalformedURLException("Unexpected URL format: " + url.toExternalForm()); } diff --git a/src/main/java/com/rarchives/ripme/utils/Utils.java b/src/main/java/com/rarchives/ripme/utils/Utils.java index 2c8e917a..623b492d 100644 --- a/src/main/java/com/rarchives/ripme/utils/Utils.java +++ b/src/main/java/com/rarchives/ripme/utils/Utils.java @@ -49,4 +49,27 @@ public class Utils { config.setProperty(key, value); 
} + /** + * Removes the current working directory (CWD) from a File. + * @param saveAs + * The File path + * @return + * saveAs in relation to the CWD + */ + public static String removeCWD(File saveAs) { + String prettySaveAs; + try { + String cwd = new File(".").getCanonicalPath() + File.separator; + prettySaveAs = saveAs.getCanonicalPath().replace( + cwd, + ""); + } catch (Exception e) { + prettySaveAs = saveAs.toString(); + } + return prettySaveAs; + } + + public static String removeCWD(String file) { + return removeCWD(new File(file)); + } } \ No newline at end of file