From 16e0d27f66e714f598b86e39576004ed41a6c022 Mon Sep 17 00:00:00 2001 From: Wiiplay123 Date: Fri, 28 Nov 2014 22:59:39 -0600 Subject: [PATCH] Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. --- .../ripme/ripper/AbstractHTMLRipper.java | 75 +++++++++++++++++-- .../ripme/ripper/AbstractRipper.java | 42 +---------- .../ripper/rippers/DeviantartRipper.java | 57 +++++++++++++- 3 files changed, 124 insertions(+), 50 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java index be73c717..6953704c 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java @@ -1,5 +1,7 @@ package com.rarchives.ripme.ripper; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; @@ -14,7 +16,7 @@ import com.rarchives.ripme.utils.Utils; * Simplified ripper, designed for ripping from sites by parsing HTML. */ public abstract class AbstractHTMLRipper extends AlbumRipper { - + public AbstractHTMLRipper(URL url) throws IOException { super(url); } @@ -27,6 +29,9 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { throw new IOException("getNextPage not implemented"); } public abstract List getURLsFromPage(Document page); + public List getDescriptionsFromPage(Document doc) throws IOException { + throw new IOException("getDescriptionsFromPage not implemented"); // Do I do this or make an abstract function? 
+ } public abstract void downloadURL(URL url, int index); public DownloadThreadPool getThreadPool() { return null; @@ -45,21 +50,27 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { public URL sanitizeURL(URL url) throws MalformedURLException { return url; } - + public boolean hasDescriptionSupport() { + return false; + } + public String getDescription(String page) throws IOException { + throw new IOException("getDescription not implemented"); // Do I do this or make an abstract function? + } @Override public void rip() throws IOException { int index = 0; + int textindex = 0; logger.info("Retrieving " + this.url); sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm()); Document doc = getFirstPage(); - + while (doc != null) { List imageURLs = getURLsFromPage(doc); if (imageURLs.size() == 0) { throw new IOException("No images found at " + doc.location()); } - + for (String imageURL : imageURLs) { if (isStopped()) { break; @@ -67,6 +78,21 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { index += 1; downloadURL(new URL(imageURL), index); } + if (hasDescriptionSupport()) { + List textURLs = getDescriptionsFromPage(doc); + if (textURLs.size() > 0) { + for (String textURL : textURLs) { + if (isStopped()) { + break; + } + textindex += 1; + String tempDesc = getDescription(textURL); + if (tempDesc != null) { + saveText(new URL(textURL), "", tempDesc, textindex); + } + } + } + } if (isStopped()) { break; @@ -87,7 +113,46 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { } waitForThreads(); } - + public boolean saveText(URL url, String subdirectory, String text, int index) { + try { + stopCheck(); + } catch (IOException e) { + return false; + } + String saveAs = url.toExternalForm(); + saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1); + if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); } + if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); } + if 
(saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); } + if (saveAs.indexOf(':') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf(':')); } + File saveFileAs; + try { + if (!subdirectory.equals("")) { // Not sure about this part + subdirectory = File.separator + subdirectory; + } + // TODO Get prefix working again, probably requires reworking a lot of stuff! + saveFileAs = new File( + workingDir.getCanonicalPath() + + subdirectory + + File.separator + + getPrefix(index) + + saveAs + + ".txt"); + // Write the file + FileOutputStream out = (new FileOutputStream(saveFileAs)); + out.write(text.getBytes()); + out.close(); + } catch (IOException e) { + logger.error("[!] Error creating save file path for description '" + url + "':", e); + return false; + } + logger.debug("Downloading " + url + "'s description to " + saveFileAs); + if (!saveFileAs.getParentFile().exists()) { + logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent())); + saveFileAs.getParentFile().mkdirs(); + } + return true; + } public String getPrefix(int index) { String prefix = ""; if (keepSortOrder() && Utils.getConfigBoolean("download.save_order", true)) { diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java index 71cfe86e..33b5fd45 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java @@ -2,7 +2,6 @@ package com.rarchives.ripme.ripper; import java.awt.Desktop; import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.lang.reflect.Constructor; import java.net.MalformedURLException; @@ -132,46 +131,7 @@ public abstract class AbstractRipper } return addURLToDownload(url, saveFileAs, referrer, cookies); } - public boolean saveText(URL url, String subdirectory, String referrer, Map cookies, String text) { - try { - stopCheck(); - } catch 
(IOException e) { - return false; - } - String saveAs = url.toExternalForm(); - saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1); - if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); } - if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); } - if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); } - if (saveAs.indexOf(':') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf(':')); } - File saveFileAs; - try { - if (!subdirectory.equals("")) { - subdirectory = File.separator + subdirectory; - } - // TODO Get prefix working again, probably requires reworking a lot of stuff! - saveFileAs = new File( - workingDir.getCanonicalPath() - + subdirectory - // + prefix - + File.separator - + saveAs - + ".txt"); - // Write the file - FileOutputStream out = (new FileOutputStream(saveFileAs)); - out.write(text.getBytes()); - out.close(); - } catch (IOException e) { - logger.error("[!] Error creating save file path for description '" + url + "':", e); - return false; - } - logger.debug("Downloading " + url + "'s description to " + saveFileAs); - if (!saveFileAs.getParentFile().exists()) { - logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent())); - saveFileAs.getParentFile().mkdirs(); - } - return true; - } + /** * Queues file to be downloaded and saved. With options. 
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java index 2c88c690..4e9c51c8 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java @@ -43,7 +43,10 @@ public class DeviantartRipper extends AbstractHTMLRipper { public String getDomain() { return "deviantart.com"; } - + @Override + public boolean hasDescriptionSupport() { + return true; + } @Override public URL sanitizeURL(URL url) throws MalformedURLException { String u = url.toExternalForm(); @@ -118,7 +121,6 @@ public class DeviantartRipper extends AbstractHTMLRipper { logger.info("Attempting to get full size image from " + thumb.attr("href")); fullSize = smallToFull(img.attr("src"), thumb.attr("href")); } - if (fullSize == null) { continue; } @@ -131,7 +133,23 @@ public class DeviantartRipper extends AbstractHTMLRipper { } return imageURLs; } - + @Override + public List getDescriptionsFromPage(Document page) { + List textURLs = new ArrayList(); + + // Iterate over all thumbnails + for (Element thumb : page.select("div.zones-container a.thumb")) { + if (isStopped()) { + break; + } + Element img = thumb.select("img").get(0); + if (img.attr("transparent").equals("false")) { + continue; // a.thumbs to other albums are invisible + } + textURLs.add(thumb.attr("href")); + } + return textURLs; + } @Override public Document getNextPage(Document page) throws IOException { Elements nextButtons = page.select("li.next > a"); @@ -184,7 +202,38 @@ public class DeviantartRipper extends AbstractHTMLRipper { } return result.toString(); } + + /** + * Attempts to download description for image. + * Comes in handy when people put entire stories in their description. + * If no description was found, returns null. 
+ * @param page The page the description will be retrieved from + * @return The description + */ + @Override + public String getDescription(String page) { + try { + // Fetch the image page + Response resp = Http.url(page) + .referrer(this.url) + .cookies(cookies) + .response(); + cookies.putAll(resp.cookies()); + // Try to find the description box + Elements els = resp.parse().select("div[class=dev-description]"); + if (els.size() == 0) { + throw new IOException("No description found"); + } + // Extract the description text + String desc = els.text(); // TODO Figure out how to preserve newlines + return desc; + } catch (IOException ioe) { + logger.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'"); + return null; + } + } + /** * If largest resolution for image at 'thumb' is found, starts downloading * and returns null. @@ -202,7 +251,7 @@ public class DeviantartRipper extends AbstractHTMLRipper { .response(); cookies.putAll(resp.cookies()); - // Try to find the "Download" box + // Try to find the "Download" link Elements els = resp.parse().select("a.dev-page-download"); if (els.size() == 0) { throw new IOException("No download page found");