Improved description ripping for deviantART

Also added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() should be overridden to return true by any ripper that supports descriptions; doing so triggers description ripping during rip(). getDescriptionsFromPage() collects the URLs of the pages whose descriptions should be saved, and getDescription() grabs the description from one such page; both must be overridden if you want to grab descriptions with a ripper.
2014-11-28 22:59:39 -06:00 · 2014-11-28 22:59:39 -06:00 · 16e0d27f66
commit 16e0d27f66
parent 72b40394ae
3 changed files with 124 additions and 50 deletions
--- a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java
@ -1,5 +1,7 @@
 package com.rarchives.ripme.ripper;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
@ -27,6 +29,9 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
        throw new IOException("getNextPage not implemented");
    }
    public abstract List<String> getURLsFromPage(Document page);
    /**
     * Collects the URLs of the pages whose descriptions should be saved for the given page.
     * <p>
     * {@code rip()} only calls this when {@link #hasDescriptionSupport()} returns true, and
     * passes each returned URL to {@code getDescription(String)}. This default always fails,
     * so a ripper that enables description support must override it.
     * <p>
     * Open design question from the original author: keep throwing here, or make the
     * method abstract instead?
     *
     * @param doc the album page currently being ripped
     * @return never returns normally in this default implementation
     * @throws IOException always, with the message "getDescriptionsFromPage not implemented"
     */
    public List<String> getDescriptionsFromPage(Document doc) throws IOException {
        throw new IOException("getDescriptionsFromPage not implemented");
    }
    public abstract void downloadURL(URL url, int index);
    public DownloadThreadPool getThreadPool() {
        return null;
@ -45,10 +50,16 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
    public URL sanitizeURL(URL url) throws MalformedURLException {
        return url;
    }
-
+    public boolean hasDescriptionSupport() {
        // Default for rippers that do not save descriptions. rip() only calls
        // getDescriptionsFromPage() and getDescription() when this returns true,
        // so a subclass that implements those two methods overrides this to return true.
 		return false;
    }
    /**
     * Retrieves the description text found at the given page URL.
     * <p>
     * {@code rip()} calls this once for every URL returned by
     * {@code getDescriptionsFromPage(Document)} and saves the result when it is not null.
     * This default always fails, so a ripper that enables description support must
     * override it.
     * <p>
     * Open design question from the original author: keep throwing here, or make the
     * method abstract instead?
     *
     * @param page URL of the page to read the description from
     * @return never returns normally in this default implementation
     * @throws IOException always, with the message "getDescription not implemented"
     */
    public String getDescription(String page) throws IOException {
        throw new IOException("getDescription not implemented");
    }
    @Override
    public void rip() throws IOException {
        int index = 0;
        int textindex = 0;
        logger.info("Retrieving " + this.url);
        sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm());
        Document doc = getFirstPage();
@ -67,6 +78,21 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
                index += 1;
                downloadURL(new URL(imageURL), index);
            }
            if (hasDescriptionSupport()) {
            	List<String> textURLs = getDescriptionsFromPage(doc);
            	if (textURLs.size() > 0) {
            		for (String textURL : textURLs) {
            			if (isStopped()) {
            				break;
            			}
            			textindex += 1;
            			String tempDesc = getDescription(textURL);
            			if (tempDesc != null) {
            				saveText(new URL(textURL), "", tempDesc, textindex);
            			}
            		}
            	}
            }
            if (isStopped()) {
                break;
@ -87,7 +113,46 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
        }
        waitForThreads();
    }
-
+    /**
     * Writes a description to a ".txt" file below the working directory.
     * <p>
     * The file name is the last path segment of {@code url}, cut at the first
     * '?', '#', '&amp;' or ':' (checked in that order), preceded by the prefix
     * returned by {@code getPrefix(index)}. The target directory is created
     * before the file is opened, and the text is encoded as UTF-8.
     *
     * @param url          URL the description belongs to; used for the file name and log messages
     * @param subdirectory directory below the working directory, or "" to save directly into it
     * @param text         the description to write
     * @param index        position of the item, passed to {@code getPrefix(int)}
     * @return true if the file was written; false if {@code stopCheck()} threw
     *         or an I/O error occurred while building the path or writing the file
     */
    public boolean saveText(URL url, String subdirectory, String text, int index) {
        try {
            stopCheck();
        } catch (IOException e) {
            return false;
        }
        // Derive the file name from the last path segment, dropping query/fragment-like tails.
        String saveAs = url.toExternalForm();
        saveAs = saveAs.substring(saveAs.lastIndexOf('/') + 1);
        for (char delimiter : new char[] {'?', '#', '&', ':'}) {
            int cut = saveAs.indexOf(delimiter);
            if (cut >= 0) {
                saveAs = saveAs.substring(0, cut);
            }
        }
        File saveFileAs;
        try {
            if (!subdirectory.equals("")) {
                subdirectory = File.separator + subdirectory;
            }
            saveFileAs = new File(
                    workingDir.getCanonicalPath()
                    + subdirectory
                    + File.separator
                    + getPrefix(index)
                    + saveAs
                    + ".txt");
            // The directory must exist before the stream is opened; creating it
            // afterwards makes FileOutputStream fail for a not-yet-existing subdirectory.
            File parentDir = saveFileAs.getParentFile();
            if (!parentDir.exists()) {
                logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent()));
                parentDir.mkdirs();
            }
            logger.debug("Downloading " + url + "'s description to " + saveFileAs);
            // Write the file; close the stream even when the write fails.
            FileOutputStream out = new FileOutputStream(saveFileAs);
            try {
                // Explicit charset so the output does not depend on the platform default.
                out.write(text.getBytes("UTF-8"));
            } finally {
                out.close();
            }
        } catch (IOException e) {
            logger.error("[!] Error creating save file path for description '" + url + "':", e);
            return false;
        }
        return true;
    }
    public String getPrefix(int index) {
        String prefix = "";
        if (keepSortOrder() && Utils.getConfigBoolean("download.save_order", true)) {
--- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
@ -2,7 +2,6 @@ package com.rarchives.ripme.ripper;
 import java.awt.Desktop;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.lang.reflect.Constructor;
 import java.net.MalformedURLException;
@ -132,46 +131,7 @@ public abstract class AbstractRipper
        }
        return addURLToDownload(url, saveFileAs, referrer, cookies);
    }
-    public boolean saveText(URL url, String subdirectory, String referrer, Map<String,String> cookies, String text) {
+    
        try {
            stopCheck();
        } catch (IOException e) {
            return false;
        }
        String saveAs = url.toExternalForm();
        saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1);
        if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); }
        if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); }
        if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); }
        if (saveAs.indexOf(':') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf(':')); }
        File saveFileAs;
        try {
            if (!subdirectory.equals("")) {
                subdirectory = File.separator + subdirectory;
            }
            // TODO Get prefix working again, probably requires reworking a lot of stuff!
            saveFileAs = new File(
                    workingDir.getCanonicalPath()
                    + subdirectory
                    // + prefix
                    + File.separator
                    + saveAs
                    + ".txt");
            // Write the file
            FileOutputStream out = (new FileOutputStream(saveFileAs));
            out.write(text.getBytes());
            out.close();
        } catch (IOException e) {
            logger.error("[!] Error creating save file path for description '" + url + "':", e);
            return false;
        }
        logger.debug("Downloading " + url + "'s description to " + saveFileAs);
        if (!saveFileAs.getParentFile().exists()) {
            logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent()));
            saveFileAs.getParentFile().mkdirs();
        }
        return true;
    }
    /**
     * Queues file to be downloaded and saved. With options.
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java
@ -43,7 +43,10 @@ public class DeviantartRipper extends AbstractHTMLRipper {
    public String getDomain() {
        return "deviantart.com";
    }
-
+    @Override
    public boolean hasDescriptionSupport() {
        // Opt in to description ripping: this ripper implements both
        // getDescriptionsFromPage() and getDescription().
 		return true;
    }
    @Override
    public URL sanitizeURL(URL url) throws MalformedURLException {
        String u = url.toExternalForm();
@ -118,7 +121,6 @@ public class DeviantartRipper extends AbstractHTMLRipper {
                logger.info("Attempting to get full size image from " + thumb.attr("href"));
                fullSize = smallToFull(img.attr("src"), thumb.attr("href"));
            }
            if (fullSize == null) {
                continue;
            }
@ -131,7 +133,23 @@ public class DeviantartRipper extends AbstractHTMLRipper {
        }
        return imageURLs;
    }
    /**
     * Collects the links of all thumbnails on a gallery page so that the description
     * of each linked page can be fetched afterwards.
     * <p>
     * Thumbnails whose first {@code img} has the attribute {@code transparent="false"}
     * are skipped. A thumbnail without any {@code img} cannot carry that flag and is
     * therefore kept instead of aborting the whole page with an
     * {@link IndexOutOfBoundsException}.
     *
     * @param page the gallery page currently being ripped
     * @return the {@code href} of every accepted thumbnail, in document order;
     *         may be incomplete if the rip was stopped while iterating
     */
    @Override
    public List<String> getDescriptionsFromPage(Document page) {
        List<String> textURLs = new ArrayList<String>();
        // Iterate over all thumbnails
        for (Element thumb : page.select("div.zones-container a.thumb")) {
            if (isStopped()) {
                break;
            }
            Elements images = thumb.select("img");
            // Guard the lookup: get(0) on an empty selection would throw.
            if (!images.isEmpty() && images.get(0).attr("transparent").equals("false")) {
                continue; // a.thumbs to other albums are invisible
            }
            textURLs.add(thumb.attr("href"));
        }
        return textURLs;
    }
    @Override
    public Document getNextPage(Document page) throws IOException {
        Elements nextButtons = page.select("li.next > a");
@ -185,6 +203,37 @@ public class DeviantartRipper extends AbstractHTMLRipper {
        return result.toString();
    }
    /**
     * Fetches a single page and extracts the text of its description.
     * Comes in handy when people put entire stories in their description.
     * <p>
     * The request is sent with this ripper's URL as referrer and the stored cookies;
     * cookies from the response are merged back into the stored ones.
     *
     * @param page URL of the page the description will be retrieved from
     * @return the text of the elements matching {@code div[class=dev-description]},
     *         or null if no such element exists or the request failed
     *         (the failure is logged at info level)
     */
    @Override
    public String getDescription(String page) {
        try {
            // Fetch the page that carries the description
            Response pageResponse = Http.url(page)
                                        .referrer(this.url)
                                        .cookies(cookies)
                                        .response();
            cookies.putAll(pageResponse.cookies());
            // Locate the description container
            Elements descriptionBlocks = pageResponse.parse().select("div[class=dev-description]");
            if (descriptionBlocks.isEmpty()) {
                throw new IOException("No description found");
            }
            // TODO Figure out how to preserve newlines
            return descriptionBlocks.text();
        } catch (IOException ioe) {
            logger.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'");
            return null;
        }
    }
    /**
     * If largest resolution for image at 'thumb' is found, starts downloading
     * and returns null.
@ -202,7 +251,7 @@ public class DeviantartRipper extends AbstractHTMLRipper {
                                .response();
            cookies.putAll(resp.cookies());
-            // Try to find the "Download" box
+            // Try to find the "Download" link
            Elements els = resp.parse().select("a.dev-page-download");
            if (els.size() == 0) {
                throw new IOException("No download page found");