Added FurAffinity Description Ripping

Also added a description sleep-time value to AbstractHTMLRipper to avoid read timeouts.
2015-05-29 13:26:48 -05:00 · 2015-05-29 13:26:48 -05:00 · e1fd37993d
commit e1fd37993d
parent 4871142252
2 changed files with 109 additions and 3 deletions
--- a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java
@ -56,6 +56,9 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
    public String getDescription(String page) throws IOException {
    	throw new IOException("getDescription not implemented"); // Do I do this or make an abstract function?
    }
+    public int descSleepTime() {
+        return 0;
+    }
    @Override
    public void rip() throws IOException {
        int index = 0;
@ -89,12 +92,14 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
                logger.debug("Fetching description(s) from " + doc.location());
            	List<String> textURLs = getDescriptionsFromPage(doc);
            	if (textURLs.size() > 0) {
+                    logger.debug("Found description link(s) from " + doc.location());
            		for (String textURL : textURLs) {
            			if (isStopped()) {
            				break;
            			}
            			textindex += 1;
-            			logger.debug("Getting decription from " + textURL);
+            			logger.debug("Getting description from " + textURL);
+                        sleep(descSleepTime());
            			String tempDesc = getDescription(textURL);
            			if (tempDesc != null) {
            			    logger.debug("Got description: " + tempDesc);
@ -125,6 +130,7 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
        waitForThreads();
    }
    public boolean saveText(URL url, String subdirectory, String text, int index) {
+        // Not the best for some cases, like FurAffinity. Overridden there.
        try {
            stopCheck();
        } catch (IOException e) {
@ -141,7 +147,7 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
            if (!subdirectory.equals("")) { // Not sure about this part
                subdirectory = File.separator + subdirectory;
            }
-            // TODO Get prefix working again, probably requires reworking a lot of stuff!
+            // TODO Get prefix working again, probably requires reworking a lot of stuff! (Might be fixed now)
            saveFileAs = new File(
                    workingDir.getCanonicalPath()
                    + subdirectory
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/FuraffinityRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/FuraffinityRipper.java
@ -1,5 +1,7 @@
 package com.rarchives.ripme.ripper.rippers;

+import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
@ -10,10 +12,13 @@ import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

+import com.rarchives.ripme.utils.Utils;
 import org.jsoup.Connection.Method;
 import org.jsoup.Connection.Response;
+import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
+import org.jsoup.safety.Whitelist;
 import org.jsoup.select.Elements;

 import com.rarchives.ripme.ripper.AbstractHTMLRipper;
@ -48,7 +53,10 @@ public class FuraffinityRipper extends AbstractHTMLRipper {
    public String getHost() {
        return "furaffinity";
    }
-
+    @Override
+    public boolean hasDescriptionSupport() {
+        return true;
+    }
    @Override
    public Document getFirstPage() throws IOException {
        if (cookies == null || cookies.size() == 0) {
@ -110,7 +118,99 @@ public class FuraffinityRipper extends AbstractHTMLRipper {
        }
        return urls;
    }
+    @Override
+    public List<String> getDescriptionsFromPage(Document page) {
+        List<String> urls = new ArrayList<String>();
+        Elements urlElements = page.select("b[id^=sid_]");
+        for (Element e : urlElements) {
+            urls.add(urlBase + e.select("a").first().attr("href"));
+            logger.debug("Desc2 " + urlBase + e.select("a").first().attr("href"));
+        }
+        return urls;
+    }
+    @Override
+    public int descSleepTime() {
+        return 400;
+    }
+    public String getDescription(String page) {
+        try {
+            // Fetch the image page
+            Response resp = Http.url(page)
+                    .referrer(this.url)
+                    .cookies(cookies)
+                    .response();
+            cookies.putAll(resp.cookies());

+            // Try to find the description
+            Elements els = resp.parse().select("td[class=alt1][width=\"70%\"]");
+            if (els.size() == 0) {
+                logger.debug("No description at " + page);
+                throw new IOException("No description found");
+            }
+            logger.debug("Description found!");
+            Document documentz = resp.parse();
+            Element ele = documentz.select("td[class=alt1][width=\"70%\"]").get(0); // This is where the description is.
+            // Would break completely if FurAffinity changed site layout.
+            documentz.outputSettings(new Document.OutputSettings().prettyPrint(false));
+            ele.select("br").append("\\n");
+            ele.select("p").prepend("\\n\\n");
+            logger.debug("Returning description at " + page);
+            String tempPage = Jsoup.clean(ele.html().replaceAll("\\\\n", System.getProperty("line.separator")), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
+            Element title = documentz.select("td[class=\"cat\"][valign=\"top\"] > b").get(0);
+            String tempText = title.text();
+            return tempText + "\n" + tempPage; // Overridden saveText takes first line and makes it the file name.
+        } catch (IOException ioe) {
+            logger.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'");
+            return null;
+        }
+    }
+    @Override
+    public boolean saveText(URL url, String subdirectory, String text, int index) {
+       //TODO Make this better please?
+       try {
+            stopCheck();
+        } catch (IOException e) {
+            return false;
+        }
+        String newText = "";
+        String saveAs = "";
+        File saveFileAs;
+        saveAs = text.split("\n")[0];
+        for (int i = 1;i < text.split("\n").length; i++) {
+             newText = newText.replace("\\","").replace("/","").replace("~","") + "\n" + text.split("\n")[i];
+        }
+        try {
+            if (!subdirectory.equals("")) {
+                subdirectory = File.separator + subdirectory;
+            }
+            int o = url.toString().lastIndexOf('/')-1;
+            String test = url.toString().substring(url.toString().lastIndexOf('/',o)+1);
+            test = test.replace("/",""); // This is probably not the best way to do this.
+            test = test.replace("\\",""); // CLOSE ENOUGH!
+            saveFileAs = new File(
+                    workingDir.getCanonicalPath()
+                            + subdirectory
+                            + File.separator
+                            + getPrefix(index)
+                            + saveAs
+                            + " "
+                            + test
+                            + ".txt");
+            // Write the file
+            FileOutputStream out = (new FileOutputStream(saveFileAs));
+            out.write(text.getBytes());
+            out.close();
+        } catch (IOException e) {
+            logger.error("[!] Error creating save file path for description '" + url + "':", e);
+            return false;
+        }
+        logger.debug("Downloading " + url + "'s description to " + saveFileAs);
+        if (!saveFileAs.getParentFile().exists()) {
+            logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent()));
+            saveFileAs.getParentFile().mkdirs();
+        }
+        return true;
+    }
    @Override
    public void downloadURL(URL url, int index) {
        furaffinityThreadPool.addThread(new FuraffinityDocumentThread(url));