From 72b40394aeb452887a5591280c0bbde53365f6fd Mon Sep 17 00:00:00 2001
From: Wiiplay123
Date: Fri, 28 Nov 2014 09:50:04 -0600
Subject: [PATCH] Added deviantART description ripping

It needs some work, notably the description text file doesn't have prefix.
Also, fixed a resource leak in Utils.java
---
 .../ripme/ripper/AbstractRipper.java          |  61 ++++
 .../ripme/ripper/DeviantartRipper.java        | 321 ++++++++++++++++++
 .../java/com/rarchives/ripme/utils/Utils.java |   2 +
 3 files changed, 384 insertions(+)
 create mode 100644 src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java

diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
index e1ea4a23..71cfe86e 100644
--- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
@@ -2,6 +2,7 @@ package com.rarchives.ripme.ripper;
 
 import java.awt.Desktop;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.lang.reflect.Constructor;
 import java.net.MalformedURLException;
@@ -131,6 +132,66 @@ public abstract class AbstractRipper
         }
         return addURLToDownload(url, saveFileAs, referrer, cookies);
     }
+    /**
+     * Saves a piece of text (such as an image description) to a ".txt" file
+     * in the working directory. The file name is the last path segment of
+     * the URL, cut at the first '?', '#', '&' or ':'.
+     * @param url URL the text belongs to; determines the file name
+     * @param subdirectory Subdirectory of the working directory, or "" for none
+     * @param referrer Not used by this method
+     * @param cookies Not used by this method
+     * @param text Text to write; nothing is saved when this is null
+     * @return true if the text was written, false otherwise
+     */
+    public boolean saveText(URL url, String subdirectory, String referrer, Map<String,String> cookies, String text) {
+        try {
+            stopCheck();
+        } catch (IOException e) {
+            return false;
+        }
+        if (text == null) {
+            logger.warn("[!] No description text to save for " + url);
+            return false;
+        }
+        String saveAs = url.toExternalForm();
+        saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1);
+        if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); }
+        if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); }
+        if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); }
+        if (saveAs.indexOf(':') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf(':')); }
+        File saveFileAs;
+        try {
+            if (!subdirectory.equals("")) {
+                subdirectory = File.separator + subdirectory;
+            }
+            // TODO Get prefix working again, probably requires reworking a lot of stuff!
+            saveFileAs = new File(
+                    workingDir.getCanonicalPath()
+                    + subdirectory
+                    // + prefix
+                    + File.separator
+                    + saveAs
+                    + ".txt");
+            // The directory must exist before the output stream is opened
+            File parent = saveFileAs.getParentFile();
+            if (!parent.exists()) {
+                logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent()));
+                parent.mkdirs();
+            }
+            logger.debug("Downloading " + url + "'s description to " + saveFileAs);
+            // Write the file as UTF-8 and always release the file handle
+            FileOutputStream out = new FileOutputStream(saveFileAs);
+            try {
+                out.write(text.getBytes("UTF-8"));
+            } finally {
+                out.close();
+            }
+        } catch (IOException e) {
+            logger.error("[!] Error creating save file path for description '" + url + "':", e);
+            return false;
+        }
+        return true;
+    }
 
     /**
      * Queues file to be downloaded and saved. With options.
diff --git a/src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java b/src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java
new file mode 100644
index 00000000..15b4a064
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java
@@ -0,0 +1,321 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.Connection.Method;
+import org.jsoup.Connection.Response;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.utils.Base64;
+import com.rarchives.ripme.utils.Http;
+import com.rarchives.ripme.utils.Utils;
+
+/**
+ * Rips deviantART galleries, sub-galleries, scraps and favourites, and
+ * saves the description of each thumbnail's page as a text file.
+ */
+public class DeviantartRipper extends AbstractHTMLRipper {
+
+    private static final int SLEEP_TIME = 2000;
+
+    private Map<String,String> cookies = new HashMap<String,String>();
+    private Set<String> triedURLs = new HashSet<String>();
+
+    public DeviantartRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    @Override
+    public String getHost() {
+        return "deviantart";
+    }
+    @Override
+    public String getDomain() {
+        return "deviantart.com";
+    }
+
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        String u = url.toExternalForm();
+        String subdir = "/";
+        if (u.contains("catpath=scraps")) {
+            subdir = "scraps";
+        }
+        u = u.replaceAll("\\?.*", "?catpath=" + subdir);
+        return new URL(u);
+    }
+
+    /**
+     * Builds the album ID from the username, plus a suffix for scraps,
+     * a sub-gallery number, or favourites.
+     */
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        Pattern p = Pattern.compile("^https?://([a-zA-Z0-9\\-]+)\\.deviantart\\.com(/gallery)?/?(\\?.*)?$");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            // Root gallery
+            if (url.toExternalForm().contains("catpath=scraps")) {
+                return m.group(1) + "_scraps";
+            }
+            else {
+                return m.group(1);
+            }
+        }
+        p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.deviantart\\.com/gallery/([0-9]{1,}).*$");
+        m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            // Subgallery
+            return m.group(1) + "_" + m.group(2);
+        }
+        p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.deviantart\\.com/favou?rites/?$");
+        m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            // Favourites
+            return m.group(1) + "_faves";
+        }
+        throw new MalformedURLException("Expected URL format: http://username.deviantart.com/[/gallery/#####], got: " + url);
+    }
+
+    @Override
+    public Document getFirstPage() throws IOException {
+        // Login
+        try {
+            cookies = loginToDeviantart();
+        } catch (Exception e) {
+            logger.warn("Failed to login: ", e);
+        }
+        return Http.url(this.url)
+                   .cookies(cookies)
+                   .get();
+    }
+
+    @Override
+    public List<String> getURLsFromPage(Document page) {
+        List<String> imageURLs = new ArrayList<String>();
+
+        // Iterate over all thumbnails
+        for (Element thumb : page.select("div.zones-container a.thumb")) {
+            if (isStopped()) {
+                break;
+            }
+            Element img = thumb.select("img").get(0);
+            if (img.attr("transparent").equals("false")) {
+                continue; // a.thumbs to other albums are invisible
+            }
+
+            // Get full-sized image via helper methods
+            String fullSize = null;
+            String desc = null;
+            try {
+                fullSize = thumbToFull(img.attr("src"), true);
+            } catch (Exception e) {
+                logger.info("Attempting to get full size image from " + thumb.attr("href"));
+                fullSize = smallToFull(img.attr("src"), thumb.attr("href"));
+            }
+            try {
+                desc = smallToDescription(thumb.attr("href"));
+            } catch (Exception e) {
+                logger.info("Could not get description from " + thumb.attr("href"));
+            }
+            if (desc != null) {
+                // Nothing to write when the description could not be fetched
+                try {
+                    saveText(new URL(thumb.attr("href")), "", this.url.toExternalForm(), cookies, desc);
+                } catch (MalformedURLException e) {
+                    logger.info("Malformed URL while getting description from " + thumb.attr("href"));
+                }
+            }
+            if (fullSize == null) {
+                continue;
+            }
+            if (triedURLs.contains(fullSize)) {
+                logger.warn("Already tried to download " + fullSize);
+                continue;
+            }
+            triedURLs.add(fullSize);
+            imageURLs.add(fullSize);
+        }
+        return imageURLs;
+    }
+
+    @Override
+    public Document getNextPage(Document page) throws IOException {
+        Elements nextButtons = page.select("li.next > a");
+        if (nextButtons.size() == 0) {
+            throw new IOException("No next page found");
+        }
+        Element a = nextButtons.first();
+        if (a.hasClass("disabled")) {
+            throw new IOException("Hit end of pages");
+        }
+        String nextPage = a.attr("href");
+        if (nextPage.startsWith("/")) {
+            nextPage = "http://" + this.url.getHost() + nextPage;
+        }
+        if (!sleep(SLEEP_TIME)) {
+            throw new IOException("Interrupted while waiting to load next page: " + nextPage);
+        }
+        logger.info("Found next page: " + nextPage);
+        return Http.url(nextPage)
+                   .cookies(cookies)
+                   .get();
+    }
+
+    @Override
+    public void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index), "", this.url.toExternalForm(), cookies);
+    }
+
+    /**
+     * Tries to get full size image from thumbnail URL
+     * @param thumb Thumbnail URL
+     * @param throwException Whether or not to throw exception when full size image isn't found
+     * @return Full-size image URL
+     * @throws Exception If it can't find the full-size URL
+     */
+    public static String thumbToFull(String thumb, boolean throwException) throws Exception {
+        thumb = thumb.replace("http://th", "http://fc");
+        List<String> fields = new ArrayList<String>(Arrays.asList(thumb.split("/")));
+        fields.remove(4);
+        if (!fields.get(4).equals("f") && throwException) {
+            // Not a full-size image
+            throw new Exception("Can't get full size image from " + thumb);
+        }
+        StringBuilder result = new StringBuilder();
+        for (int i = 0; i < fields.size(); i++) {
+            if (i > 0) {
+                result.append("/");
+            }
+            result.append(fields.get(i));
+        }
+        return result.toString();
+    }
+
+    /**
+     * Attempts to download description for image.
+     * Comes in handy when people put entire stories in their description.
+     * If no description was found, returns null.
+     * @param page The page the description will be retrieved from
+     * @return The description
+     */
+    public String smallToDescription(String page) {
+        try {
+            // Fetch the image page
+            Response resp = Http.url(page)
+                                .referrer(this.url)
+                                .cookies(cookies)
+                                .response();
+            cookies.putAll(resp.cookies());
+
+            // Try to find the description
+            Elements els = resp.parse().select("div[class=dev-description]");
+            if (els.size() == 0) {
+                throw new IOException("No description found");
+            }
+            // Description text
+            String desc = els.text(); // TODO Figure out how to preserve newlines
+            return desc;
+        } catch (IOException ioe) {
+            logger.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'");
+            return null;
+        }
+    }
+
+    /**
+     * Looks up the download link on the image page and returns it.
+     * If the page can't be fetched or has no download link, falls back to the
+     * URL derived from the thumbnail; returns null if that fails as well.
+     * @param thumb Thumbnail URL
+     * @param page Page the thumbnail is retrieved from
+     * @return Highest-resolution version of the image based on thumbnail URL and the page.
+     */
+    public String smallToFull(String thumb, String page) {
+        try {
+            // Fetch the image page
+            Response resp = Http.url(page)
+                                .referrer(this.url)
+                                .cookies(cookies)
+                                .response();
+            cookies.putAll(resp.cookies());
+
+            // Try to find the "Download" box
+            Elements els = resp.parse().select("a.dev-page-download");
+            if (els.size() == 0) {
+                throw new IOException("No download page found");
+            }
+            // Full-size image
+            String fsimage = els.get(0).attr("href");
+            return fsimage;
+        } catch (IOException ioe) {
+            try {
+                logger.info("Failed to get full size download image at " + page + " : '" + ioe.getMessage() + "'");
+                String lessThanFull = thumbToFull(thumb, false);
+                logger.info("Falling back to less-than-full-size image " + lessThanFull);
+                return lessThanFull;
+            } catch (Exception e) {
+                return null;
+            }
+        }
+    }
+
+    /**
+     * Logs into deviant art. Required to rip full-size NSFW content.
+     * @return Map of cookies containing session data.
+     */
+    private Map<String,String> loginToDeviantart() throws IOException {
+        // Populate postData fields
+        Map<String,String> postData = new HashMap<String,String>();
+        // NOTE(review): second argument is presumably the default used when the key is unset - confirm
+        String username = Utils.getConfigString("deviantart.username", new String(Base64.decode("Z3JhYnB5")));
+        String password = Utils.getConfigString("deviantart.password", new String(Base64.decode("ZmFrZXJz")));
+        if (username == null || password == null) {
+            throw new IOException("could not find username or password in config");
+        }
+        Response resp = Http.url("http://www.deviantart.com/")
+                            .response();
+        for (Element input : resp.parse().select("form#form-login input[type=hidden]")) {
+            postData.put(input.attr("name"), input.attr("value"));
+        }
+        postData.put("username", username);
+        postData.put("password", password);
+        postData.put("remember_me", "1");
+
+        // Send login request
+        resp = Http.url("https://www.deviantart.com/users/login")
+                    .userAgent(USER_AGENT)
+                    .data(postData)
+                    .cookies(resp.cookies())
+                    .method(Method.POST)
+                    .response();
+
+        // Assert we are logged in
+        if (resp.hasHeader("Location") && resp.header("Location").contains("password")) {
+            // Wrong password
+            throw new IOException("Wrong password");
+        }
+        if (resp.url().toExternalForm().contains("bad_form")) {
+            throw new IOException("Login form was incorrectly submitted");
+        }
+        if (resp.cookie("auth_secure") == null ||
+            resp.cookie("auth") == null) {
+            throw new IOException("No auth_secure or auth cookies received");
+        }
+        // We are logged in, save the cookies
+        return resp.cookies();
+    }
+}
diff --git a/src/main/java/com/rarchives/ripme/utils/Utils.java b/src/main/java/com/rarchives/ripme/utils/Utils.java
index 99a4a667..c51de33e 100644
--- a/src/main/java/com/rarchives/ripme/utils/Utils.java
+++ b/src/main/java/com/rarchives/ripme/utils/Utils.java
@@ -231,10 +231,12 @@ public class Utils {
                             classes.add(Class.forName(className));
                         } catch (ClassNotFoundException e) {
                             logger.error("ClassNotFoundException loading " + className);
+                            jarFile.close(); // Resource leak fix?
                             throw new RuntimeException("ClassNotFoundException loading " + className);
                         }
                     }
                 }
+                jarFile.close(); // Eclipse said not closing it would have a resource leak
             } catch (IOException e) {
                 logger.error("Error while loading jar file:", e);
                 throw new RuntimeException(pkgname + " (" + directory + ") does not appear to be a valid package", e);