From 72b40394aeb452887a5591280c0bbde53365f6fd Mon Sep 17 00:00:00 2001 From: Wiiplay123 Date: Fri, 28 Nov 2014 09:50:04 -0600 Subject: [PATCH 1/3] Added deviantART description ripping It needs some work, notably the description text file doesn't have prefix. Also, fixed a resource leak in Utils.java --- .../ripme/ripper/AbstractRipper.java | 41 +++ .../ripme/ripper/DeviantartRipper.java | 309 ++++++++++++++++++ .../java/com/rarchives/ripme/utils/Utils.java | 2 + 3 files changed, 352 insertions(+) create mode 100644 src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java index e1ea4a23..71cfe86e 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java @@ -2,6 +2,7 @@ package com.rarchives.ripme.ripper; import java.awt.Desktop; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; import java.lang.reflect.Constructor; import java.net.MalformedURLException; @@ -131,6 +132,46 @@ public abstract class AbstractRipper } return addURLToDownload(url, saveFileAs, referrer, cookies); } + public boolean saveText(URL url, String subdirectory, String referrer, Map cookies, String text) { + try { + stopCheck(); + } catch (IOException e) { + return false; + } + String saveAs = url.toExternalForm(); + saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1); + if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); } + if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); } + if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); } + if (saveAs.indexOf(':') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf(':')); } + File saveFileAs; + try { + if (!subdirectory.equals("")) { + subdirectory = File.separator + subdirectory; + } + // TODO Get prefix working again, probably requires reworking a lot of stuff! + saveFileAs = new File( + workingDir.getCanonicalPath() + + subdirectory + // + prefix + + File.separator + + saveAs + + ".txt"); + // Write the file + FileOutputStream out = (new FileOutputStream(saveFileAs)); + out.write(text.getBytes()); + out.close(); + } catch (IOException e) { + logger.error("[!] Error creating save file path for description '" + url + "':", e); + return false; + } + logger.debug("Downloading " + url + "'s description to " + saveFileAs); + if (!saveFileAs.getParentFile().exists()) { + logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent())); + saveFileAs.getParentFile().mkdirs(); + } + return true; + } /** * Queues file to be downloaded and saved. With options. diff --git a/src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java b/src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java new file mode 100644 index 00000000..15b4a064 --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java @@ -0,0 +1,309 @@ +package com.rarchives.ripme.ripper.rippers; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.jsoup.Connection.Method; +import org.jsoup.Connection.Response; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.utils.Base64; +import com.rarchives.ripme.utils.Http; +import com.rarchives.ripme.utils.Utils; + +public class DeviantartRipper extends AbstractHTMLRipper { + + private static final int SLEEP_TIME = 2000; + + private Map cookies = new HashMap(); + private Set triedURLs = new HashSet(); + + public DeviantartRipper(URL url) throws IOException { + super(url); + } + + @Override + public String getHost() { + return "deviantart"; + } + @Override + public String getDomain() { + return "deviantart.com"; + } + + @Override + public URL sanitizeURL(URL url) throws MalformedURLException { + String u = url.toExternalForm(); + String subdir = "/"; + if (u.contains("catpath=scraps")) { + subdir = "scraps"; + } + u = u.replaceAll("\\?.*", "?catpath=" + subdir); + return new URL(u); + } + + @Override + public String getGID(URL url) throws MalformedURLException { + Pattern p = Pattern.compile("^https?://([a-zA-Z0-9\\-]+)\\.deviantart\\.com(/gallery)?/?(\\?.*)?$"); + Matcher m = p.matcher(url.toExternalForm()); + if (m.matches()) { + // Root gallery + if (url.toExternalForm().contains("catpath=scraps")) { + return m.group(1) + "_scraps"; + } + else { + return m.group(1); + } + } + p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.deviantart\\.com/gallery/([0-9]{1,}).*$"); + m = p.matcher(url.toExternalForm()); + if (m.matches()) { + // Subgallery + return m.group(1) + "_" + m.group(2); + } + p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.deviantart\\.com/favou?rites/?$"); + m = p.matcher(url.toExternalForm()); + if (m.matches()) { + // Subgallery + return m.group(1) + "_faves"; + } + throw new MalformedURLException("Expected URL format: http://username.deviantart.com/[/gallery/#####], got: " + url); + } + + @Override + public Document getFirstPage() throws IOException { + // Login + try { + cookies = loginToDeviantart(); + } catch (Exception e) { + logger.warn("Failed to login: ", e); + } + return Http.url(this.url) + .cookies(cookies) + .get(); + } + + @Override + public List getURLsFromPage(Document page) { + List imageURLs = new ArrayList(); + + // Iterate over all thumbnails + for (Element thumb : page.select("div.zones-container a.thumb")) { + if (isStopped()) { + break; + } + Element img = thumb.select("img").get(0); + if (img.attr("transparent").equals("false")) { + continue; // a.thumbs to other albums are invisible + } + + // Get full-sized image via helper methods + String fullSize = null; + String desc = null; + try { + fullSize = thumbToFull(img.attr("src"), true); + } catch (Exception e) { + logger.info("Attempting to get full size image from " + thumb.attr("href")); + fullSize = smallToFull(img.attr("src"), thumb.attr("href")); + } + try { + desc = smallToDescription(thumb.attr("href")); + } catch (Exception e) { + logger.info("Could not get description from " + thumb.attr("href")); + } + try { + saveText(new URL(thumb.attr("href")), "", this.url.toExternalForm(), cookies, desc); + } catch (MalformedURLException e) { + logger.info("Malformed URL while getting description from " + thumb.attr("href")); + } + if (fullSize == null) { + continue; + } + if (triedURLs.contains(fullSize)) { + logger.warn("Already tried to download " + fullSize); + continue; + } + triedURLs.add(fullSize); + imageURLs.add(fullSize); + } + return imageURLs; + } + + @Override + public Document getNextPage(Document page) throws IOException { + Elements nextButtons = page.select("li.next > a"); + if (nextButtons.size() == 0) { + throw new IOException("No next page found"); + } + Element a = nextButtons.first(); + if (a.hasClass("disabled")) { + throw new IOException("Hit end of pages"); + } + String nextPage = a.attr("href"); + if (nextPage.startsWith("/")) { + nextPage = "http://" + this.url.getHost() + nextPage; + } + if (!sleep(SLEEP_TIME)) { + throw new IOException("Interrupted while waiting to load next page: " + nextPage); + } + logger.info("Found next page: " + nextPage); + return Http.url(nextPage) + .cookies(cookies) + .get(); + } + + @Override + public void downloadURL(URL url, int index) { + addURLToDownload(url, getPrefix(index), "", this.url.toExternalForm(), cookies); + } + + /** + * Tries to get full size image from thumbnail URL + * @param thumb Thumbnail URL + * @param throwException Whether or not to throw exception when full size image isn't found + * @return Full-size image URL + * @throws Exception If it can't find the full-size URL + */ + public static String thumbToFull(String thumb, boolean throwException) throws Exception { + thumb = thumb.replace("http://th", "http://fc"); + List fields = new ArrayList(Arrays.asList(thumb.split("/"))); + fields.remove(4); + if (!fields.get(4).equals("f") && throwException) { + // Not a full-size image + throw new Exception("Can't get full size image from " + thumb); + } + StringBuilder result = new StringBuilder(); + for (int i = 0; i < fields.size(); i++) { + if (i > 0) { + result.append("/"); + } + result.append(fields.get(i)); + } + return result.toString(); + } + + /** + * Attempts to download description for image. + * Comes in handy when people put entire stories in their description. + * If no description was found, returns null. + * @param page The page the description will be retrieved from + * @return The description + */ + public String smallToDescription(String page) { + try { + // Fetch the image page + Response resp = Http.url(page) + .referrer(this.url) + .cookies(cookies) + .response(); + cookies.putAll(resp.cookies()); + + // Try to find the "Download" box + Elements els = resp.parse().select("div[class=dev-description]"); + if (els.size() == 0) { + throw new IOException("No description found"); + } + // Full-size image + String desc = els.text(); // TODO Figure out how to preserve newlines + return desc; + } catch (IOException ioe) { + logger.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'"); + return null; + } + } + + /** + * If largest resolution for image at 'thumb' is found, starts downloading + * and returns null. + * If it finds a larger resolution on another page, returns the image URL. + * @param thumb Thumbnail URL + * @param page Page the thumbnail is retrieved from + * @return Highest-resolution version of the image based on thumbnail URL and the page. + */ + public String smallToFull(String thumb, String page) { + try { + // Fetch the image page + Response resp = Http.url(page) + .referrer(this.url) + .cookies(cookies) + .response(); + cookies.putAll(resp.cookies()); + + // Try to find the description + Elements els = resp.parse().select("a.dev-page-download"); + if (els.size() == 0) { + throw new IOException("No download page found"); + } + // Full-size image + String fsimage = els.get(0).attr("href"); + return fsimage; + } catch (IOException ioe) { + try { + logger.info("Failed to get full size download image at " + page + " : '" + ioe.getMessage() + "'"); + String lessThanFull = thumbToFull(thumb, false); + logger.info("Falling back to less-than-full-size image " + lessThanFull); + return lessThanFull; + } catch (Exception e) { + return null; + } + } + } + + /** + * Logs into deviant art. Required to rip full-size NSFW content. + * @return Map of cookies containing session data. + */ + private Map loginToDeviantart() throws IOException { + // Populate postData fields + Map postData = new HashMap(); + String username = Utils.getConfigString("deviantart.username", new String(Base64.decode("Z3JhYnB5"))); + String password = Utils.getConfigString("deviantart.password", new String(Base64.decode("ZmFrZXJz"))); + if (username == null || password == null) { + throw new IOException("could not find username or password in config"); + } + Response resp = Http.url("http://www.deviantart.com/") + .response(); + for (Element input : resp.parse().select("form#form-login input[type=hidden]")) { + postData.put(input.attr("name"), input.attr("value")); + } + postData.put("username", username); + postData.put("password", password); + postData.put("remember_me", "1"); + + // Send login request + resp = Http.url("https://www.deviantart.com/users/login") + .userAgent(USER_AGENT) + .data(postData) + .cookies(resp.cookies()) + .method(Method.POST) + .response(); + + // Assert we are logged in + if (resp.hasHeader("Location") && resp.header("Location").contains("password")) { + // Wrong password + throw new IOException("Wrong password"); + } + if (resp.url().toExternalForm().contains("bad_form")) { + throw new IOException("Login form was incorrectly submitted"); + } + if (resp.cookie("auth_secure") == null || + resp.cookie("auth") == null) { + throw new IOException("No auth_secure or auth cookies received"); + } + // We are logged in, save the cookies + return resp.cookies(); + } +} diff --git a/src/main/java/com/rarchives/ripme/utils/Utils.java b/src/main/java/com/rarchives/ripme/utils/Utils.java index 99a4a667..c51de33e 100644 --- a/src/main/java/com/rarchives/ripme/utils/Utils.java +++ b/src/main/java/com/rarchives/ripme/utils/Utils.java @@ -231,10 +231,12 @@ public class Utils { classes.add(Class.forName(className)); } catch (ClassNotFoundException e) { logger.error("ClassNotFoundException loading " + className); + jarFile.close(); // Resource leak fix? throw new RuntimeException("ClassNotFoundException loading " + className); } } } + jarFile.close(); // Eclipse said not closing it would have a resource leak } catch (IOException e) { logger.error("Error while loading jar file:", e); throw new RuntimeException(pkgname + " (" + directory + ") does not appear to be a valid package", e); From 16e0d27f66e714f598b86e39576004ed41a6c022 Mon Sep 17 00:00:00 2001 From: Wiiplay123 Date: Fri, 28 Nov 2014 22:59:39 -0600 Subject: [PATCH 2/3] Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. --- .../ripme/ripper/AbstractHTMLRipper.java | 75 +++++++++++++++++-- .../ripme/ripper/AbstractRipper.java | 42 +---------- .../ripper/rippers/DeviantartRipper.java | 57 +++++++++++++- 3 files changed, 124 insertions(+), 50 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java index be73c717..6953704c 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java @@ -1,5 +1,7 @@ package com.rarchives.ripme.ripper; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; @@ -14,7 +16,7 @@ import com.rarchives.ripme.utils.Utils; * Simplified ripper, designed for ripping from sites by parsing HTML. */ public abstract class AbstractHTMLRipper extends AlbumRipper { - + public AbstractHTMLRipper(URL url) throws IOException { super(url); } @@ -27,6 +29,9 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { throw new IOException("getNextPage not implemented"); } public abstract List getURLsFromPage(Document page); + public List getDescriptionsFromPage(Document doc) throws IOException { + throw new IOException("getDescriptionsFromPage not implemented"); // Do I do this or make an abstract function? + } public abstract void downloadURL(URL url, int index); public DownloadThreadPool getThreadPool() { return null; @@ -45,21 +50,27 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { public URL sanitizeURL(URL url) throws MalformedURLException { return url; } - + public boolean hasDescriptionSupport() { + return false; + } + public String getDescription(String page) throws IOException { + throw new IOException("getDescription not implemented"); // Do I do this or make an abstract function? + } @Override public void rip() throws IOException { int index = 0; + int textindex = 0; logger.info("Retrieving " + this.url); sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm()); Document doc = getFirstPage(); - + while (doc != null) { List imageURLs = getURLsFromPage(doc); if (imageURLs.size() == 0) { throw new IOException("No images found at " + doc.location()); } - + for (String imageURL : imageURLs) { if (isStopped()) { break; @@ -67,6 +78,21 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { index += 1; downloadURL(new URL(imageURL), index); } + if (hasDescriptionSupport()) { + List textURLs = getDescriptionsFromPage(doc); + if (textURLs.size() > 0) { + for (String textURL : textURLs) { + if (isStopped()) { + break; + } + textindex += 1; + String tempDesc = getDescription(textURL); + if (tempDesc != null) { + saveText(new URL(textURL), "", tempDesc, textindex); + } + } + } + } if (isStopped()) { break; @@ -87,7 +113,46 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { } waitForThreads(); } - + public boolean saveText(URL url, String subdirectory, String text, int index) { + try { + stopCheck(); + } catch (IOException e) { + return false; + } + String saveAs = url.toExternalForm(); + saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1); + if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); } + if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); } + if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); } + if (saveAs.indexOf(':') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf(':')); } + File saveFileAs; + try { + if (!subdirectory.equals("")) { // Not sure about this part + subdirectory = File.separator + subdirectory; + } + // TODO Get prefix working again, probably requires reworking a lot of stuff! + saveFileAs = new File( + workingDir.getCanonicalPath() + + subdirectory + + File.separator + + getPrefix(index) + + saveAs + + ".txt"); + // Write the file + FileOutputStream out = (new FileOutputStream(saveFileAs)); + out.write(text.getBytes()); + out.close(); + } catch (IOException e) { + logger.error("[!] Error creating save file path for description '" + url + "':", e); + return false; + } + logger.debug("Downloading " + url + "'s description to " + saveFileAs); + if (!saveFileAs.getParentFile().exists()) { + logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent())); + saveFileAs.getParentFile().mkdirs(); + } + return true; + } public String getPrefix(int index) { String prefix = ""; if (keepSortOrder() && Utils.getConfigBoolean("download.save_order", true)) { diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java index 71cfe86e..33b5fd45 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java @@ -2,7 +2,6 @@ package com.rarchives.ripme.ripper; import java.awt.Desktop; import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.lang.reflect.Constructor; import java.net.MalformedURLException; @@ -132,46 +131,7 @@ public abstract class AbstractRipper } return addURLToDownload(url, saveFileAs, referrer, cookies); } - public boolean saveText(URL url, String subdirectory, String referrer, Map cookies, String text) { - try { - stopCheck(); - } catch (IOException e) { - return false; - } - String saveAs = url.toExternalForm(); - saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1); - if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); } - if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); } - if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); } - if (saveAs.indexOf(':') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf(':')); } - File saveFileAs; - try { - if (!subdirectory.equals("")) { - subdirectory = File.separator + subdirectory; - } - // TODO Get prefix working again, probably requires reworking a lot of stuff! - saveFileAs = new File( - workingDir.getCanonicalPath() - + subdirectory - // + prefix - + File.separator - + saveAs - + ".txt"); - // Write the file - FileOutputStream out = (new FileOutputStream(saveFileAs)); - out.write(text.getBytes()); - out.close(); - } catch (IOException e) { - logger.error("[!] Error creating save file path for description '" + url + "':", e); - return false; - } - logger.debug("Downloading " + url + "'s description to " + saveFileAs); - if (!saveFileAs.getParentFile().exists()) { - logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent())); - saveFileAs.getParentFile().mkdirs(); - } - return true; - } + /** * Queues file to be downloaded and saved. With options. diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java index 2c88c690..4e9c51c8 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java @@ -43,7 +43,10 @@ public class DeviantartRipper extends AbstractHTMLRipper { public String getDomain() { return "deviantart.com"; } - + @Override + public boolean hasDescriptionSupport() { + return true; + } @Override public URL sanitizeURL(URL url) throws MalformedURLException { String u = url.toExternalForm(); @@ -118,7 +121,6 @@ public class DeviantartRipper extends AbstractHTMLRipper { logger.info("Attempting to get full size image from " + thumb.attr("href")); fullSize = smallToFull(img.attr("src"), thumb.attr("href")); } - if (fullSize == null) { continue; } @@ -131,7 +133,23 @@ public class DeviantartRipper extends AbstractHTMLRipper { } return imageURLs; } - + @Override + public List getDescriptionsFromPage(Document page) { + List textURLs = new ArrayList(); + + // Iterate over all thumbnails + for (Element thumb : page.select("div.zones-container a.thumb")) { + if (isStopped()) { + break; + } + Element img = thumb.select("img").get(0); + if (img.attr("transparent").equals("false")) { + continue; // a.thumbs to other albums are invisible + } + textURLs.add(thumb.attr("href")); + } + return textURLs; + } @Override public Document getNextPage(Document page) throws IOException { Elements nextButtons = page.select("li.next > a"); @@ -184,7 +202,38 @@ public class DeviantartRipper extends AbstractHTMLRipper { } return result.toString(); } + + /** + * Attempts to download description for image. + * Comes in handy when people put entire stories in their description. + * If no description was found, returns null. + * @param page The page the description will be retrieved from + * @return The description + */ + @Override + public String getDescription(String page) { + try { + // Fetch the image page + Response resp = Http.url(page) + .referrer(this.url) + .cookies(cookies) + .response(); + cookies.putAll(resp.cookies()); + // Try to find the "Download" box + Elements els = resp.parse().select("div[class=dev-description]"); + if (els.size() == 0) { + throw new IOException("No description found"); + } + // Full-size image + String desc = els.text(); // TODO Figure out how to preserve newlines + return desc; + } catch (IOException ioe) { + logger.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'"); + return null; + } + } + /** * If largest resolution for image at 'thumb' is found, starts downloading * and returns null. @@ -202,7 +251,7 @@ public class DeviantartRipper extends AbstractHTMLRipper { .response(); cookies.putAll(resp.cookies()); - // Try to find the "Download" box + // Try to find the description Elements els = resp.parse().select("a.dev-page-download"); if (els.size() == 0) { throw new IOException("No download page found"); From be89649466d4e77f44134b9ef6b4d5e07f68dd45 Mon Sep 17 00:00:00 2001 From: Wiiplay123 Date: Sat, 29 Nov 2014 23:14:57 -0600 Subject: [PATCH 3/3] Added line breaks to deviantART ripper Finally added the support for newlines that I've been going on about in the other commits. Also got rid of a comment that shouldn't have been there, as well as fixing the fact that I just broke everything two commits back and didn't notice until just now. Sorry about that! --- .../ripme/ripper/DeviantartRipper.java | 309 ------------------ .../ripper/rippers/DeviantartRipper.java | 16 +- 2 files changed, 11 insertions(+), 314 deletions(-) delete mode 100644 src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java diff --git a/src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java b/src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java deleted file mode 100644 index 15b4a064..00000000 --- a/src/main/java/com/rarchives/ripme/ripper/DeviantartRipper.java +++ /dev/null @@ -1,309 +0,0 @@ -package com.rarchives.ripme.ripper.rippers; - -import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.jsoup.Connection.Method; -import org.jsoup.Connection.Response; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -import com.rarchives.ripme.ripper.AbstractHTMLRipper; -import com.rarchives.ripme.utils.Base64; -import com.rarchives.ripme.utils.Http; -import com.rarchives.ripme.utils.Utils; - -public class DeviantartRipper extends AbstractHTMLRipper { - - private static final int SLEEP_TIME = 2000; - - private Map cookies = new HashMap(); - private Set triedURLs = new HashSet(); - - public DeviantartRipper(URL url) throws IOException { - super(url); - } - - @Override - public String getHost() { - return "deviantart"; - } - @Override - public String getDomain() { - return "deviantart.com"; - } - - @Override - public URL sanitizeURL(URL url) throws MalformedURLException { - String u = url.toExternalForm(); - String subdir = "/"; - if (u.contains("catpath=scraps")) { - subdir = "scraps"; - } - u = u.replaceAll("\\?.*", "?catpath=" + subdir); - return new URL(u); - } - - @Override - public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^https?://([a-zA-Z0-9\\-]+)\\.deviantart\\.com(/gallery)?/?(\\?.*)?$"); - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - // Root gallery - if (url.toExternalForm().contains("catpath=scraps")) { - return m.group(1) + "_scraps"; - } - else { - return m.group(1); - } - } - p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.deviantart\\.com/gallery/([0-9]{1,}).*$"); - m = p.matcher(url.toExternalForm()); - if (m.matches()) { - // Subgallery - return m.group(1) + "_" + m.group(2); - } - p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.deviantart\\.com/favou?rites/?$"); - m = p.matcher(url.toExternalForm()); - if (m.matches()) { - // Subgallery - return m.group(1) + "_faves"; - } - throw new MalformedURLException("Expected URL format: http://username.deviantart.com/[/gallery/#####], got: " + url); - } - - @Override - public Document getFirstPage() throws IOException { - // Login - try { - cookies = loginToDeviantart(); - } catch (Exception e) { - logger.warn("Failed to login: ", e); - } - return Http.url(this.url) - .cookies(cookies) - .get(); - } - - @Override - public List getURLsFromPage(Document page) { - List imageURLs = new ArrayList(); - - // Iterate over all thumbnails - for (Element thumb : page.select("div.zones-container a.thumb")) { - if (isStopped()) { - break; - } - Element img = thumb.select("img").get(0); - if (img.attr("transparent").equals("false")) { - continue; // a.thumbs to other albums are invisible - } - - // Get full-sized image via helper methods - String fullSize = null; - String desc = null; - try { - fullSize = thumbToFull(img.attr("src"), true); - } catch (Exception e) { - logger.info("Attempting to get full size image from " + thumb.attr("href")); - fullSize = smallToFull(img.attr("src"), thumb.attr("href")); - } - try { - desc = smallToDescription(thumb.attr("href")); - } catch (Exception e) { - logger.info("Could not get description from " + thumb.attr("href")); - } - try { - saveText(new URL(thumb.attr("href")), "", this.url.toExternalForm(), cookies, desc); - } catch (MalformedURLException e) { - logger.info("Malformed URL while getting description from " + thumb.attr("href")); - } - if (fullSize == null) { - continue; - } - if (triedURLs.contains(fullSize)) { - logger.warn("Already tried to download " + fullSize); - continue; - } - triedURLs.add(fullSize); - imageURLs.add(fullSize); - } - return imageURLs; - } - - @Override - public Document getNextPage(Document page) throws IOException { - Elements nextButtons = page.select("li.next > a"); - if (nextButtons.size() == 0) { - throw new IOException("No next page found"); - } - Element a = nextButtons.first(); - if (a.hasClass("disabled")) { - throw new IOException("Hit end of pages"); - } - String nextPage = a.attr("href"); - if (nextPage.startsWith("/")) { - nextPage = "http://" + this.url.getHost() + nextPage; - } - if (!sleep(SLEEP_TIME)) { - throw new IOException("Interrupted while waiting to load next page: " + nextPage); - } - logger.info("Found next page: " + nextPage); - return Http.url(nextPage) - .cookies(cookies) - .get(); - } - - @Override - public void downloadURL(URL url, int index) { - addURLToDownload(url, getPrefix(index), "", this.url.toExternalForm(), cookies); - } - - /** - * Tries to get full size image from thumbnail URL - * @param thumb Thumbnail URL - * @param throwException Whether or not to throw exception when full size image isn't found - * @return Full-size image URL - * @throws Exception If it can't find the full-size URL - */ - public static String thumbToFull(String thumb, boolean throwException) throws Exception { - thumb = thumb.replace("http://th", "http://fc"); - List fields = new ArrayList(Arrays.asList(thumb.split("/"))); - fields.remove(4); - if (!fields.get(4).equals("f") && throwException) { - // Not a full-size image - throw new Exception("Can't get full size image from " + thumb); - } - StringBuilder result = new StringBuilder(); - for (int i = 0; i < fields.size(); i++) { - if (i > 0) { - result.append("/"); - } - result.append(fields.get(i)); - } - return result.toString(); - } - - /** - * Attempts to download description for image. - * Comes in handy when people put entire stories in their description. - * If no description was found, returns null. - * @param page The page the description will be retrieved from - * @return The description - */ - public String smallToDescription(String page) { - try { - // Fetch the image page - Response resp = Http.url(page) - .referrer(this.url) - .cookies(cookies) - .response(); - cookies.putAll(resp.cookies()); - - // Try to find the "Download" box - Elements els = resp.parse().select("div[class=dev-description]"); - if (els.size() == 0) { - throw new IOException("No description found"); - } - // Full-size image - String desc = els.text(); // TODO Figure out how to preserve newlines - return desc; - } catch (IOException ioe) { - logger.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'"); - return null; - } - } - - /** - * If largest resolution for image at 'thumb' is found, starts downloading - * and returns null. - * If it finds a larger resolution on another page, returns the image URL. - * @param thumb Thumbnail URL - * @param page Page the thumbnail is retrieved from - * @return Highest-resolution version of the image based on thumbnail URL and the page. - */ - public String smallToFull(String thumb, String page) { - try { - // Fetch the image page - Response resp = Http.url(page) - .referrer(this.url) - .cookies(cookies) - .response(); - cookies.putAll(resp.cookies()); - - // Try to find the description - Elements els = resp.parse().select("a.dev-page-download"); - if (els.size() == 0) { - throw new IOException("No download page found"); - } - // Full-size image - String fsimage = els.get(0).attr("href"); - return fsimage; - } catch (IOException ioe) { - try { - logger.info("Failed to get full size download image at " + page + " : '" + ioe.getMessage() + "'"); - String lessThanFull = thumbToFull(thumb, false); - logger.info("Falling back to less-than-full-size image " + lessThanFull); - return lessThanFull; - } catch (Exception e) { - return null; - } - } - } - - /** - * Logs into deviant art. Required to rip full-size NSFW content. - * @return Map of cookies containing session data. - */ - private Map loginToDeviantart() throws IOException { - // Populate postData fields - Map postData = new HashMap(); - String username = Utils.getConfigString("deviantart.username", new String(Base64.decode("Z3JhYnB5"))); - String password = Utils.getConfigString("deviantart.password", new String(Base64.decode("ZmFrZXJz"))); - if (username == null || password == null) { - throw new IOException("could not find username or password in config"); - } - Response resp = Http.url("http://www.deviantart.com/") - .response(); - for (Element input : resp.parse().select("form#form-login input[type=hidden]")) { - postData.put(input.attr("name"), input.attr("value")); - } - postData.put("username", username); - postData.put("password", password); - postData.put("remember_me", "1"); - - // Send login request - resp = Http.url("https://www.deviantart.com/users/login") - .userAgent(USER_AGENT) - .data(postData) - .cookies(resp.cookies()) - .method(Method.POST) - .response(); - - // Assert we are logged in - if (resp.hasHeader("Location") && resp.header("Location").contains("password")) { - // Wrong password - throw new IOException("Wrong password"); - } - if (resp.url().toExternalForm().contains("bad_form")) { - throw new IOException("Login form was incorrectly submitted"); - } - if (resp.cookie("auth_secure") == null || - resp.cookie("auth") == null) { - throw new IOException("No auth_secure or auth cookies received"); - } - // We are logged in, save the cookies - return resp.cookies(); - } -} diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java index 4e9c51c8..f7f1ed93 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java @@ -15,8 +15,10 @@ import java.util.regex.Pattern; import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; +import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.safety.Whitelist; import org.jsoup.select.Elements; import com.rarchives.ripme.ripper.AbstractHTMLRipper; @@ -220,14 +222,18 @@ public class DeviantartRipper extends AbstractHTMLRipper { .response(); cookies.putAll(resp.cookies()); - // Try to find the "Download" box + // Try to find the description Elements els = resp.parse().select("div[class=dev-description]"); if (els.size() == 0) { throw new IOException("No description found"); } - // Full-size image - String desc = els.text(); // TODO Figure out how to preserve newlines - return desc; + Document documentz = resp.parse(); + Element ele = documentz.select("div[class=dev-description]").get(0); + documentz.outputSettings(new Document.OutputSettings().prettyPrint(false)); + ele.select("br").append("\\n"); + ele.select("p").prepend("\\n\\n"); + return Jsoup.clean(ele.html().replaceAll("\\\\n", System.getProperty("line.separator")), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); + // TODO Make this not make a newline if someone just types \n into the description. } catch (IOException ioe) { logger.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'"); return null; @@ -251,7 +257,7 @@ public class DeviantartRipper extends AbstractHTMLRipper { .response(); cookies.putAll(resp.cookies()); - // Try to find the description + // Try to find the download button Elements els = resp.parse().select("a.dev-page-download"); if (els.size() == 0) { throw new IOException("No download page found");