Merge #516 @Wiiplay123: Fixed deviantArt ripping

This commit is contained in:
MetaPrime 2017-05-15 10:30:30 -07:00
commit f08ae4ce4e
2 changed files with 144 additions and 51 deletions

View File

@ -53,11 +53,11 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
public boolean hasDescriptionSupport() { public boolean hasDescriptionSupport() {
return false; return false;
} }
public String getDescription(String page) throws IOException { public String[] getDescription(String url,Document page) throws IOException {
throw new IOException("getDescription not implemented"); // Do I do this or make an abstract function? throw new IOException("getDescription not implemented"); // Do I do this or make an abstract function?
} }
public int descSleepTime() { public int descSleepTime() {
return 0; return 100;
} }
@Override @Override
public void rip() throws IOException { public void rip() throws IOException {
@ -99,12 +99,23 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
} }
textindex += 1; textindex += 1;
logger.debug("Getting description from " + textURL); logger.debug("Getting description from " + textURL);
sleep(descSleepTime()); String[] tempDesc = getDescription(textURL,doc);
String tempDesc = getDescription(textURL);
if (tempDesc != null) { if (tempDesc != null) {
logger.debug("Got description: " + tempDesc); if (Utils.getConfigBoolean("file.overwrite", false) || !(new File(
saveText(new URL(textURL), "", tempDesc, textindex); workingDir.getCanonicalPath()
+ ""
+ File.separator
+ getPrefix(index)
+ (tempDesc.length > 1 ? tempDesc[1] : fileNameFromURL(new URL(textURL)))
+ ".txt").exists())) {
logger.debug("Got description from " + textURL);
saveText(new URL(textURL), "", tempDesc[0], textindex, (tempDesc.length > 1 ? tempDesc[1] : fileNameFromURL(new URL(textURL))));
sleep(descSleepTime());
} else {
logger.debug("Description from " + textURL + " already exists.");
}
} }
} }
} }
} }
@ -129,19 +140,27 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
} }
waitForThreads(); waitForThreads();
} }
/**
 * Derives a bare file name from the last path segment of a URL.
 * A single trailing slash is ignored, and the segment is cut at the first
 * occurrence of each of '?', '#', '&amp;' and ':' (applied in that order).
 *
 * @param url The URL to derive a name from
 * @return The last path segment, truncated at the first delimiter; may be empty
 */
public String fileNameFromURL(URL url) {
    String saveAs = url.toExternalForm();
    // Compare by content. The previous "==" check compared object references,
    // never matched, and left URLs ending in "/" with an empty file name.
    if (saveAs.endsWith("/")) {
        saveAs = saveAs.substring(0, saveAs.length() - 1);
    }
    saveAs = saveAs.substring(saveAs.lastIndexOf('/') + 1);
    // Strip query string, fragment, extra parameters and ":size" style suffixes.
    for (char delimiter : new char[] {'?', '#', '&', ':'}) {
        int cut = saveAs.indexOf(delimiter);
        if (cut >= 0) {
            saveAs = saveAs.substring(0, cut);
        }
    }
    return saveAs;
}
public boolean saveText(URL url, String subdirectory, String text, int index) { public boolean saveText(URL url, String subdirectory, String text, int index) {
String saveAs = fileNameFromURL(url);
return saveText(url,subdirectory,text,index,saveAs);
}
public boolean saveText(URL url, String subdirectory, String text, int index, String fileName) {
// Not the best for some cases, like FurAffinity. Overridden there. // Not the best for some cases, like FurAffinity. Overridden there.
try { try {
stopCheck(); stopCheck();
} catch (IOException e) { } catch (IOException e) {
return false; return false;
} }
String saveAs = url.toExternalForm();
saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1);
if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); }
if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); }
if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); }
if (saveAs.indexOf(':') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf(':')); }
File saveFileAs; File saveFileAs;
try { try {
if (!subdirectory.equals("")) { // Not sure about this part if (!subdirectory.equals("")) { // Not sure about this part
@ -153,7 +172,7 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
+ subdirectory + subdirectory
+ File.separator + File.separator
+ getPrefix(index) + getPrefix(index)
+ saveAs + fileName
+ ".txt"); + ".txt");
// Write the file // Write the file
FileOutputStream out = (new FileOutputStream(saveFileAs)); FileOutputStream out = (new FileOutputStream(saveFileAs));

View File

@ -1,6 +1,7 @@
package com.rarchives.ripme.ripper.rippers; package com.rarchives.ripme.ripper.rippers;
import java.io.IOException; import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.ArrayList; import java.util.ArrayList;
@ -114,18 +115,39 @@ public class DeviantartRipper extends AbstractHTMLRipper {
cookies = loginToDeviantart(); cookies = loginToDeviantart();
} catch (Exception e) { } catch (Exception e) {
logger.warn("Failed to login: ", e); logger.warn("Failed to login: ", e);
cookies.put("agegate_state","1"); // Bypasses the age gate
} }
return Http.url(this.url) return Http.url(this.url)
.cookies(cookies) .cookies(cookies)
.get(); .get();
} }
/**
 * Looks for an image URL embedded in the page's inline JavaScript.
 * Scans every script tag containing "window.__pageload" for the given id and
 * returns the value of the first },"src":"..." entry that follows it, with
 * escaped slashes ("\/") converted back to "/".
 *
 * @param page The page whose script tags are searched
 * @param id The identifier to locate inside the script before looking for "src"
 * @return The unescaped src value, or null if no script tag yields one
 */
public String jsonToImage(Document page, String id) {
    // An empty id would match at offset 0 and return an unrelated image.
    if (id == null || id.isEmpty()) {
        return null;
    }
    final String srcMarker = "},\"src\":\"";
    final String endMarker = "\",\"type\"";
    Elements js = page.select("script[type=\"text/javascript\"]");
    for (Element tag : js) {
        String script = tag.html();
        int pageloadAt = script.indexOf("window.__pageload");
        if (pageloadAt < 0) {
            continue;
        }
        int idAt = script.indexOf(id, pageloadAt);
        if (idAt < 0) {
            continue;
        }
        // first },"src":"url" after id
        int srcAt = script.indexOf(srcMarker, idAt);
        int valueStart = srcAt + srcMarker.length();
        // Search for the terminator only after the value starts. The previous code
        // added 9 to a possible -1 and could return an arbitrary slice of the script.
        int valueEnd = (srcAt < 0) ? -1 : script.indexOf(endMarker, valueStart);
        if (srcAt < 0 || valueEnd < 0) {
            logger.debug("Unable to get json link from " + page.location());
            continue;
        }
        return script.substring(valueStart, valueEnd).replace("\\/", "/");
    }
    return null;
}
@Override @Override
public List<String> getURLsFromPage(Document page) { public List<String> getURLsFromPage(Document page) {
List<String> imageURLs = new ArrayList<String>(); List<String> imageURLs = new ArrayList<String>();
// Iterate over all thumbnails // Iterate over all thumbnails
for (Element thumb : page.select("div.zones-container a.thumb")) { for (Element thumb : page.select("div.zones-container span.thumb")) {
if (isStopped()) { if (isStopped()) {
break; break;
} }
@ -133,17 +155,28 @@ public class DeviantartRipper extends AbstractHTMLRipper {
if (img.attr("transparent").equals("false")) { if (img.attr("transparent").equals("false")) {
continue; // a.thumbs to other albums are invisible continue; // a.thumbs to other albums are invisible
} }
// Get full-sized image via helper methods // Get full-sized image via helper methods
String fullSize = null; String fullSize = null;
try { if (thumb.attr("data-super-full-img").contains("//orig")) {
fullSize = thumbToFull(img.attr("src"), true); fullSize = thumb.attr("data-super-full-img");
} catch (Exception e) { } else {
logger.info("Attempting to get full size image from " + thumb.attr("href")); String spanUrl = thumb.attr("href");
fullSize = smallToFull(img.attr("src"), thumb.attr("href")); String fullSize1 = jsonToImage(page,spanUrl.substring(spanUrl.lastIndexOf('-') + 1));
if (fullSize1 == null || !fullSize1.contains("//orig")) {
fullSize = smallToFull(img.attr("src"), spanUrl);
}
if (fullSize == null && fullSize1 != null) {
fullSize = fullSize1;
}
} }
if (fullSize == null) { if (fullSize == null) {
continue; if (thumb.attr("data-super-full-img") != null) {
fullSize = thumb.attr("data-super-full-img");
} else if (thumb.attr("data-super-img") != null) {
fullSize = thumb.attr("data-super-img");
} else {
continue;
}
} }
if (triedURLs.contains(fullSize)) { if (triedURLs.contains(fullSize)) {
logger.warn("Already tried to download " + fullSize); logger.warn("Already tried to download " + fullSize);
@ -162,9 +195,9 @@ public class DeviantartRipper extends AbstractHTMLRipper {
@Override @Override
public List<String> getDescriptionsFromPage(Document page) { public List<String> getDescriptionsFromPage(Document page) {
List<String> textURLs = new ArrayList<String>(); List<String> textURLs = new ArrayList<String>();
// Iterate over all thumbnails // Iterate over all thumbnails
for (Element thumb : page.select("div.zones-container a.thumb")) { for (Element thumb : page.select("div.zones-container span.thumb")) {
logger.info(thumb.attr("href"));
if (isStopped()) { if (isStopped()) {
break; break;
} }
@ -173,6 +206,7 @@ public class DeviantartRipper extends AbstractHTMLRipper {
continue; // a.thumbs to other albums are invisible continue; // a.thumbs to other albums are invisible
} }
textURLs.add(thumb.attr("href")); textURLs.add(thumb.attr("href"));
} }
return textURLs; return textURLs;
} }
@ -181,14 +215,15 @@ public class DeviantartRipper extends AbstractHTMLRipper {
if (isThisATest()) { if (isThisATest()) {
return null; return null;
} }
Elements nextButtons = page.select("li.next > a"); Elements nextButtons = page.select("link[rel=\"next\"]");
if (nextButtons.size() == 0) { if (nextButtons.size() == 0) {
throw new IOException("No next page found"); if (page.select("link[rel=\"prev\"]").size() == 0) {
throw new IOException("No next page found");
} else {
throw new IOException("Hit end of pages");
}
} }
Element a = nextButtons.first(); Element a = nextButtons.first();
if (a.hasClass("disabled")) {
throw new IOException("Hit end of pages");
}
String nextPage = a.attr("href"); String nextPage = a.attr("href");
if (nextPage.startsWith("/")) { if (nextPage.startsWith("/")) {
nextPage = "http://" + this.url.getHost() + nextPage; nextPage = "http://" + this.url.getHost() + nextPage;
@ -244,36 +279,54 @@ public class DeviantartRipper extends AbstractHTMLRipper {
* Attempts to download description for image. * Attempts to download description for image.
* Comes in handy when people put entire stories in their description. * Comes in handy when people put entire stories in their description.
* If no description was found, returns null. * If no description was found, returns null.
* @param page The page the description will be retrieved from * @param url The URL the description will be retrieved from
* @return The description * @param page The gallery page the URL was found on
* @return A String[] with first object being the description, and the second object being image file name if found.
*/ */
@Override @Override
public String getDescription(String page) { public String[] getDescription(String url,Document page) {
if (isThisATest()) { if (isThisATest()) {
return null; return null;
} }
try { try {
// Fetch the image page // Fetch the image page
Response resp = Http.url(page) Response resp = Http.url(url)
.referrer(this.url) .referrer(this.url)
.cookies(cookies) .cookies(cookies)
.response(); .response();
cookies.putAll(resp.cookies()); cookies.putAll(resp.cookies());
// Try to find the description // Try to find the description
Elements els = resp.parse().select("div[class=dev-description]"); Document documentz = resp.parse();
if (els.size() == 0) { Element ele = documentz.select("div.dev-description").first();
if (ele == null) {
throw new IOException("No description found"); throw new IOException("No description found");
} }
Document documentz = resp.parse();
Element ele = documentz.select("div[class=dev-description]").get(0);
documentz.outputSettings(new Document.OutputSettings().prettyPrint(false)); documentz.outputSettings(new Document.OutputSettings().prettyPrint(false));
ele.select("br").append("\\n"); ele.select("br").append("\\n");
ele.select("p").prepend("\\n\\n"); ele.select("p").prepend("\\n\\n");
return Jsoup.clean(ele.html().replaceAll("\\\\n", System.getProperty("line.separator")), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); String fullSize = null;
Element thumb = page.select("div.zones-container span.thumb[href=\"" + url + "\"]").get(0);
if (!thumb.attr("data-super-full-img").isEmpty()) {
fullSize = thumb.attr("data-super-full-img");
String[] split = fullSize.split("/");
fullSize = split[split.length - 1];
} else {
String spanUrl = thumb.attr("href");
fullSize = jsonToImage(page,spanUrl.substring(spanUrl.lastIndexOf('-') + 1));
if (fullSize != null) {
String[] split = fullSize.split("/");
fullSize = split[split.length - 1];
}
}
if (fullSize == null) {
return new String[] {Jsoup.clean(ele.html().replaceAll("\\\\n", System.getProperty("line.separator")), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false))};
}
fullSize = fullSize.substring(0, fullSize.lastIndexOf("."));
return new String[] {Jsoup.clean(ele.html().replaceAll("\\\\n", System.getProperty("line.separator")), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)),fullSize};
// TODO Make this not make a newline if someone just types \n into the description. // TODO Make this not make a newline if someone just types \n into the description.
} catch (IOException ioe) { } catch (IOException ioe) {
logger.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'"); logger.info("Failed to get description at " + url + ": '" + ioe.getMessage() + "'");
return null; return null;
} }
} }
@ -294,23 +347,44 @@ public class DeviantartRipper extends AbstractHTMLRipper {
.cookies(cookies) .cookies(cookies)
.response(); .response();
cookies.putAll(resp.cookies()); cookies.putAll(resp.cookies());
// Try to find the download button
Document doc = resp.parse(); Document doc = resp.parse();
Elements els = doc.select("a.dev-page-download"); Elements els = doc.select("img.dev-content-full");
if (els.size() > 0) { String fsimage = null;
// Full-size image
String fsimage = els.get(0).attr("href");
logger.info("Found download page: " + fsimage);
return fsimage;
}
// Get the largest resolution image on the page // Get the largest resolution image on the page
els = doc.select("img.dev-content-full");
if (els.size() > 0) { if (els.size() > 0) {
// Large image // Large image
String fsimage = els.get(0).attr("src"); fsimage = els.get(0).attr("src");
logger.info("Found large-scale: " + fsimage); logger.info("Found large-scale: " + fsimage);
if (fsimage.contains("//orig")) {
return fsimage;
}
}
// Try to find the download button
els = doc.select("a.dev-page-download");
if (els.size() > 0) {
// Full-size image
String downloadLink = els.get(0).attr("href");
logger.info("Found download button link: " + downloadLink);
HttpURLConnection con = (HttpURLConnection) new URL(downloadLink).openConnection();
con.setRequestProperty("Referer",this.url.toString());
String cookieString = "";
for (Map.Entry<String, String> entry : cookies.entrySet()) {
cookieString = cookieString + entry.getKey() + "=" + entry.getValue() + "; ";
}
cookieString = cookieString.substring(0,cookieString.length() - 1);
con.setRequestProperty("Cookie",cookieString);
con.setRequestProperty("User-Agent",this.USER_AGENT);
con.setInstanceFollowRedirects(true);
con.connect();
int code = con.getResponseCode();
String location = con.getURL().toString();
con.disconnect();
if (location.contains("//orig")) {
fsimage = location;
logger.info("Found image download: " + location);
}
}
if (fsimage != null) {
return fsimage; return fsimage;
} }
throw new IOException("No download page found"); throw new IOException("No download page found");