Improved description ripping for deviantART

Also added some functions to AbstractHTMLRipper that should allow
description ripping to be added to other rippers as well.
hasDescriptionSupport() is overridden to return true by a ripper that
supports descriptions, which makes rip() run the description ripper.
getDescriptionsFromPage() lists the pages to grab descriptions from, and
getDescription() grabs the description from one such page. Both must be
overridden if you want to grab descriptions with a ripper, because the
default implementations throw an IOException.
This commit is contained in:
Wiiplay123 2014-11-28 22:59:39 -06:00
parent 72b40394ae
commit 16e0d27f66
3 changed files with 124 additions and 50 deletions

View File

@ -1,5 +1,7 @@
package com.rarchives.ripme.ripper; package com.rarchives.ripme.ripper;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
@ -27,6 +29,9 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
throw new IOException("getNextPage not implemented"); throw new IOException("getNextPage not implemented");
} }
public abstract List<String> getURLsFromPage(Document page); public abstract List<String> getURLsFromPage(Document page);
/**
 * Lists the pages that descriptions should be ripped from.
 * This default implementation always fails. rip() only calls it when
 * hasDescriptionSupport() returns true, so a ripper that enables
 * description support has to override it.
 * @param doc The page currently being ripped
 * @return URLs of the pages to pass to getDescription
 * @throws IOException Always, in this default implementation
 */
public List<String> getDescriptionsFromPage(Document doc) throws IOException {
    // Not abstract, so rippers without description support do not have to implement it.
    final String reason = "getDescriptionsFromPage not implemented";
    throw new IOException(reason);
}
public abstract void downloadURL(URL url, int index); public abstract void downloadURL(URL url, int index);
public DownloadThreadPool getThreadPool() { public DownloadThreadPool getThreadPool() {
return null; return null;
@ -45,10 +50,16 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
public URL sanitizeURL(URL url) throws MalformedURLException { public URL sanitizeURL(URL url) throws MalformedURLException {
return url; return url;
} }
/**
 * Tells rip() whether it should also save descriptions for this ripper.
 * @return false in this default implementation; rippers that implement
 *         getDescriptionsFromPage and getDescription override this to
 *         return true
 */
public boolean hasDescriptionSupport() {
    final boolean supported = false;
    return supported;
}
/**
 * Grabs the description from a single page.
 * This default implementation always fails. rip() only calls it when
 * hasDescriptionSupport() returns true, so a ripper that enables
 * description support has to override it.
 * @param page URL of the page to take the description from
 * @return The description text
 * @throws IOException Always, in this default implementation
 */
public String getDescription(String page) throws IOException {
    // Not abstract, so rippers without description support do not have to implement it.
    final String reason = "getDescription not implemented";
    throw new IOException(reason);
}
@Override @Override
public void rip() throws IOException { public void rip() throws IOException {
int index = 0; int index = 0;
int textindex = 0;
logger.info("Retrieving " + this.url); logger.info("Retrieving " + this.url);
sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm()); sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm());
Document doc = getFirstPage(); Document doc = getFirstPage();
@ -67,6 +78,21 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
index += 1; index += 1;
downloadURL(new URL(imageURL), index); downloadURL(new URL(imageURL), index);
} }
if (hasDescriptionSupport()) {
List<String> textURLs = getDescriptionsFromPage(doc);
if (textURLs.size() > 0) {
for (String textURL : textURLs) {
if (isStopped()) {
break;
}
textindex += 1;
String tempDesc = getDescription(textURL);
if (tempDesc != null) {
saveText(new URL(textURL), "", tempDesc, textindex);
}
}
}
}
if (isStopped()) { if (isStopped()) {
break; break;
@ -87,7 +113,46 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
} }
waitForThreads(); waitForThreads();
} }
/**
 * Writes a description to a ".txt" file below the working directory.
 * The file name is the part of {@code url} after the last '/', cut at the
 * first '?', '#', '&' or ':', with the result of getPrefix(index) in front.
 * The text is written as UTF-8.
 * @param url URL the description belongs to; used to build the file name
 * @param subdirectory Directory below the working directory, or "" for none
 * @param text The description to save
 * @param index Index of the item, passed to getPrefix
 * @return true if the file was written; false if stopCheck() failed or the
 *         directory or file could not be created or written
 */
public boolean saveText(URL url, String subdirectory, String text, int index) {
    try {
        stopCheck();
    } catch (IOException e) {
        return false;
    }
    String saveAs = url.toExternalForm();
    saveAs = saveAs.substring(saveAs.lastIndexOf('/') + 1);
    // Cut the name at each delimiter in turn (same order as before: ? # & :)
    for (char delimiter : new char[] {'?', '#', '&', ':'}) {
        int cut = saveAs.indexOf(delimiter);
        if (cut >= 0) {
            saveAs = saveAs.substring(0, cut);
        }
    }
    File saveFileAs;
    try {
        if (!subdirectory.equals("")) {
            subdirectory = File.separator + subdirectory;
        }
        saveFileAs = new File(
                workingDir.getCanonicalPath()
                + subdirectory
                + File.separator
                + getPrefix(index)
                + saveAs
                + ".txt");
        // The directory must exist BEFORE the file is opened; creating it
        // afterwards (as was done previously) is too late to be of any use.
        File parent = saveFileAs.getParentFile();
        if (!parent.exists()) {
            logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent()));
            if (!parent.mkdirs() && !parent.exists()) {
                throw new IOException("Could not create directory " + parent);
            }
        }
        logger.debug("Downloading " + url + "'s description to " + saveFileAs);
        // Write the file; the stream is closed even if the write fails
        FileOutputStream out = new FileOutputStream(saveFileAs);
        try {
            // Explicit charset so the output does not depend on the platform default
            out.write(text.getBytes("UTF-8"));
        } finally {
            out.close();
        }
    } catch (IOException e) {
        logger.error("[!] Error creating save file path for description '" + url + "':", e);
        return false;
    }
    return true;
}
public String getPrefix(int index) { public String getPrefix(int index) {
String prefix = ""; String prefix = "";
if (keepSortOrder() && Utils.getConfigBoolean("download.save_order", true)) { if (keepSortOrder() && Utils.getConfigBoolean("download.save_order", true)) {

View File

@ -2,7 +2,6 @@ package com.rarchives.ripme.ripper;
import java.awt.Desktop; import java.awt.Desktop;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.lang.reflect.Constructor; import java.lang.reflect.Constructor;
import java.net.MalformedURLException; import java.net.MalformedURLException;
@ -132,46 +131,7 @@ public abstract class AbstractRipper
} }
return addURLToDownload(url, saveFileAs, referrer, cookies); return addURLToDownload(url, saveFileAs, referrer, cookies);
} }
/**
 * Writes a description to a ".txt" file below the working directory.
 * The file name is the part of {@code url} after the last '/', cut at the
 * first '?', '#', '&' or ':'. The text is written as UTF-8.
 * @param url URL the description belongs to; used to build the file name
 * @param subdirectory Directory below the working directory, or "" for none
 * @param referrer Not used by this method
 * @param cookies Not used by this method
 * @param text The description to save
 * @return true if the file was written; false if stopCheck() failed or the
 *         directory or file could not be created or written
 */
public boolean saveText(URL url, String subdirectory, String referrer, Map<String,String> cookies, String text) {
    try {
        stopCheck();
    } catch (IOException e) {
        return false;
    }
    String saveAs = url.toExternalForm();
    saveAs = saveAs.substring(saveAs.lastIndexOf('/') + 1);
    // Cut the name at each delimiter in turn (same order as before: ? # & :)
    for (char delimiter : new char[] {'?', '#', '&', ':'}) {
        int cut = saveAs.indexOf(delimiter);
        if (cut >= 0) {
            saveAs = saveAs.substring(0, cut);
        }
    }
    File saveFileAs;
    try {
        if (!subdirectory.equals("")) {
            subdirectory = File.separator + subdirectory;
        }
        // TODO Get prefix working again, probably requires reworking a lot of stuff!
        saveFileAs = new File(
                workingDir.getCanonicalPath()
                + subdirectory
                // + prefix
                + File.separator
                + saveAs
                + ".txt");
        // The directory must exist BEFORE the file is opened; creating it
        // afterwards (as was done previously) is too late to be of any use.
        File parent = saveFileAs.getParentFile();
        if (!parent.exists()) {
            logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent()));
            if (!parent.mkdirs() && !parent.exists()) {
                throw new IOException("Could not create directory " + parent);
            }
        }
        logger.debug("Downloading " + url + "'s description to " + saveFileAs);
        // Write the file; the stream is closed even if the write fails
        FileOutputStream out = new FileOutputStream(saveFileAs);
        try {
            // Explicit charset so the output does not depend on the platform default
            out.write(text.getBytes("UTF-8"));
        } finally {
            out.close();
        }
    } catch (IOException e) {
        logger.error("[!] Error creating save file path for description '" + url + "':", e);
        return false;
    }
    return true;
}
/** /**
* Queues file to be downloaded and saved. With options. * Queues file to be downloaded and saved. With options.

View File

@ -43,7 +43,10 @@ public class DeviantartRipper extends AbstractHTMLRipper {
public String getDomain() { public String getDomain() {
return "deviantart.com"; return "deviantart.com";
} }
/**
 * Turns on description ripping for this ripper, which overrides both
 * getDescriptionsFromPage and getDescription.
 * @return Always true
 */
@Override
public boolean hasDescriptionSupport() {
    final boolean supported = true;
    return supported;
}
@Override @Override
public URL sanitizeURL(URL url) throws MalformedURLException { public URL sanitizeURL(URL url) throws MalformedURLException {
String u = url.toExternalForm(); String u = url.toExternalForm();
@ -118,7 +121,6 @@ public class DeviantartRipper extends AbstractHTMLRipper {
logger.info("Attempting to get full size image from " + thumb.attr("href")); logger.info("Attempting to get full size image from " + thumb.attr("href"));
fullSize = smallToFull(img.attr("src"), thumb.attr("href")); fullSize = smallToFull(img.attr("src"), thumb.attr("href"));
} }
if (fullSize == null) { if (fullSize == null) {
continue; continue;
} }
@ -131,7 +133,23 @@ public class DeviantartRipper extends AbstractHTMLRipper {
} }
return imageURLs; return imageURLs;
} }
/**
 * Collects the link target of every thumbnail on the page so that the
 * description of each linked page can be fetched with getDescription.
 * Thumbnails without an image, and thumbnails whose image has
 * transparent="false", are skipped. Stops early if the rip was stopped.
 * @param page The page whose thumbnails are examined
 * @return The "href" of each remaining thumbnail, in page order
 */
@Override
public List<String> getDescriptionsFromPage(Document page) {
    List<String> textURLs = new ArrayList<String>();
    // Iterate over all thumbnails
    for (Element thumb : page.select("div.zones-container a.thumb")) {
        if (isStopped()) {
            break;
        }
        // first() returns null for an empty selection, where get(0) would throw
        Element img = thumb.select("img").first();
        if (img == null) {
            continue; // thumbnail link without an <img>
        }
        if (img.attr("transparent").equals("false")) {
            continue; // a.thumbs to other albums are invisible
        }
        textURLs.add(thumb.attr("href"));
    }
    return textURLs;
}
@Override @Override
public Document getNextPage(Document page) throws IOException { public Document getNextPage(Document page) throws IOException {
Elements nextButtons = page.select("li.next > a"); Elements nextButtons = page.select("li.next > a");
@ -185,6 +203,37 @@ public class DeviantartRipper extends AbstractHTMLRipper {
return result.toString(); return result.toString();
} }
/**
 * Attempts to download the description for an image.
 * Comes in handy when people put entire stories in their description.
 * Failures are logged and reported as null rather than thrown.
 * @param page URL of the page the description will be retrieved from
 * @return The description text, or null if the page could not be fetched
 *         or contains no description
 */
@Override
public String getDescription(String page) {
    String failure;
    try {
        // Fetch the page, sending the stored cookies and this album's URL as referrer
        Response response = Http.url(page)
                                .referrer(this.url)
                                .cookies(cookies)
                                .response();
        // Remember any cookies the server handed back
        cookies.putAll(response.cookies());
        // Look for the description container
        Elements descriptionBlocks = response.parse().select("div[class=dev-description]");
        if (descriptionBlocks.size() > 0) {
            // TODO Figure out how to preserve newlines
            return descriptionBlocks.text();
        }
        failure = "No description found";
    } catch (IOException ioe) {
        failure = ioe.getMessage();
    }
    logger.info("Failed to get description " + page + " : '" + failure + "'");
    return null;
}
/** /**
* If largest resolution for image at 'thumb' is found, starts downloading * If largest resolution for image at 'thumb' is found, starts downloading
* and returns null. * and returns null.
@ -202,7 +251,7 @@ public class DeviantartRipper extends AbstractHTMLRipper {
.response(); .response();
cookies.putAll(resp.cookies()); cookies.putAll(resp.cookies());
// Try to find the "Download" box // Try to find the description
Elements els = resp.parse().select("a.dev-page-download"); Elements els = resp.parse().select("a.dev-page-download");
if (els.size() == 0) { if (els.size() == 0) {
throw new IOException("No download page found"); throw new IOException("No download page found");