From cf7fbb9d3e1eb09f98c57b57fd545befbcd9d4ac Mon Sep 17 00:00:00 2001 From: 4pr0n Date: Sat, 31 May 2014 23:53:35 -0700 Subject: [PATCH] 1.0.49 - Ehentai ripper revamped For #41: * Multi-threading * Longer wait periods * Rate-limit checks & retries * New method for retrieving image from page --- pom.xml | 2 +- .../ripme/ripper/rippers/EHentaiRipper.java | 227 +++++++++++++----- .../com/rarchives/ripme/ui/UpdateUtils.java | 2 +- 3 files changed, 170 insertions(+), 61 deletions(-) diff --git a/pom.xml b/pom.xml index 294671a9..75bfff8c 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.rarchives.ripme ripme jar - 1.0.48 + 1.0.49 ripme http://rip.rarchives.com diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/EHentaiRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/EHentaiRipper.java index 592a4cb7..4915a2e8 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/EHentaiRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/EHentaiRipper.java @@ -14,12 +14,23 @@ import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.rarchives.ripme.ripper.AlbumRipper; +import com.rarchives.ripme.ripper.DownloadThreadPool; +import com.rarchives.ripme.ui.RipStatusMessage.STATUS; import com.rarchives.ripme.utils.Utils; public class EHentaiRipper extends AlbumRipper { + // All sleep times are in milliseconds + private static final int PAGE_SLEEP_TIME = 3 * 1000; + private static final int IMAGE_SLEEP_TIME = 1 * 1000; + private static final int IP_BLOCK_SLEEP_TIME = 60 * 1000; + private static final String DOMAIN = "g.e-hentai.org", HOST = "e-hentai"; private static final Logger logger = Logger.getLogger(EHentaiRipper.class); + // Thread pool for finding direct image links from "image" pages (html) + private DownloadThreadPool ehentaiThreadPool = new DownloadThreadPool("ehentai"); + + // Current HTML document private Document albumDoc = null; public EHentaiRipper(URL url) throws IOException { @@ -39,13 +50,18 @@ public class EHentaiRipper extends AlbumRipper { try { // Attempt to use album title as GID if (albumDoc == null) { - albumDoc = Jsoup.connect(url.toExternalForm()).get(); + logger.info(" Retrieving " + url.toExternalForm()); + sendUpdate(STATUS.LOADING_RESOURCE, url.toString()); + albumDoc = Jsoup.connect(url.toExternalForm()) + .userAgent(USER_AGENT) + .timeout(5000) + .get(); } Elements elems = albumDoc.select("#gn"); return HOST + "_" + elems.get(0).text(); } catch (Exception e) { // Fall back to default album naming convention - e.printStackTrace(); + logger.warn("Failed to get album title from " + url, e); } return super.getAlbumTitle(url); } @@ -55,8 +71,6 @@ public class EHentaiRipper extends AlbumRipper { Pattern p; Matcher m; - System.out.println(url); - p = Pattern.compile("^.*g\\.e-hentai\\.org/g/([0-9]+)/([a-fA-F0-9]+)/$"); m = p.matcher(url.toExternalForm()); if (m.matches()) { @@ -71,67 +85,76 @@ public class EHentaiRipper extends AlbumRipper { @Override public void rip() throws IOException { - int index = 0; - if (albumDoc == null) { - logger.info(" Retrieving " + this.url.toExternalForm()); - albumDoc = Jsoup.connect(this.url.toExternalForm()).get(); - } - Elements select = albumDoc.select("#gdt > .gdtm"); - Element first = select.first(); - String href = first.select("a").attr("href"); - if (href.equals("")) { - throw new IOException("Could not find 'href' inside elements under #gdt > .gdtm > a"); - } - URL cursorUrl = new URL(href), prevUrl = null; - - while (!cursorUrl.equals(prevUrl)) { - prevUrl = cursorUrl; - Document cursorDoc = Jsoup.connect(cursorUrl.toExternalForm()) - .userAgent(USER_AGENT) - .get(); - - Elements a = cursorDoc.select(".sni > a"); - Elements images = a.select("img"); - if (images.size() == 0) { - logger.info("cursorDoc: " + cursorDoc.toString()); - logger.error("No images found at " + cursorUrl); - break; + int index = 0, retries = 3; + String nextUrl = this.url.toExternalForm(); + while (true) { + if (albumDoc == null) { + logger.info(" Retrieving album page " + nextUrl); + sendUpdate(STATUS.LOADING_RESOURCE, nextUrl); + albumDoc = Jsoup.connect(nextUrl) + .userAgent(USER_AGENT) + .timeout(5000) + .referrer(this.url.toExternalForm()) + .get(); } - - String imgsrc = images.get(0).attr("src"); - if (imgsrc.equals("")) { - logger.warn("Image URL is empty via " + images.get(0)); + // Check for rate limiting + if (albumDoc.toString().contains("IP address will be automatically banned")) { + if (retries == 0) { + logger.error("Hit rate limit and maximum number of retries, giving up"); + break; + } + logger.warn("Hit rate limit while loading " + nextUrl + ", sleeping for " + IP_BLOCK_SLEEP_TIME + "ms, " + retries + " retries remaining"); + retries--; + try { + Thread.sleep(IP_BLOCK_SLEEP_TIME); + } catch (InterruptedException e) { + logger.error("Interrupted while waiting for rate limit to subside", e); + break; + } + albumDoc = null; continue; } - logger.info("Found URL " + imgsrc + " via " + images.get(0)); - Pattern p = Pattern.compile("^http://.*/ehg/image.php.*&n=([^&]+).*$"); - Matcher m = p.matcher(imgsrc); - if (m.matches()) { - // Manually discover filename from URL - String savePath = this.workingDir + File.separator; - if (Utils.getConfigBoolean("download.save_order", true)) { - savePath += String.format("%03d_", index + 1); - } - savePath += m.group(1); - addURLToDownload(new URL(imgsrc), new File(savePath)); - } - else { - // Provide prefix and let the AbstractRipper "guess" the filename - String prefix = ""; - if (Utils.getConfigBoolean("download.save_order", true)) { - prefix = String.format("%03d_", index + 1); - } - addURLToDownload(new URL(imgsrc), prefix); - } - - String nextUrl = a.attr("href"); - if (nextUrl.equals("")) { - logger.warn("Next page URL is empty, via " + a); + // Find thumbnails + Elements thumbs = albumDoc.select("#gdt > .gdtm a"); + if (thumbs.size() == 0) { + logger.info("albumDoc: " + albumDoc); + logger.info("No images found at " + nextUrl); break; } - cursorUrl = new URL(nextUrl); + // Iterate over images on page + for (Element thumb : thumbs) { + index++; + EHentaiImageThread t = new EHentaiImageThread(new URL(thumb.attr("href")), index, this.workingDir); + ehentaiThreadPool.addThread(t); + try { + Thread.sleep(IMAGE_SLEEP_TIME); + } catch (InterruptedException e) { + logger.warn("Interrupted while waiting to load next image", e); + } + } + // Find next page + Elements hrefs = albumDoc.select(".ptt a"); + if (hrefs.size() == 0) { + logger.info("No navigation links found at " + nextUrl); + break; + } + // Ensure next page is different from the current page + String lastUrl = nextUrl; + nextUrl = hrefs.last().attr("href"); + if (lastUrl.equals(nextUrl)) { + break; // We're on the last page + } - index++; + // Reset albumDoc so we fetch the page next time + albumDoc = null; + + // Sleep before loading next page + try { + Thread.sleep(PAGE_SLEEP_TIME); + } catch (InterruptedException e) { + logger.error("Interrupted while waiting to load next page", e); + break; + } } waitForThreads(); @@ -140,4 +163,90 @@ public class EHentaiRipper extends AlbumRipper { public boolean canRip(URL url) { return url.getHost().endsWith(DOMAIN); } + + /** + * Helper class to find and download images found on "image" pages + * + * Handles case when site has IP-banned the user. + */ + private class EHentaiImageThread extends Thread { + private URL url; + private int index; + private File workingDir; + private int retries = 3; + + public EHentaiImageThread(URL url, int index, File workingDir) { + super(); + this.url = url; + this.index = index; + this.workingDir = workingDir; + } + + @Override + public void run() { + fetchImage(); + } + + private void fetchImage() { + try { + Document doc = Jsoup.connect(this.url.toExternalForm()) + .userAgent(USER_AGENT) + .timeout(5000) + .referrer(this.url.toExternalForm()) + .get(); + // Check for rate limit + if (doc.toString().contains("IP address will be automatically banned")) { + if (this.retries == 0) { + logger.error("Rate limited & ran out of retries, skipping image at " + this.url); + return; + } + logger.warn("Hit rate limit. Sleeping for " + IP_BLOCK_SLEEP_TIME + "ms"); + try { + Thread.sleep(IP_BLOCK_SLEEP_TIME); + } catch (InterruptedException e) { + logger.error("Interrupted while waiting for rate limit to subside", e); + return; + } + this.retries--; + fetchImage(); // Re-attempt to download the image + return; + } + + // Find image + Elements images = doc.select(".sni > a > img"); + if (images.size() == 0) { + // Attempt to find image elsewise (Issue #41) + images = doc.select("img#img"); + if (images.size() == 0) { + logger.warn("Image not found at " + this.url); + return; + } + } + Element image = images.first(); + String imgsrc = image.attr("src"); + logger.info("Found URL " + imgsrc + " via " + images.get(0)); + Pattern p = Pattern.compile("^http://.*/ehg/image.php.*&n=([^&]+).*$"); + Matcher m = p.matcher(imgsrc); + if (m.matches()) { + // Manually discover filename from URL + String savePath = this.workingDir + File.separator; + if (Utils.getConfigBoolean("download.save_order", true)) { + savePath += String.format("%03d_", index); + } + savePath += m.group(1); + addURLToDownload(new URL(imgsrc), new File(savePath)); + } + else { + // Provide prefix and let the AbstractRipper "guess" the filename + String prefix = ""; + if (Utils.getConfigBoolean("download.save_order", true)) { + prefix = String.format("%03d_", index); + } + addURLToDownload(new URL(imgsrc), prefix); + } + } catch (IOException e) { + logger.error("[!] Exception while loading/parsing " + this.url, e); + } + } + } } \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java b/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java index e5086b45..93c6d54f 100644 --- a/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java +++ b/src/main/java/com/rarchives/ripme/ui/UpdateUtils.java @@ -21,7 +21,7 @@ import com.rarchives.ripme.utils.Utils; public class UpdateUtils { private static final Logger logger = Logger.getLogger(UpdateUtils.class); - private static final String DEFAULT_VERSION = "1.0.48"; + private static final String DEFAULT_VERSION = "1.0.49"; private static final String updateJsonURL = "http://rarchives.com/ripme.json"; private static final String updateJarURL = "http://rarchives.com/ripme.jar"; private static final String mainFileName = "ripme.jar";