Moving from MultiPage ripper to HTML ripper, added JSON ripper

This commit is contained in:
4pr0n 2014-06-22 19:12:29 -07:00
parent b7397cd31e
commit c166f93d57
12 changed files with 658 additions and 577 deletions

View File

@ -10,9 +10,9 @@ import org.jsoup.nodes.Document;
import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
import com.rarchives.ripme.utils.Utils;
public abstract class AbstractMultiPageRipper extends AlbumRipper {
public abstract class AbstractHTMLRipper extends AlbumRipper {
public AbstractMultiPageRipper(URL url) throws IOException {
public AbstractHTMLRipper(URL url) throws IOException {
super(url);
}
@ -23,6 +23,9 @@ public abstract class AbstractMultiPageRipper extends AlbumRipper {
public abstract Document getNextPage(Document doc) throws IOException;
public abstract List<String> getURLsFromPage(Document page);
public abstract void downloadURL(URL url, int index);
/**
 * Returns the ripper-specific thread pool, if this ripper uses one.
 * The default implementation returns {@code null}, meaning "no extra pool";
 * subclasses that queue work on their own pool override this to expose it.
 *
 * @return the extra thread pool, or {@code null} when there is none
 */
public DownloadThreadPool getThreadPool() {
return null;
}
public boolean keepSortOrder() {
return true;
@ -54,19 +57,29 @@ public abstract class AbstractMultiPageRipper extends AlbumRipper {
for (String imageURL : imageURLs) {
if (isStopped()) {
logger.info("Interrupted");
break;
}
index += 1;
downloadURL(new URL(imageURL), index);
}
if (isStopped()) {
break;
}
try {
sendUpdate(STATUS.LOADING_RESOURCE, "next page");
doc = getNextPage(doc);
} catch (IOException e) {
logger.info("Can't get next page: " + e.getMessage());
break;
}
}
// If they're using a thread pool, wait for it.
if (getThreadPool() != null) {
getThreadPool().waitForThreads();
}
waitForThreads();
}

View File

@ -0,0 +1,93 @@
package com.rarchives.ripme.ripper;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import org.json.JSONObject;
import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
import com.rarchives.ripme.utils.Utils;
/**
 * Skeleton for rippers that walk an album as a sequence of JSON pages.
 *
 * Subclasses supply the page loading ({@link #getFirstPage()},
 * {@link #getNextPage(JSONObject)}), the URL extraction
 * ({@link #getURLsFromJSON(JSONObject)}) and the per-URL download
 * ({@link #downloadURL(URL, int)}); {@link #rip()} drives the loop.
 */
public abstract class AbstractJSONRipper extends AlbumRipper {

    public AbstractJSONRipper(URL url) throws IOException {
        super(url);
    }

    /** @return the domain suffix that {@link #canRip(URL)} matches against a URL's host */
    public abstract String getDomain();

    /** @return the host name of the site handled by this ripper */
    public abstract String getHost();

    /**
     * Loads the first JSON page of the album.
     *
     * @return the first page; {@code null} makes {@link #rip()} skip the page loop
     * @throws IOException if the page cannot be loaded
     */
    public abstract JSONObject getFirstPage() throws IOException;

    /**
     * Loads the page that follows {@code json}.
     *
     * @param json the page that was just processed
     * @return the next page, or {@code null} to end the rip
     * @throws IOException if there is no next page or it cannot be loaded;
     *                     {@link #rip()} treats this as the normal end of the album
     */
    public abstract JSONObject getNextPage(JSONObject json) throws IOException;

    /**
     * Extracts the downloadable URLs from one page.
     *
     * @param json the page to inspect
     * @return the URLs found; an empty list makes {@link #rip()} fail
     */
    public abstract List<String> getURLsFromJSON(JSONObject json);

    /**
     * Queues or performs the download of a single URL.
     *
     * @param url   the resource to download
     * @param index 1-based running position of the URL within the whole rip
     */
    public abstract void downloadURL(URL url, int index);

    /**
     * @return an additional thread pool that {@link #rip()} must wait for,
     *         or {@code null} (the default) when the ripper has none
     */
    public DownloadThreadPool getThreadPool() {
        return null;
    }

    /** @return {@code true} (the default) if file names may carry an index prefix */
    public boolean keepSortOrder() {
        return true;
    }

    /** A URL is rippable when its host ends with {@link #getDomain()}. */
    @Override
    public boolean canRip(URL url) {
        String host = url.getHost();
        return host.endsWith(getDomain());
    }

    /** Returns the URL unchanged. */
    @Override
    public URL sanitizeURL(URL url) throws MalformedURLException {
        return url;
    }

    /**
     * Walks every page of the album, handing each extracted URL to
     * {@link #downloadURL(URL, int)}, then waits for outstanding threads.
     *
     * @throws IOException if the first page cannot be loaded, a page yields
     *                     no URLs, or an extracted URL is malformed
     */
    @Override
    public void rip() throws IOException {
        logger.info("Retrieving " + this.url);
        sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm());

        int count = 0;
        JSONObject page = getFirstPage();
        while (page != null) {
            List<String> pageURLs = getURLsFromJSON(page);
            if (pageURLs.isEmpty()) {
                throw new IOException("No images found at " + this.url);
            }

            for (String pageURL : pageURLs) {
                if (isStopped()) {
                    break;
                }
                downloadURL(new URL(pageURL), ++count);
            }

            // A stop request ends the rip before another page is fetched.
            if (isStopped()) {
                break;
            }

            try {
                sendUpdate(STATUS.LOADING_RESOURCE, "next page");
                page = getNextPage(page);
            } catch (IOException e) {
                // Failing to get a next page is how the album normally ends.
                logger.info("Can't get next page: " + e.getMessage());
                break;
            }
        }

        // A ripper-specific pool, when present, is drained before the shared one.
        if (getThreadPool() != null) {
            getThreadPool().waitForThreads();
        }
        waitForThreads();
    }

    /**
     * Builds the file-name prefix for the item at {@code index}.
     *
     * @param index 1-based position of the item
     * @return a zero-padded prefix such as {@code "007_"}, or an empty string
     *         when ordering is disabled by {@link #keepSortOrder()} or by the
     *         {@code download.save_order} setting
     */
    public String getPrefix(int index) {
        if (!keepSortOrder() || !Utils.getConfigBoolean("download.save_order", true)) {
            return "";
        }
        return String.format("%03d_", index);
    }
}

View File

@ -24,7 +24,7 @@ public abstract class AbstractSinglePageRipper extends AlbumRipper {
public abstract void downloadURL(URL url, int index);
public boolean keepSortOrder() {
return false;
return true;
}
@Override

View File

@ -19,12 +19,12 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.rarchives.ripme.ripper.AbstractMultiPageRipper;
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Base64;
import com.rarchives.ripme.utils.Http;
import com.rarchives.ripme.utils.Utils;
public class DeviantartRipper extends AbstractMultiPageRipper {
public class DeviantartRipper extends AbstractHTMLRipper {
private static final int SLEEP_TIME = 2000;

View File

@ -12,10 +12,10 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.rarchives.ripme.ripper.AbstractMultiPageRipper;
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Http;
public class DrawcrowdRipper extends AbstractMultiPageRipper {
public class DrawcrowdRipper extends AbstractHTMLRipper {
public DrawcrowdRipper(URL url) throws IOException {
super(url);

View File

@ -4,7 +4,9 @@ import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -13,22 +15,26 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.ripper.DownloadThreadPool;
import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
import com.rarchives.ripme.utils.Http;
import com.rarchives.ripme.utils.Utils;
public class EHentaiRipper extends AlbumRipper {
public class EHentaiRipper extends AbstractHTMLRipper {
// All sleep times are in milliseconds
private static final int PAGE_SLEEP_TIME = 3 * 1000;
private static final int IMAGE_SLEEP_TIME = 1 * 1000;
private static final int IP_BLOCK_SLEEP_TIME = 60 * 1000;
private static final int PAGE_SLEEP_TIME = 3000;
private static final int IMAGE_SLEEP_TIME = 1500;
private static final int IP_BLOCK_SLEEP_TIME = 60 * 1000;
private static final String DOMAIN = "g.e-hentai.org", HOST = "e-hentai";
private String lastURL = null;
// Thread pool for finding direct image links from "image" pages (html)
private DownloadThreadPool ehentaiThreadPool = new DownloadThreadPool("ehentai");
@Override
public DownloadThreadPool getThreadPool() {
return ehentaiThreadPool;
}
// Current HTML document
private Document albumDoc = null;
@ -45,25 +51,22 @@ public class EHentaiRipper extends AlbumRipper {
@Override
public String getHost() {
return HOST;
return "e-hentai";
}
public URL sanitizeURL(URL url) throws MalformedURLException {
return url;
@Override
public String getDomain() {
return "g.e-hentai.org";
}
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
if (albumDoc == null) {
sendUpdate(STATUS.LOADING_RESOURCE, url.toString());
logger.info("Retrieving " + url);
albumDoc = Http.url(url)
.cookies(cookies)
.get();
albumDoc = getPageWithRetries(url);
}
Elements elems = albumDoc.select("#gn");
return HOST + "_" + elems.get(0).text();
return getHost() + "_" + elems.first().text();
} catch (Exception e) {
// Fall back to default album naming convention
logger.warn("Failed to get album title from " + url, e);
@ -88,94 +91,96 @@ public class EHentaiRipper extends AlbumRipper {
+ " Got: " + url);
}
@Override
public void rip() throws IOException {
int index = 0, retries = 3;
String nextUrl = this.url.toExternalForm();
/**
* Attempts to get page, checks for IP ban, waits.
* @param url
* @return Page document
* @throws IOException If page loading errors, or if retries are exhausted
*/
private Document getPageWithRetries(URL url) throws IOException {
Document doc;
int retries = 3;
while (true) {
if (isStopped()) {
break;
}
if (albumDoc == null) {
logger.info(" Retrieving album page " + nextUrl);
sendUpdate(STATUS.LOADING_RESOURCE, nextUrl);
albumDoc = Http.url(nextUrl)
.referrer(this.url)
.cookies(cookies)
.get();
}
// Check for rate limiting
if (albumDoc.toString().contains("IP address will be automatically banned")) {
sendUpdate(STATUS.LOADING_RESOURCE, url.toExternalForm());
logger.info("Retrieving " + url);
doc = Http.url(url)
.referrer(this.url)
.cookies(cookies)
.get();
if (doc.toString().contains("IP address will be automatically banned")) {
if (retries == 0) {
logger.error("Hit rate limit and maximum number of retries, giving up");
break;
throw new IOException("Hit rate limit and maximum number of retries, giving up");
}
logger.warn("Hit rate limit while loading " + nextUrl + ", sleeping for " + IP_BLOCK_SLEEP_TIME + "ms, " + retries + " retries remaining");
logger.warn("Hit rate limit while loading " + url + ", sleeping for " + IP_BLOCK_SLEEP_TIME + "ms, " + retries + " retries remaining");
retries--;
try {
Thread.sleep(IP_BLOCK_SLEEP_TIME);
} catch (InterruptedException e) {
logger.error("Interrupted while waiting for rate limit to subside", e);
break;
}
albumDoc = null;
continue;
}
// Find thumbnails
Elements thumbs = albumDoc.select("#gdt > .gdtm a");
if (thumbs.size() == 0) {
logger.info("albumDoc: " + albumDoc);
logger.info("No images found at " + nextUrl);
break;
}
// Iterate over images on page
for (Element thumb : thumbs) {
if (isStopped()) {
break;
}
index++;
EHentaiImageThread t = new EHentaiImageThread(new URL(thumb.attr("href")), index, this.workingDir);
ehentaiThreadPool.addThread(t);
try {
Thread.sleep(IMAGE_SLEEP_TIME);
} catch (InterruptedException e) {
logger.warn("Interrupted while waiting to load next image", e);
throw new IOException("Interrupted while waiting for rate limit to subside");
}
}
if (isStopped()) {
break;
}
// Find next page
Elements hrefs = albumDoc.select(".ptt a");
if (hrefs.size() == 0) {
logger.info("No navigation links found at " + nextUrl);
break;
}
// Ensure next page is different from the current page
String lastUrl = nextUrl;
nextUrl = hrefs.last().attr("href");
if (lastUrl.equals(nextUrl)) {
break; // We're on the last page
}
// Reset albumDoc so we fetch the page next time
albumDoc = null;
// Sleep before loading next page
try {
Thread.sleep(PAGE_SLEEP_TIME);
} catch (InterruptedException e) {
logger.error("Interrupted while waiting to load next page", e);
break;
else {
return doc;
}
}
waitForThreads();
}
public boolean canRip(URL url) {
return url.getHost().endsWith(DOMAIN);
/**
 * Returns the album's first page, loading it only if it is not cached yet,
 * and records the album URL as the most recently visited page.
 *
 * @return the (possibly cached) first page of the album
 * @throws IOException if the page has to be loaded and loading fails
 */
@Override
public Document getFirstPage() throws IOException {
    albumDoc = (albumDoc != null) ? albumDoc : getPageWithRetries(this.url);
    // Remembered so getNextPage can detect a "next" link that points back here.
    this.lastURL = this.url.toExternalForm();
    return albumDoc;
}
/**
 * Follows the last ".ptt a" navigation link of {@code doc} to the next page.
 *
 * @param doc the page that was just processed
 * @return the next page of the album
 * @throws IOException if the rip was stopped, the page has no navigation
 *                     links, the link points at the page visited last, or
 *                     the next page cannot be loaded
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    if (isStopped()) {
        throw new IOException("Ripping interrupted");
    }

    Elements navLinks = doc.select(".ptt a");
    if (navLinks.isEmpty()) {
        logger.info("doc: " + doc.html());
        throw new IOException("No navigation links found");
    }

    // A link that leads to the page we loaded last means there is nothing further.
    String target = navLinks.last().attr("href");
    if (target.equals(this.lastURL)) {
        logger.info("lastURL = nextURL : " + target);
        throw new IOException("Reached last page of results");
    }

    // Pause between page loads, then fetch.
    sleep(PAGE_SLEEP_TIME);
    Document loaded = getPageWithRetries(new URL(target));
    this.lastURL = target;
    return loaded;
}
/**
 * Collects the "href" of every thumbnail link ("#gdt > .gdtm a") on the page.
 *
 * @param page an album page
 * @return the link targets in document order; empty if none are found
 */
@Override
public List<String> getURLsFromPage(Document page) {
    List<String> linkTargets = new ArrayList<String>();
    for (Element thumbLink : page.select("#gdt > .gdtm a")) {
        String target = thumbLink.attr("href");
        linkTargets.add(target);
    }
    return linkTargets;
}
/**
 * Hands the URL to the e-hentai thread pool wrapped in an
 * {@code EHentaiImageThread}, then pauses for IMAGE_SLEEP_TIME milliseconds
 * before returning.
 *
 * @param url   the URL to hand to the image thread
 * @param index position of the image within the album
 */
@Override
public void downloadURL(URL url, int index) {
    ehentaiThreadPool.addThread(new EHentaiImageThread(url, index, this.workingDir));
    try {
        Thread.sleep(IMAGE_SLEEP_TIME);
    } catch (InterruptedException e) {
        // An interrupted pause is only logged; the image stays queued.
        logger.warn("Interrupted while waiting to load next image", e);
    }
}
/**
@ -187,7 +192,6 @@ public class EHentaiRipper extends AlbumRipper {
private URL url;
private int index;
private File workingDir;
private int retries = 3;
public EHentaiImageThread(URL url, int index, File workingDir) {
super();
@ -203,27 +207,7 @@ public class EHentaiRipper extends AlbumRipper {
private void fetchImage() {
try {
Document doc = Http.url(this.url)
.referrer(this.url)
.cookies(cookies)
.get();
// Check for rate limit
if (doc.toString().contains("IP address will be automatically banned")) {
if (this.retries == 0) {
logger.error("Rate limited & ran out of retries, skipping image at " + this.url);
return;
}
logger.warn("Hit rate limit. Sleeping for " + IP_BLOCK_SLEEP_TIME + "ms");
try {
Thread.sleep(IP_BLOCK_SLEEP_TIME);
} catch (InterruptedException e) {
logger.error("Interrupted while waiting for rate limit to subside", e);
return;
}
this.retries--;
fetchImage(); // Re-attempt to download the image
return;
}
Document doc = getPageWithRetries(this.url);
// Find image
Elements images = doc.select(".sni > a > img");

View File

@ -1,168 +1,41 @@
package com.rarchives.ripme.ripper.rippers;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.ripper.AbstractSinglePageRipper;
import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
import com.rarchives.ripme.utils.Http;
import com.rarchives.ripme.utils.Utils;
public class EightmusesRipper extends AlbumRipper {
private static final String DOMAIN = "8muses.com",
HOST = "8muses";
public class EightmusesRipper extends AbstractSinglePageRipper {
private Document albumDoc = null;
private Map<String,String> cookies = new HashMap<String,String>();
public EightmusesRipper(URL url) throws IOException {
super(url);
}
@Override
public boolean canRip(URL url) {
return url.getHost().endsWith(DOMAIN);
}
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
return url;
}
@Override
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
if (albumDoc == null) {
albumDoc = Http.url(url).get();
}
Element titleElement = albumDoc.select("meta[name=description]").first();
String title = titleElement.attr("content");
title = title.substring(title.lastIndexOf('/') + 1);
return HOST + "_" + title.trim();
} catch (IOException e) {
// Fall back to default album naming convention
logger.info("Unable to find title at " + url);
}
return super.getAlbumTitle(url);
}
@Override
public void rip() throws IOException {
ripAlbum(this.url.toExternalForm(), this.workingDir);
waitForThreads();
}
private void ripAlbum(String url, File subdir) throws IOException {
logger.info(" Retrieving " + url);
sendUpdate(STATUS.LOADING_RESOURCE, url);
if (albumDoc == null) {
albumDoc = Http.url(url).get();
}
int index = 0; // Both album index and image index
if (albumDoc.select(".preview > span").size() > 0) {
// Page contains subalbums (not images)
for (Element subalbum : albumDoc.select("a.preview")) {
ripSubalbumFromPreview(subalbum, subdir, ++index);
}
}
else {
// Page contains images
for (Element thumb : albumDoc.select("img")) {
downloadImage(thumb, subdir, ++index);
}
}
}
/**
* @param subalbum Anchor element of a subalbum
* @throws IOException
*/
private void ripSubalbumFromPreview(Element subalbum, File subdir, int index) throws IOException {
// Find + sanitize URL from Element
String subUrl = subalbum.attr("href");
subUrl = subUrl.replaceAll("\\.\\./", "");
if (subUrl.startsWith("//")) {
subUrl = "http:";
}
else if (!subUrl.startsWith("http://")) {
subUrl = "http://www.8muses.com/" + subUrl;
}
// Prepend image index if enabled
// Get album title
String subTitle = subalbum.attr("alt");
if (subTitle.equals("")) {
subTitle = getGID(new URL(subUrl));
}
subTitle = Utils.filesystemSafe(subTitle);
// Create path to subdirectory
File subDir = new File(subdir.getAbsolutePath() + File.separator + subTitle);
if (!subDir.exists()) {
subDir.mkdirs();
}
albumDoc = null;
ripAlbum(subUrl, subDir);
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
logger.warn("Interrupted whiel waiting to load next album");
}
}
private void downloadImage(Element thumb, File subdir, int index) {
// Find thumbnail image source
String image = null;
if (thumb.hasAttr("data-cfsrc")) {
image = thumb.attr("data-cfsrc");
}
else if (thumb.hasAttr("src")) {
image = thumb.attr("src");
}
else {
logger.warn("Thumb does not havedata-cfsrc or src: " + thumb);
return;
}
// Remove relative directory path naming
image = image.replaceAll("\\.\\./", "");
if (image.startsWith("//")) {
image = "http:" + image;
}
// Convert from thumb URL to full-size
if (image.contains("-cu_")) {
image = image.replaceAll("-cu_[^.]+", "-me");
}
// Set download path
try {
URL imageURL = new URL(image);
String saveAs = subdir.getAbsolutePath() + File.separator;
if (Utils.getConfigBoolean("download.save_order", true)) {
// Append image index
saveAs += String.format("%03d_", index);
}
// Append image title
saveAs += Utils.filesystemSafe(thumb.attr("title"));
// Append extension
saveAs += image.substring(image.lastIndexOf('.'));
File saveFile = new File(saveAs);
// Download
addURLToDownload(imageURL, saveFile, thumb.baseUri(), null);
} catch (IOException e) {
logger.error("Failed to download image at " + image, e);
sendUpdate(STATUS.DOWNLOAD_ERRORED, "Failed to download image at " + image);
}
}
@Override
public String getHost() {
return HOST;
return "8muses";
}
@Override
public String getDomain() {
return "8muses.com";
}
@Override
@ -175,4 +48,100 @@ public class EightmusesRipper extends AlbumRipper {
return m.group(m.groupCount());
}
/**
 * Derives the album title from the first page's
 * {@code <meta name="description">} tag: the text after the last '/' in its
 * "content" attribute, prefixed with the host name.
 *
 * Falls back to the superclass naming when the page cannot be loaded or
 * has no description tag.
 *
 * @param url the album URL (used for logging and for the fallback)
 * @return the album title
 * @throws MalformedURLException if the fallback naming rejects {@code url}
 */
@Override
public String getAlbumTitle(URL url) throws MalformedURLException {
    try {
        // Attempt to use album title as GID
        Element titleElement = getFirstPage().select("meta[name=description]").first();
        // first() yields null when nothing matches; without this check the
        // resulting NullPointerException would bypass the fallback below.
        if (titleElement != null) {
            String title = titleElement.attr("content");
            title = title.substring(title.lastIndexOf('/') + 1);
            return getHost() + "_" + title.trim();
        }
        logger.info("Unable to find title at " + url);
    } catch (IOException e) {
        // Fall back to default album naming convention
        logger.info("Unable to find title at " + url);
    }
    return super.getAlbumTitle(url);
}
/**
 * Returns the album page, fetching it on first use and caching it.
 * Cookies from the initial response are merged into this ripper's cookies.
 *
 * @return the (possibly cached) album page
 * @throws IOException if the page has to be fetched and fetching fails
 */
@Override
public Document getFirstPage() throws IOException {
    if (albumDoc != null) {
        return albumDoc;
    }
    Response firstResponse = Http.url(url).response();
    cookies.putAll(firstResponse.cookies());
    albumDoc = firstResponse.parse();
    return albumDoc;
}
/**
 * Collects full-size image URLs from an album page.
 *
 * If the page lists sub-albums (".preview > span" is present), every
 * "a.preview" link is loaded, in reverse document order, and this method
 * is applied recursively to the loaded page. Otherwise every {@code img}
 * on the page is treated as a thumbnail and converted to its full-size URL.
 *
 * Sub-albums that fail to load are logged and skipped.
 *
 * @param page an album or sub-album listing page
 * @return the image URLs found; empty if there are none
 */
@Override
public List<String> getURLsFromPage(Document page) {
    List<String> imageURLs = new ArrayList<String>();
    if (page.select(".preview > span").size() > 0) {
        // Page contains subalbums (not images)
        Elements albumElements = page.select("a.preview");
        List<Element> albumsList = albumElements.subList(0, albumElements.size());
        Collections.reverse(albumsList);
        // Iterate over elements in reverse order
        for (Element subalbum : albumsList) {
            String subUrl = subalbum.attr("href");
            subUrl = subUrl.replaceAll("\\.\\./", "");
            if (subUrl.startsWith("//")) {
                // Protocol-relative link: add the scheme and keep the rest.
                // (The link used to be replaced by the bare string "http:".)
                subUrl = "http:" + subUrl;
            }
            else if (!subUrl.startsWith("http://") && !subUrl.startsWith("https://")) {
                // Site-relative link; absolute http(s) links are left alone.
                subUrl = "http://www.8muses.com/" + subUrl;
            }
            try {
                logger.info("Retrieving " + subUrl);
                sendUpdate(STATUS.LOADING_RESOURCE, subUrl);
                Document subPage = Http.url(subUrl).get();
                // Get all images in subalbum, add to list.
                List<String> subalbumImages = getURLsFromPage(subPage);
                logger.info("Found " + subalbumImages.size() + " images in subalbum");
                imageURLs.addAll(subalbumImages);
            } catch (IOException e) {
                // One broken sub-album must not abort the rest.
                logger.warn("Error while loading subalbum " + subUrl, e);
            }
        }
    }
    else {
        // Page contains images
        for (Element thumb : page.select("img")) {
            // Find thumbnail image source
            String image = null;
            if (thumb.hasAttr("data-cfsrc")) {
                image = thumb.attr("data-cfsrc");
            }
            else if (thumb.hasAttr("src")) {
                image = thumb.attr("src");
            }
            else {
                logger.warn("Thumb does not have data-cfsrc or src: " + thumb);
                continue;
            }
            // Remove relative directory path naming
            image = image.replaceAll("\\.\\./", "");
            if (image.startsWith("//")) {
                image = "http:" + image;
            }
            // Convert from thumb URL to full-size
            if (image.contains("-cu_")) {
                image = image.replaceAll("-cu_[^.]+", "-me");
            }
            imageURLs.add(image);
        }
    }
    return imageURLs;
}
/**
 * Queues the image for download with an index prefix, using the album URL
 * as referrer and this ripper's cookies.
 *
 * @param url   the image to download
 * @param index position of the image, used for the file-name prefix
 */
@Override
public void downloadURL(URL url, int index) {
    String prefix = getPrefix(index);
    String referrer = this.url.toExternalForm();
    addURLToDownload(url, prefix, "", referrer, cookies);
}
/**
 * Always returns a zero-padded, three-digit index prefix such as "007_".
 *
 * @param index position of the image
 * @return the prefix for the image's file name
 */
@Override
public String getPrefix(int index) {
    String paddedIndex = String.format("%03d_", index);
    return paddedIndex;
}
}

View File

@ -3,82 +3,33 @@ package com.rarchives.ripme.ripper.rippers;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Http;
import com.rarchives.ripme.utils.Utils;
public class FapprovedRipper extends AlbumRipper {
public class FapprovedRipper extends AbstractHTMLRipper {
private static final String DOMAIN = "fapproved.com",
HOST = "fapproved";
private int pageIndex = 1;
private String username = null;
public FapprovedRipper(URL url) throws IOException {
super(url);
}
@Override
public boolean canRip(URL url) {
return (url.getHost().endsWith(DOMAIN));
}
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
Pattern p = Pattern.compile("^https?://fapproved\\.com/users/([a-zA-Z0-9\\-_]{1,}).*$");
Matcher m = p.matcher(url.toExternalForm());
if (m.matches()) {
return new URL("http://fapproved.com/users/" + m.group(1));
}
throw new MalformedURLException("Expected username in URL (fapproved.com/users/username and not " + url);
}
@Override
public void rip() throws IOException {
int index = 0, page = 0;
String url, user = getGID(this.url);
boolean hasNextPage = true;
while (hasNextPage) {
page++;
url = "http://fapproved.com/users/" + user + "/images?page=" + page;
this.sendUpdate(STATUS.LOADING_RESOURCE, url);
logger.info(" Retrieving " + url);
Document doc = Http.url(url)
.ignoreContentType()
.get();
for (Element image : doc.select("div.actual-image img")) {
String imageUrl = image.attr("src");
if (imageUrl.startsWith("//")) {
imageUrl = "http:" + imageUrl;
}
index++;
String prefix = "";
if (Utils.getConfigBoolean("download.save_order", true)) {
prefix = String.format("%03d_", index);
}
addURLToDownload(new URL(imageUrl), prefix);
}
if ( (doc.select("div.pagination li.next.disabled").size() != 0)
|| (doc.select("div.pagination").size() == 0) ) {
break;
}
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
logger.error("[!] Interrupted while waiting to load next album:", e);
break;
}
}
waitForThreads();
}
@Override
public String getHost() {
return HOST;
return "fapproved";
}
@Override
public String getDomain() {
return "fapproved.com";
}
@Override
@ -86,9 +37,62 @@ public class FapprovedRipper extends AlbumRipper {
Pattern p = Pattern.compile("^https?://[w.]*fapproved.com/users/([a-zA-Z0-9\\-_]{3,}).*$");
Matcher m = p.matcher(url.toExternalForm());
if (m.matches()) {
return m.group(1);
username = m.group(1);
return username;
}
throw new MalformedURLException("Fapproved user not found in " + url + ", expected http://fapproved.com/users/username/images");
}
/**
 * Normalizes any accepted URL to the canonical user page,
 * {@code http://fapproved.com/users/<username>}.
 *
 * @param url the URL supplied by the user
 * @return the canonical user URL
 * @throws MalformedURLException if no username can be extracted from {@code url}
 */
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
    String user = getGID(url);
    return new URL("http://fapproved.com/users/" + user);
}
/**
 * Resets paging to page 1 and loads that page of the user's images.
 *
 * @return the first page of results
 * @throws IOException if the page cannot be loaded
 */
@Override
public Document getFirstPage() throws IOException {
    pageIndex = 1;
    Document firstPage = Http.url(getPageURL(pageIndex))
                             .ignoreContentType()
                             .get();
    return firstPage;
}
/**
 * Loads the page after {@code doc}.
 *
 * @param doc the page that was just processed
 * @return the next page of results
 * @throws IOException if {@code doc} has a disabled "next" button or no
 *                     pagination at all, or if the next page cannot be loaded
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    // Either condition means doc was the final page.
    if (!doc.select("div.pagination li.next.disabled").isEmpty()
            || doc.select("div.pagination").isEmpty()) {
        throw new IOException("No more pages found");
    }
    sleep(1000);
    String nextPageURL = getPageURL(++pageIndex);
    Document nextPage = Http.url(nextPageURL)
                            .ignoreContentType()
                            .get();
    return nextPage;
}
/**
 * Builds the URL of one page of the user's images.
 *
 * @param index 1-based page number to request
 * @return the URL of page {@code index}
 * @throws IOException if the username has not been determined yet and
 *                     cannot be extracted from this ripper's URL
 */
private String getPageURL(int index) throws IOException {
    if (username == null) {
        username = getGID(this.url);
    }
    // Use the requested page number. This previously read the pageIndex field
    // and ignored the parameter, which only worked because every caller
    // happened to pass pageIndex.
    return "http://fapproved.com/users/" + username + "/images?page=" + index;
}
/**
 * Collects the "src" of every image under "div.actual-image", turning
 * protocol-relative sources ("//host/...") into http URLs.
 *
 * @param page a page of the user's images
 * @return the image URLs in document order; empty if none are found
 */
@Override
public List<String> getURLsFromPage(Document page) {
    List<String> sources = new ArrayList<String>();
    for (Element img : page.select("div.actual-image img")) {
        String src = img.attr("src");
        sources.add(src.startsWith("//") ? "http:" + src : src);
    }
    return sources;
}
/**
 * Queues the image for download with the prefix computed for its index.
 *
 * @param url   the image to download
 * @param index position of the image, used for the file-name prefix
 */
@Override
public void downloadURL(URL url, int index) {
    String prefix = getPrefix(index);
    addURLToDownload(url, prefix);
}
}

View File

@ -3,8 +3,10 @@ package com.rarchives.ripme.ripper.rippers;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
@ -17,19 +19,22 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.ripper.DownloadThreadPool;
import com.rarchives.ripme.utils.Base64;
import com.rarchives.ripme.utils.Http;
import com.rarchives.ripme.utils.Utils;
public class FlickrRipper extends AlbumRipper {
public class FlickrRipper extends AbstractHTMLRipper {
private static final String DOMAIN = "flickr.com",
HOST = "flickr";
private DownloadThreadPool flickrThreadPool;
private int page = 1;
private Set<String> attempted = new HashSet<String>();
private Document albumDoc = null;
private DownloadThreadPool flickrThreadPool;
@Override
public DownloadThreadPool getThreadPool() {
return flickrThreadPool;
}
public FlickrRipper(URL url) throws IOException {
super(url);
@ -38,7 +43,11 @@ public class FlickrRipper extends AlbumRipper {
@Override
public String getHost() {
return HOST;
return "flickr";
}
@Override
public String getDomain() {
return "flickr.com";
}
public URL sanitizeURL(URL url) throws MalformedURLException {
@ -61,15 +70,13 @@ public class FlickrRipper extends AlbumRipper {
}
try {
// Attempt to use album title as GID
if (albumDoc == null) {
albumDoc = Http.url(url).get();
}
Document doc = getFirstPage();
String user = url.toExternalForm();
user = user.substring(user.indexOf("/photos/") + "/photos/".length());
user = user.substring(0, user.indexOf("/"));
String title = albumDoc.select("meta[name=description]").get(0).attr("content");
String title = doc.select("meta[name=description]").get(0).attr("content");
if (!title.equals("")) {
return HOST + "_" + user + "_" + title;
return getHost() + "_" + user + "_" + title;
}
} catch (Exception e) {
// Fall back to default album naming convention
@ -114,79 +121,79 @@ public class FlickrRipper extends AlbumRipper {
}
@Override
public void rip() throws IOException {
//Map<String,String> cookies = signinToFlickr();
Set<String> attempted = new HashSet<String>();
int index = 0, page = 1;
String nextURL = this.url.toExternalForm();
while (true) {
if (isStopped()) {
break;
}
logger.info(" Retrieving " + nextURL);
if (albumDoc == null) {
albumDoc = Http.url(nextURL).get();
}
for (Element thumb : albumDoc.select("a[data-track=photo-click]")) {
String imageTitle = null;
if (thumb.hasAttr("title")) {
imageTitle = thumb.attr("title");
}
String imagePage = thumb.attr("href");
if (imagePage.startsWith("/")) {
imagePage = "http://www.flickr.com" + imagePage;
}
if (imagePage.contains("/in/")) {
imagePage = imagePage.substring(0, imagePage.indexOf("/in/") + 1);
}
if (!imagePage.endsWith("/")) {
imagePage += "/";
}
imagePage += "sizes/o/";
// Check for duplicates
if (attempted.contains(imagePage)) {
continue;
}
attempted.add(imagePage);
index += 1;
// Add image page to threadpool to grab the image & download it
FlickrImageThread mit = new FlickrImageThread(new URL(imagePage), imageTitle, index);
flickrThreadPool.addThread(mit);
}
// Find how many pages there are
int lastPage = 0;
for (Element apage : albumDoc.select("a[data-track^=page-]")) {
String lastPageStr = apage.attr("data-track").replace("page-", "");
lastPage = Integer.parseInt(lastPageStr);
}
// If we're at the last page, stop.
if (page >= lastPage) {
break;
}
// Load the next page
page++;
albumDoc = null;
nextURL = this.url.toExternalForm();
if (!nextURL.endsWith("/")) {
nextURL += "/";
}
nextURL += "page" + page + "/";
// Wait a bit
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
logger.error("Interrupted while waiting to load next page " + nextURL, e);
break;
}
public Document getFirstPage() throws IOException {
if (albumDoc == null) {
albumDoc = Http.url(url).get();
}
flickrThreadPool.waitForThreads();
waitForThreads();
return albumDoc;
}
public boolean canRip(URL url) {
return url.getHost().endsWith(DOMAIN);
/**
 * Loads the next page of the photo stream.
 *
 * The number of pages is read from the page links
 * ("a[data-track^=page-]") of {@code doc}; the last numeric link wins.
 *
 * @param doc the page that was just processed
 * @return the next page
 * @throws IOException if the current page is the last one, the wait before
 *                     loading is interrupted, or the page cannot be loaded
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    // Find how many pages there are
    int lastPage = 0;
    for (Element apage : doc.select("a[data-track^=page-]")) {
        String lastPageStr = apage.attr("data-track").replace("page-", "");
        try {
            lastPage = Integer.parseInt(lastPageStr);
        } catch (NumberFormatException ignored) {
            // The selector matches any "page-*" value; a non-numeric suffix
            // is not a page number, so skip it rather than let an unchecked
            // exception escape the IOException handling of the caller.
        }
    }
    // If we're at the last page, stop.
    if (page >= lastPage) {
        throw new IOException("No more pages");
    }
    // Load the next page
    page++;
    // NOTE(review): this drops the cached first page, so a later
    // getFirstPage() call will fetch it again - confirm that is intended.
    albumDoc = null;
    String nextURL = this.url.toExternalForm();
    if (!nextURL.endsWith("/")) {
        nextURL += "/";
    }
    nextURL += "page" + page + "/";
    // Wait a bit
    try {
        Thread.sleep(1000);
    } catch (InterruptedException e) {
        // Keep the original exception as the cause for diagnostics.
        throw new IOException("Interrupted while waiting to load next page " + nextURL, e);
    }
    return Http.url(nextURL).get();
}
/**
 * Builds the ".../sizes/o/" page URL for every photo link
 * ("a[data-track=photo-click]") on the page, skipping URLs that were
 * already produced earlier in this rip.
 *
 * @param page a page of the photo stream
 * @return the new sizes-page URLs in document order; empty if there are none
 */
@Override
public List<String> getURLsFromPage(Document page) {
    List<String> sizePages = new ArrayList<String>();
    for (Element photoLink : page.select("a[data-track=photo-click]")) {
        // TODO find a way to persist the image title
        // (available as the link's "title" attribute when present).
        String target = photoLink.attr("href");
        if (target.startsWith("/")) {
            target = "http://www.flickr.com" + target;
        }
        // Cut everything after the slash that begins "/in/".
        int inPos = target.indexOf("/in/");
        if (inPos >= 0) {
            target = target.substring(0, inPos + 1);
        }
        if (!target.endsWith("/")) {
            target += "/";
        }
        target += "sizes/o/";
        // add() reports false for a URL that was attempted before.
        if (attempted.add(target)) {
            sizePages.add(target);
        }
    }
    return sizePages;
}
/**
 * Queues a {@code FlickrImageThread} for the URL on the Flickr thread pool;
 * that thread locates the image and downloads it.
 *
 * @param url   the URL to hand to the image thread
 * @param index position of the image within the rip
 */
@Override
public void downloadURL(URL url, int index) {
    flickrThreadPool.addThread(new FlickrImageThread(url, index));
}
/**
@ -224,13 +231,11 @@ public class FlickrRipper extends AlbumRipper {
*/
private class FlickrImageThread extends Thread {
private URL url;
private String title;
private int index;
public FlickrImageThread(URL url, String title, int index) {
public FlickrImageThread(URL url, int index) {
super();
this.url = url;
this.title = title;
this.index = index;
}
@ -248,9 +253,8 @@ public class FlickrRipper extends AlbumRipper {
if (Utils.getConfigBoolean("download.save_order", true)) {
prefix = String.format("%03d_", index);
}
prefix += Utils.filesystemSafe(title);
synchronized (flickrThreadPool) {
addURLToDownload(new URL(fullsizeImages.get(0).attr("src")), prefix);
addURLToDownload(new URL(fullsizeImages.first().attr("src")), prefix);
}
}
} catch (IOException e) {

View File

@ -3,7 +3,9 @@ package com.rarchives.ripme.ripper.rippers;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -12,99 +14,27 @@ import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
import com.rarchives.ripme.utils.Http;
public class GifyoRipper extends AlbumRipper {
public class GifyoRipper extends AbstractHTMLRipper {
private static final String DOMAIN = "gifyo.com",
HOST = "gifyo";
private int page = 0;
private Map<String,String> cookies = new HashMap<String,String>();
/**
 * Creates a Gifyo ripper for the given URL; all initialization is
 * delegated to the superclass constructor.
 *
 * @param url URL to rip
 * @throws IOException propagated from the superclass constructor
 */
public GifyoRipper(URL url) throws IOException {
super(url);
}
/**
 * Reports whether the URL's host ends with the Gifyo domain.
 *
 * @param url URL to test
 * @return {@code true} if the host name ends with {@code DOMAIN}
 */
@Override
public boolean canRip(URL url) {
    final String host = url.getHost();
    return host.endsWith(DOMAIN);
}
/**
 * Normalizes a Gifyo profile URL to the canonical form
 * {@code http://gifyo.com/<username>/}.
 *
 * @param url URL to normalize; must be {@code http(s)://gifyo.com/<username>}
 *            with an optional trailing slash
 * @return the canonical profile URL
 * @throws MalformedURLException if the URL does not match the expected profile format
 */
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
    Pattern p = Pattern.compile("^https?://gifyo\\.com/([a-zA-Z0-9\\-_]+)/?$");
    Matcher m = p.matcher(url.toExternalForm());
    if (!m.matches()) {
        // The closing parenthesis was missing from this message.
        throw new MalformedURLException("Expected username in URL (gifyo.com/username/) and not " + url);
    }
    return new URL("http://gifyo.com/" + m.group(1) + "/");
}
/**
 * Walks the profile page by page and queues every image found.
 * Page 0 is fetched with a plain request; every later page is requested with
 * a POST that carries the page number and the cookies collected so far.
 * The loop stops at the first page without images or when the wait between
 * pages is interrupted, then waits for the queued downloads.
 *
 * @throws IOException if a page without images reports a private profile;
 *                     the HTTP and parse calls may also throw it
 */
@Override
public void rip() throws IOException {
int page = 0;
// Cookies from the first response are sent with every follow-up request
Map<String,String> cookies = new HashMap<String,String>();
while (true) {
this.sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm() + " (page #" + page + ")");
logger.info(" Retrieving " + this.url + "(page #" + page + ")");
Response resp = null;
if (page == 0) {
// First page: plain request, remember the cookies it sets
resp = Http.url(this.url)
.ignoreContentType()
.response();
cookies = resp.cookies();
}
else {
// Later pages: POST the page number together with the stored cookies
Map<String,String> postData = new HashMap<String,String>();
postData.put("cmd", "refreshData");
postData.put("view", "gif");
postData.put("layout", "grid");
postData.put("page", Integer.toString(page));
resp = Http.url(this.url)
.ignoreContentType()
.data(postData)
.cookies(cookies)
.method(Method.POST)
.response();
// Merge any cookies set by this response into the stored set
cookies.putAll(resp.cookies());
}
Document doc = resp.parse();
Elements images = doc.select("div.gif img");
logger.info("Found " + images.size() + " images");
for (Element image : images) {
String imageUrl = image.attr("src");
// Protocol-relative URLs get an explicit http: scheme
if (imageUrl.startsWith("//")) {
imageUrl = "http:" + imageUrl;
}
// Rewrite the "medium"/"_s" thumbnail URL to the "large" variant
imageUrl = imageUrl.replace("/medium/", "/large/");
imageUrl = imageUrl.replace("_s.gif", ".gif");
addURLToDownload(new URL(imageUrl));
}
if (images.size() == 0) {
// An empty page either means a private profile (error) or the end of the listing
if (doc.html().contains("profile is private")) {
sendUpdate(STATUS.RIP_ERRORED, "User has private profile");
throw new IOException("User has private profile");
}
else {
logger.info("Page " + page + " has 0 images");
}
break;
}
// Wait 3 seconds before requesting the next page
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
logger.error("[!] Interrupted while waiting to load next album:", e);
break;
}
page++;
}
waitForThreads();
}
/**
 * Returns the host identifier of this ripper.
 *
 * @return the literal {@code "gifyo"}
 */
@Override
public String getHost() {
    // A single return: a second return statement after the first is
    // unreachable and does not compile.
    return "gifyo";
}
/**
 * Returns the Gifyo domain name.
 *
 * @return the literal {@code "gifyo.com"}
 */
@Override
public String getDomain() {
return "gifyo.com";
}
@Override
@ -117,4 +47,67 @@ public class GifyoRipper extends AlbumRipper {
throw new MalformedURLException("Gifyo user not found in " + url + ", expected http://gifyo.com/username");
}
/**
 * Rebuilds the URL in canonical form, {@code http://gifyo.com/<gid>/},
 * using the identifier that {@code getGID} extracts from it.
 *
 * @param url URL to normalize
 * @return the canonical profile URL
 * @throws MalformedURLException if {@code getGID} rejects the URL or the
 *                               rebuilt URL is malformed
 */
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
    final String gid = getGID(url);
    final String canonical = "http://gifyo.com/" + gid + "/";
    return new URL(canonical);
}
/**
 * Fetches the profile page, stores the cookies of that response for the
 * follow-up page requests, and returns the parsed document.
 *
 * @return the parsed first page
 * @throws IOException if the page states that the profile is private
 *                     (an error status update is sent first); the HTTP and
 *                     parse calls may also throw it
 */
@Override
public Document getFirstPage() throws IOException {
    Response firstResponse = Http.url(this.url).ignoreContentType().response();
    cookies = firstResponse.cookies();
    Document firstPage = firstResponse.parse();
    boolean profileIsPrivate = firstPage.html().contains("profile is private");
    if (!profileIsPrivate) {
        return firstPage;
    }
    sendUpdate(STATUS.RIP_ERRORED, "User has private profile");
    throw new IOException("User has private profile");
}
/**
 * Requests the next page of the profile. The page counter is advanced
 * first, then the page is requested with a POST that carries the page
 * number and the stored cookies; cookies set by the response are merged
 * back into the stored set.
 *
 * @param doc the current page (not read here)
 * @return the parsed next page
 * @throws IOException with "No more images found" if the page has no
 *                     images; the HTTP and parse calls may also throw it
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    page++;

    Map<String,String> form = new HashMap<String,String>();
    form.put("cmd", "refreshData");
    form.put("view", "gif");
    form.put("layout", "grid");
    form.put("page", String.valueOf(page));

    Response pageResponse = Http.url(this.url)
            .ignoreContentType()
            .data(form)
            .cookies(cookies)
            .method(Method.POST)
            .response();
    cookies.putAll(pageResponse.cookies());

    Document nextDoc = pageResponse.parse();
    int imageCount = nextDoc.select("div.gif img").size();
    if (imageCount == 0) {
        throw new IOException("No more images found");
    }
    // Wait only after a page with images has been fetched
    sleep(2000);
    return nextDoc;
}
/**
 * Collects the image URLs of one page. For each {@code div.gif img} element
 * the {@code src} is given an explicit {@code http:} scheme when it is
 * protocol-relative, and the "medium"/"_s" thumbnail URL is rewritten to
 * the "large" variant.
 *
 * @param doc page to scan
 * @return rewritten image URLs in document order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> found = new ArrayList<String>();
    for (Element img : doc.select("div.gif img")) {
        String src = img.attr("src");
        String absolute = src.startsWith("//") ? "http:" + src : src;
        found.add(absolute.replace("/medium/", "/large/").replace("_s.gif", ".gif"));
    }
    logger.info("Found " + found.size() + " images");
    return found;
}
/**
 * Queues the URL for download. The index is not used: the URL is passed to
 * {@code addURLToDownload} without any prefix.
 *
 * @param url   image URL to download
 * @param index position of the image (ignored)
 */
@Override
public void downloadURL(URL url, int index) {
addURLToDownload(url);
}
}

View File

@ -3,6 +3,8 @@ package com.rarchives.ripme.ripper.rippers;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -10,16 +12,14 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.ripper.AbstractSinglePageRipper;
import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
import com.rarchives.ripme.utils.Http;
public class GirlsOfDesireRipper extends AlbumRipper {
public class GirlsOfDesireRipper extends AbstractSinglePageRipper {
// All sleep times are in milliseconds
private static final int IMAGE_SLEEP_TIME = 100;
private static final String DOMAIN = "girlsofdesire.org", HOST = "GirlsOfDesire";
// Current HTML document
private Document albumDoc = null;
@ -29,23 +29,19 @@ public class GirlsOfDesireRipper extends AlbumRipper {
/**
 * Returns the host identifier of this ripper.
 *
 * @return the literal {@code "GirlsOfDesire"}
 */
@Override
public String getHost() {
    // A single return: a second return statement after the first is
    // unreachable and does not compile.
    return "GirlsOfDesire";
}
public URL sanitizeURL(URL url) throws MalformedURLException {
return url;
@Override
public String getDomain() {
return "girlsofdesire.org";
}
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
if (albumDoc == null) {
logger.info(" Retrieving " + url.toExternalForm());
sendUpdate(STATUS.LOADING_RESOURCE, url.toString());
albumDoc = Http.url(url).get();
}
Elements elems = albumDoc.select(".albumName");
return HOST + "_" + elems.first().text();
Document doc = getFirstPage();
Elements elems = doc.select(".albumName");
return getHost() + "_" + elems.first().text();
} catch (Exception e) {
// Fall back to default album naming convention
logger.warn("Failed to get album title from " + url, e);
@ -70,6 +66,33 @@ public class GirlsOfDesireRipper extends AlbumRipper {
+ " Got: " + url);
}
/**
 * Returns the album page, fetching it on the first call and reusing the
 * cached document on every later call.
 *
 * @return the album document
 * @throws IOException if the page has to be fetched and the fetch fails
 */
@Override
public Document getFirstPage() throws IOException {
    if (albumDoc != null) {
        return albumDoc;
    }
    albumDoc = Http.url(url).get();
    return albumDoc;
}
/**
 * Derives the image URLs from the thumbnails on the page: the
 * {@code _thumb.} marker is removed from each thumbnail {@code src}, and
 * site-relative paths are prefixed with the site address.
 *
 * @param doc page to scan
 * @return image URLs in document order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> result = new ArrayList<String>();
    for (Element thumb : doc.select("td.vtop > a > img")) {
        String fullSize = thumb.attr("src").replaceAll("_thumb\\.", ".");
        result.add(fullSize.startsWith("/")
                ? "http://www.girlsofdesire.org" + fullSize
                : fullSize);
    }
    return result;
}
/**
 * Queues the URL for download, using the prefix that {@code getPrefix}
 * produces for the given index.
 *
 * @param url   image URL to download
 * @param index position of the image
 */
@Override
public void downloadURL(URL url, int index) {
    final String prefix = getPrefix(index);
    addURLToDownload(url, prefix);
}
@Override
public void rip() throws IOException {
String nextUrl = this.url.toExternalForm();
@ -107,8 +130,4 @@ public class GirlsOfDesireRipper extends AlbumRipper {
waitForThreads();
}
/**
 * Reports whether the URL's host ends with {@code DOMAIN}.
 *
 * @param url URL to test
 * @return {@code true} if the host name ends with {@code DOMAIN}
 */
public boolean canRip(URL url) {
    final String host = url.getHost();
    return host.endsWith(DOMAIN);
}
}

View File

@ -3,21 +3,22 @@ package com.rarchives.ripme.ripper.rippers;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.json.JSONArray;
import org.json.JSONObject;
import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.ripper.AbstractJSONRipper;
import com.rarchives.ripme.utils.Http;
import com.rarchives.ripme.utils.Utils;
public class GonewildRipper extends AlbumRipper {
private static final String HOST = "gonewild";
private static final int SLEEP_TIME = 1000;
public class GonewildRipper extends AbstractJSONRipper {
private static final int count = 50;
private int startIndex = 0;
private static String API_DOMAIN;
private String username;
@ -26,6 +27,15 @@ public class GonewildRipper extends AlbumRipper {
API_DOMAIN = Utils.getConfigString("gw.api", "gonewild");
}
/**
 * Returns the host identifier of this ripper.
 *
 * @return the literal {@code "gonewild"}
 */
@Override
public String getHost() {
return "gonewild";
}
/**
 * Returns the domain name of this ripper.
 *
 * @return the literal {@code "gonewild.com"}
 */
@Override
public String getDomain() {
return "gonewild.com";
}
@Override
public boolean canRip(URL url) {
return getUsernameMatcher(url).matches();
@ -36,72 +46,64 @@ public class GonewildRipper extends AlbumRipper {
return p.matcher(url.toExternalForm());
}
/**
 * Returns the URL unchanged; no normalization is applied.
 *
 * @param url URL to sanitize
 * @return the same {@code url} instance
 */
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
return url;
}
/**
 * Pages through the user's posts via the API ({@code count} posts per
 * request) and queues every image path found. The loop stops when the API
 * returns an error, when a page has no posts, or when the wait between
 * requests is interrupted; afterwards it waits for the queued downloads.
 *
 * @throws IOException declared for the HTTP/URL calls in the loop
 */
@Override
public void rip() throws IOException {
int start = 0,
count = 50;
// Everything except the paging offset is fixed for the whole rip
String baseGwURL = "http://" + API_DOMAIN + ".rarchives.com/api.cgi"
+ "?method=get_user"
+ "&user=" + username
+ "&count=" + count;
String gwURL, imagePath;
JSONArray posts, images;
JSONObject json, post, image;
while (true) {
logger.info(" Retrieving posts by " + username);
gwURL = baseGwURL
+ "&start=" + start;
// Advance the offset now so the next iteration requests the following page
start += count;
json = Http.url(gwURL)
.getJSON();
if (json.has("error")) {
// API-level error: log it and stop paging (not rethrown)
logger.error("Error while retrieving user posts:" + json.getString("error"));
break;
}
posts = json.getJSONArray("posts");
if (posts.length() == 0) {
break; // No more posts to get
}
for (int i = 0; i < posts.length(); i++) {
post = (JSONObject) posts.get(i);
images = post.getJSONArray("images");
for (int j = 0; j < images.length(); j++) {
image = (JSONObject) images.get(j);
imagePath = image.getString("path");
// Drop a leading ".." so the path can be appended to the API host
if (imagePath.startsWith("..")) {
imagePath = imagePath.substring(2);
}
imagePath = "http://" + API_DOMAIN + ".rarchives.com" + imagePath;
logger.info(" Found file: " + imagePath);
addURLToDownload(new URL(imagePath));
}
}
// Pause between API requests
try {
Thread.sleep(SLEEP_TIME);
} catch (InterruptedException e) {
logger.error("[!] Interrupted while waiting to load more posts", e);
break;
}
}
waitForThreads();
}
/**
 * Returns the host identifier of this ripper.
 *
 * @return the value of the {@code HOST} constant
 */
@Override
public String getHost() {
return HOST;
}
/**
 * Extracts the username from the URL and remembers it in the
 * {@code username} field as a side effect. The username is taken from the
 * last capture group of the pattern applied by {@code getUsernameMatcher}.
 *
 * @param url URL to inspect
 * @return the extracted username
 * @throws MalformedURLException if the URL does not match the username pattern
 */
@Override
public String getGID(URL url) throws MalformedURLException {
    Matcher matcher = getUsernameMatcher(url);
    if (!matcher.matches()) {
        throw new MalformedURLException("Expected format: gonewild.com/<user>");
    }
    int lastGroup = matcher.groupCount();
    this.username = matcher.group(lastGroup);
    return username;
}
/**
 * Requests one page of the user's posts from the API, starting at the
 * current {@code startIndex} and asking for {@code count} posts.
 *
 * @return the JSON response for that page
 * @throws IOException if the response contains an "error" field (its text
 *                     becomes the message) or the "posts" array is empty;
 *                     the HTTP call may also throw it
 */
@Override
public JSONObject getFirstPage() throws IOException {
    String endpoint = "http://" + API_DOMAIN + ".rarchives.com/api.cgi";
    String query = "?method=get_user"
            + "&user=" + username
            + "&count=" + count
            + "&start=" + startIndex;
    JSONObject page = Http.url(endpoint + query).getJSON();
    if (page.has("error")) {
        throw new IOException(page.getString("error"));
    }
    boolean noPosts = page.getJSONArray("posts").length() == 0;
    if (noPosts) {
        throw new IOException("No posts found");
    }
    return page;
}
/**
 * Advances the paging offset by one page size and fetches the page at the
 * new offset by re-running {@code getFirstPage()}, which builds its request
 * from {@code startIndex}.
 *
 * @param json the previously fetched page (not read here)
 * @return the JSON response for the next page
 * @throws IOException propagated from {@code getFirstPage()}
 */
@Override
public JSONObject getNextPage(JSONObject json) throws IOException {
// Move the offset before fetching so getFirstPage() requests the next page
startIndex += count;
// presumably a one-second pause between API requests; sleep() is defined elsewhere
sleep(1000);
return getFirstPage();
}
/**
 * Builds the image URLs for one API page: for every image of every post,
 * the "path" value (minus a leading "..") is appended to the API host.
 *
 * @param json API response containing a "posts" array whose entries each
 *             have an "images" array
 * @return image URLs in the order they appear in the response
 */
@Override
public List<String> getURLsFromJSON(JSONObject json) {
    List<String> urls = new ArrayList<String>();
    String host = "http://" + API_DOMAIN + ".rarchives.com";
    JSONArray posts = json.getJSONArray("posts");
    for (int p = 0; p < posts.length(); p++) {
        JSONArray images = posts.getJSONObject(p).getJSONArray("images");
        for (int k = 0; k < images.length(); k++) {
            String path = images.getJSONObject(k).getString("path");
            // Drop a leading ".." so the path can be appended to the host
            if (path.startsWith("..")) {
                path = path.substring(2);
            }
            urls.add(host + path);
        }
    }
    return urls;
}
/**
 * Queues the URL for download, using the prefix that {@code getPrefix}
 * produces for the given index.
 *
 * @param url   image URL to download
 * @param index position of the image
 */
@Override
public void downloadURL(URL url, int index) {
    final String prefix = getPrefix(index);
    addURLToDownload(url, prefix);
}
}