Added threaded download manager, imgur ripper works

This commit is contained in:
4pr0n 2014-02-26 19:54:44 -08:00
parent 582ecd8ae8
commit 46e2948403
10 changed files with 397 additions and 81 deletions

17
config/log4j.properties Normal file
View File

@ -0,0 +1,17 @@
# define the file appender
log4j.appender.FILE = org.apache.log4j.RollingFileAppender
log4j.appender.FILE.File = ripme.log
log4j.appender.FILE.ImmediateFlush = true
log4j.appender.FILE.Threshold = debug
log4j.appender.FILE.maxFileSize = 20MB
log4j.appender.FILE.layout = org.apache.log4j.PatternLayout
log4j.appender.FILE.layout.ConversionPattern = %d %-4r [%t] %-5p %c{2} %x - %m%n
# define the console appender
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = %d %-4r [%t] %-5p %c{2} %x - %m%n
# configure the root logger: log at DEBUG level and above to both the FILE and stdout appenders
log4j.rootLogger = DEBUG, FILE, stdout

1
config/rip.properties Normal file
View File

@ -0,0 +1 @@
threads.size = 5

View File

@ -1,26 +1,27 @@
package com.rarchives.ripme;
import java.io.IOException;
import java.net.URL;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import com.rarchives.ripme.ripper.rippers.ImagefapRipper;
import com.rarchives.ripme.ripper.rippers.ImgurRipper;
/**
*
*/
public class App {
public static void main( String[] args ) throws IOException {
public static void main( String[] args ) throws Exception {
Logger logger = Logger.getLogger(App.class);
PropertyConfigurator.configure("config/log4j.properties");
logger.debug("Testing");
URL url = new URL("http://www.imagefap.com/pictures/4117023/Mirror-flat-stomach-small-firm-tits");
System.out.println("URL: " + url.toExternalForm());
ImagefapRipper ir = new ImagefapRipper(url);
System.out.println("Ripping");
ir.rip();
logger.debug("Initialized");
//URL url = new URL("http://www.imagefap.com/pictures/4117023/Mirror-flat-stomach-small-firm-tits");
URL url = new URL("http://imgur.com/a/Ox6jN");
try {
ImgurRipper ir = new ImgurRipper(url);
ir.rip();
} catch (Exception e) {
logger.error("Caught exception:", e);
throw e;
}
}
public static void initialize() {

View File

@ -5,18 +5,26 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.log4j.Logger;
import com.rarchives.ripme.utils.Utils;
public abstract class AbstractRipper implements RipperInterface {
private static final Logger logger = Logger.getLogger(AbstractRipper.class);
protected URL url;
protected File workingDir = null;
protected File workingDir;
protected DownloadThreadPool threadPool;
public abstract void rip() throws IOException;
public abstract void setWorkingDir() throws IOException;
public abstract String getHost();
public abstract String getGID(URL url) throws MalformedURLException;
/**
* Ensures inheriting ripper can rip this URL.
* Ensures inheriting ripper can rip this URL, raises exception if not.
* Otherwise initializes working directory and thread pool.
*
* @param url
* URL to rip.
* @throws IOException
@ -26,13 +34,57 @@ public abstract class AbstractRipper implements RipperInterface {
if (!canRip(url)) {
throw new MalformedURLException("Unable to rip url: " + url);
}
this.url = url;
setWorkingDir();
workingDir = Utils.getWorkingDirectory();
this.url = sanitizeURL(url);
setWorkingDir(url);
this.threadPool = new DownloadThreadPool();
}
/**
 * Queues a URL for download using an empty filename prefix.
 *
 * @param url
 *      URL of the file to download.
 */
public void addURLToDownload(URL url) {
    final String noPrefix = "";
    addURLToDownload(url, noPrefix);
}
/**
 * Queues a URL for download, deriving the local file name from the URL.
 * The name is the text after the last '/' in the URL, truncated at the
 * first '?', '#' or '&amp;', and is saved inside the working directory
 * with the given prefix prepended.
 *
 * @param url
 *      URL of the file to download.
 * @param prefix
 *      Text to prepend to the derived file name.
 */
public void addURLToDownload(URL url, String prefix) {
    String fileName = url.toExternalForm();
    fileName = fileName.substring(fileName.lastIndexOf('/') + 1);
    // Drop query string, fragment and stray parameters from the name.
    for (char delimiter : new char[] {'?', '#', '&'}) {
        int cut = fileName.indexOf(delimiter);
        if (cut >= 0) {
            fileName = fileName.substring(0, cut);
        }
    }

    String directory;
    try {
        directory = workingDir.getCanonicalPath();
    } catch (IOException e) {
        logger.error("Error creating save file path for URL '" + url + "':", e);
        return;
    }

    File target = new File(directory + File.separator + prefix + fileName);
    logger.info("Downloading " + url + " to " + target);
    addURLToDownload(url, target);
}
/**
 * Queues a file for download and saving.
 * The download is wrapped in a DownloadFileThread and handed to this
 * ripper's thread pool.
 *
 * @param url
 *      URL of the file
 * @param saveAs
 *      Path of the local file to save the content to.
 */
public void addURLToDownload(URL url, File saveAs) {
    DownloadFileThread download = new DownloadFileThread(url, saveAs);
    threadPool.addThread(download);
}
/**
 * @return
 *      The URL currently held by this ripper.
 */
public URL getURL() {
    return this.url;
}
/**
 * Points this ripper's working directory at
 * {@code <base working directory>/<host>_<gid>/} and creates it if missing.
 * <p>
 * Note: the gallery ID is derived from {@code this.url}; the {@code url}
 * argument is not consulted by this implementation.
 *
 * @param url
 *      URL being ripped (currently unused, see note above).
 * @throws IOException
 *      If the base directory cannot be resolved or the working directory
 *      cannot be created.
 */
public void setWorkingDir(URL url) throws IOException {
    String path = Utils.getWorkingDirectory().getCanonicalPath();
    if (!path.endsWith(File.separator)) {
        path += File.separator;
    }
    path += getHost() + "_" + getGID(this.url) + File.separator;
    this.workingDir = new File(path);
    if (!this.workingDir.exists()) {
        logger.info("Creating working directory(s): " + this.workingDir);
        // mkdirs() reports failure via its return value only; re-check
        // isDirectory() so a directory created concurrently is not an error.
        if (!this.workingDir.mkdirs() && !this.workingDir.isDirectory()) {
            throw new IOException("Unable to create working directory: " + this.workingDir);
        }
    }
    logger.debug("Set working directory to: " + this.workingDir);
}
}

View File

@ -0,0 +1,55 @@
package com.rarchives.ripme.ripper;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import org.apache.log4j.Logger;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import com.rarchives.ripme.utils.Utils;
/**
 * Downloads the content of a single URL and writes it to a local file.
 * An existing target file is skipped unless the 'file.overwrite'
 * configuration value is true.
 */
public class DownloadFileThread extends Thread {

    private static final Logger logger = Logger.getLogger(DownloadFileThread.class);

    private final URL url;
    private final File saveAs;

    /**
     * @param url
     *      URL of the file to download.
     * @param saveAs
     *      Local file the downloaded bytes are written to.
     */
    public DownloadFileThread(URL url, File saveAs) {
        super();
        this.url = url;
        this.saveAs = saveAs;
    }

    /**
     * Performs the download. Failures are logged, not thrown.
     */
    @Override
    public void run() {
        // Check if file already exists
        if (saveAs.exists()) {
            if (Utils.getConfigBoolean("file.overwrite", false)) {
                logger.info("File already exists and 'file.overwrite' is true, deleting: " + saveAs);
                if (!saveAs.delete()) {
                    logger.warn("Unable to delete existing file: " + saveAs);
                }
            } else {
                logger.info("Not downloading " + url + " because file already exists: " + saveAs);
                return;
            }
        }

        logger.debug("Downloading file from: " + url);
        try {
            Response response = Jsoup.connect(url.toExternalForm())
                                     .ignoreContentType(true)
                                     .execute();
            FileOutputStream out = new FileOutputStream(saveAs);
            try {
                out.write(response.bodyAsBytes());
            } finally {
                // Always release the file handle, even when the write fails.
                out.close();
            }
        } catch (IOException e) {
            logger.error("Exception while downloading file: " + url, e);
            return;
        }
        logger.debug("Download completed: " + url);
    }

}

View File

@ -0,0 +1,34 @@
package com.rarchives.ripme.ripper;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.log4j.Logger;
import com.rarchives.ripme.utils.Utils;
/**
 * Fixed-size pool that runs download tasks.
 * The pool size is read from the 'threads.size' configuration value
 * (default 10).
 */
public class DownloadThreadPool {

    private static final Logger logger = Logger.getLogger(DownloadThreadPool.class);
    private ExecutorService threadPool = null;

    /**
     * Creates the underlying fixed thread pool.
     */
    public DownloadThreadPool() {
        int threads = Utils.getConfigInteger("threads.size", 10);
        logger.debug("Initializing thread pool with " + threads + " threads");
        threadPool = Executors.newFixedThreadPool(threads);
    }

    /**
     * Submits a task to the pool. The Thread is executed as a Runnable by
     * one of the pool's workers; it is not started as its own thread.
     *
     * @param t
     *      Task to run.
     */
    public void addThread(Thread t) {
        threadPool.execute(t);
    }

    /**
     * Stops accepting new tasks and waits up to 60 seconds for the queued
     * and running tasks to finish. After this call no further tasks can be
     * added to this pool.
     */
    public void waitForThreads() {
        threadPool.shutdown();
        try {
            if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
                // shutdown() does not cancel work, so tasks may still be running.
                logger.warn("Timed out after 60 seconds waiting for threads to finish;"
                        + " some downloads may still be in progress");
            }
        } catch (InterruptedException e) {
            logger.error("Interrupted while waiting for threads to finish: ", e);
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
        }
    }
}

View File

@ -1,11 +1,14 @@
package com.rarchives.ripme.ripper;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Operations a ripper exposes for a single album/gallery URL.
 */
public interface RipperInterface {
/** Performs the rip. */
public void rip() throws IOException;
// NOTE(review): the rippers visible alongside this interface declare
// processURL(URL) / processURL(URL, String) rather than this signature;
// confirm this declaration is still intended.
public void processURL(String url);
/**
 * Reports whether this ripper can handle the given URL.
 * AbstractRipper rejects a URL with a MalformedURLException when this is false.
 */
public boolean canRip(URL url);
// NOTE(review): no-argument variant; overlaps with setWorkingDir(URL) below.
// Confirm whether both are still required.
public void setWorkingDir() throws IOException;
/**
 * Returns the form of the URL the ripper should work with.
 * AbstractRipper stores the returned value as the ripper's URL.
 */
public URL sanitizeURL(URL url) throws MalformedURLException;
/** Sets up the directory that files for this rip are saved into. */
public void setWorkingDir(URL url) throws IOException;
/** Short host name; AbstractRipper uses it in the working directory name. */
public String getHost();
/**
 * Extracts the gallery/album identifier from the URL; AbstractRipper uses it
 * in the working directory name.
 */
public String getGID(URL url) throws MalformedURLException;
}

View File

@ -1,92 +1,88 @@
package com.rarchives.ripme.ripper.rippers;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.rarchives.ripme.ripper.AbstractRipper;
import com.rarchives.ripme.utils.Utils;
public class ImagefapRipper extends AbstractRipper {
private static final String HOST = "imagefap.com";
private String gid;
private static final String DOMAIN = "imagefap.com",
HOST = "imagefap";
private static final Logger logger = Logger.getLogger(ImagefapRipper.class);
public ImagefapRipper(URL url) throws IOException {
super(url);
this.gid = getGID(url);
}
@Override
public String getHost() {
return HOST;
}
/**
* Reformat given URL into the desired format (all images on single page)
*/
public void sanitizeURL() throws MalformedURLException {
this.url = new URL("http://www.imagefap.com/gallery.php?gid="
+ this.gid + "&view=2");
public URL sanitizeURL(URL url) throws MalformedURLException {
String gid = getGID(url);
logger.debug("GID=" + gid);
URL newURL = new URL("http://www.imagefap.com/gallery.php?gid="
+ gid + "&view=2");
logger.debug("Sanitized URL from " + url + " to " + newURL);
return newURL;
}
private static String getGID(URL url) throws MalformedURLException {
String gid = null;
Pattern p = Pattern.compile("^.*imagefap.com/gallery.php?gid=([0-9]{1,}).*$");
public String getGID(URL url) throws MalformedURLException {
Pattern p = Pattern.compile("^.*imagefap.com/gallery.php\\?gid=([0-9]{1,}).*$");
Matcher m = p.matcher(url.toExternalForm());
if (m.matches()) {
gid = m.group(1);
} else {
p = Pattern.compile("^.*imagefap.com/pictures/([0-9]{1,}).*$");
m = p.matcher(url.toExternalForm());
if (m.matches()) {
gid = m.group(1);
}
return m.group(1);
}
if (gid == null) {
throw new MalformedURLException(
"Expected imagefap.com gallery formats:"
+ "imagefap.com/gallery.php?gid=####... or"
+ "imagefap.com/pictures/####...");
p = Pattern.compile("^.*imagefap.com/pictures/([0-9]{1,}).*$");
m = p.matcher(url.toExternalForm());
if (m.matches()) {
return m.group(1);
}
return gid;
}
@Override
public void setWorkingDir() throws IOException {
String path = Utils.getWorkingDirectory().getCanonicalPath();
path += this.gid + File.separator;
this.workingDir = new File(path);
throw new MalformedURLException(
"Expected imagefap.com gallery formats: "
+ "imagefap.com/gallery.php?gid=####... or "
+ "imagefap.com/pictures/####..."
+ " Got: " + url);
}
@Override
public void rip() throws IOException {
System.err.println("Connecting to " + this.url.toExternalForm());
logger.debug("Retrieving " + this.url.toExternalForm());
Document doc = Jsoup.connect(this.url.toExternalForm()).get();
for (Element thumb : doc.select("#gallery img")) {
if (!thumb.hasAttr("src") || !thumb.hasAttr("width")) {
continue;
}
String image = thumb.attr("src");
image = image.replaceAll("http://x.*.fap.to/images/thumb/",
image = image.replaceAll(
"http://x.*.fap.to/images/thumb/",
"http://fap.to/images/full/");
processURL(image);
System.err.println(image);
processURL(new URL(image));
}
}
public void processURL(String url) {
public void processURL(URL url) {
logger.info("Found " + url);
}
public boolean canRip(URL url) {
if (!url.getHost().endsWith(HOST)) {
if (!url.getHost().endsWith(DOMAIN)) {
return false;
}
return true;
}
}
}

View File

@ -0,0 +1,142 @@
package com.rarchives.ripme.ripper.rippers;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.rarchives.ripme.ripper.AbstractRipper;
/**
 * Ripper for imgur.com URLs.
 * The URL is classified by {@link #getGID(URL)} into one of the
 * {@link ALBUM_TYPE}s; only albums are downloaded so far, the other types are
 * recognized but not yet ripped.
 */
public class ImgurRipper extends AbstractRipper {

    private static final String DOMAIN = "imgur.com",
                                HOST   = "imgur";

    private static final Logger logger = Logger.getLogger(ImgurRipper.class);

    // The patterns are static on purpose: getGID() is reached through canRip()
    // while the superclass is still being constructed, i.e. before any
    // instance initializer of this class would have run.

    /** imgur.com/a/ALBUMID (optionally on the m. mobile host). */
    private static final Pattern ALBUM_PATTERN =
            Pattern.compile("^https?://(m\\.)?imgur\\.com/a/([a-zA-Z0-9]{5,8}).*$");
    /**
     * USER.imgur.com - account root. The i., m. and www. hosts are excluded
     * so they are not mistaken for user names.
     */
    private static final Pattern USER_PATTERN =
            Pattern.compile("^https?://(?!(?:www|i|m)\\.)([a-zA-Z0-9\\-]+)\\.imgur\\.com/?$");
    /** USER.imgur.com/ALBUMNAME - album inside an account. */
    private static final Pattern USER_ALBUM_PATTERN =
            Pattern.compile("^https?://(?!(?:www|i|m)\\.)([a-zA-Z0-9\\-]+)\\.imgur\\.com/([a-zA-Z0-9]+)$");
    /** imgur.com/ID1,ID2,... (optionally on the i. image host). */
    private static final Pattern IMAGES_PATTERN =
            Pattern.compile("^https?://(i\\.)?imgur\\.com/([a-zA-Z0-9,]{5,}).*$");

    static enum ALBUM_TYPE {
        ALBUM,
        USER,
        USER_ALBUM,
        SERIES_OF_IMAGES
    };

    /** Kind of URL being ripped; assigned as a side effect of getGID(). */
    private ALBUM_TYPE albumType;

    public ImgurRipper(URL url) throws IOException {
        super(url);
    }

    /**
     * Logs a discovered image URL and queues it for download.
     *
     * @param url
     *      URL of the image.
     * @param prefix
     *      Prefix for the saved file name.
     */
    public void processURL(URL url, String prefix) {
        logger.info("Found URL: " + url);
        addURLToDownload(url, prefix);
    }

    /**
     * @return
     *      True when the host ends with imgur.com and the URL matches one of
     *      the formats getGID() understands.
     */
    public boolean canRip(URL url) {
        if (!url.getHost().endsWith(DOMAIN)) {
            return false;
        }
        try {
            getGID(url);
        } catch (Exception e) {
            // Can't get GID, can't rip it.
            return false;
        }
        return true;
    }

    /**
     * Strips the fragment ('#...') from the URL, if any.
     */
    public URL sanitizeURL(URL url) throws MalformedURLException {
        String u = url.toExternalForm();
        if (u.indexOf('#') >= 0) {
            u = u.substring(0, u.indexOf('#'));
        }
        return new URL(u);
    }

    /**
     * Rips according to the detected album type, then waits for the queued
     * downloads to finish.
     */
    @Override
    public void rip() throws IOException {
        switch (albumType) {
        case ALBUM:
            this.url = new URL(this.url.toExternalForm() + "/noscript");
            // Fall-through
        case USER_ALBUM:
            ripAlbum(this.url);
            break;

        case SERIES_OF_IMAGES:
            // TODO Get all images
            break;

        case USER:
            // TODO Get all albums by user
            break;
        }
        threadPool.waitForThreads();
    }

    /**
     * Fetches the album page and queues every image found in a
     * {@code div.image} element, numbering the files in page order.
     */
    private void ripAlbum(URL url) throws IOException {
        int index = 0;
        logger.debug("Retrieving " + url.toExternalForm());
        Document doc = Jsoup.connect(url.toExternalForm()).get();
        for (Element thumb : doc.select("div.image")) {
            String image;
            if (thumb.select("a.zoom").size() > 0) {
                // Clickably full-size
                image = "http:" + thumb.select("a").attr("href");
            } else if (thumb.select("img").size() > 0) {
                image = "http:" + thumb.select("img").attr("src");
            } else {
                // Unable to find image in this div
                logger.error("Unable to find image in div: " + thumb.toString());
                continue;
            }
            index += 1;
            processURL(new URL(image), String.format("%03d_", index));
        }
    }

    @Override
    public String getHost() {
        return HOST;
    }

    /**
     * Classifies the URL and returns its identifier.
     * Side effects: sets {@code albumType}, and for albums also normalizes
     * {@code this.url} to {@code http://imgur.com/a/<id>}.
     *
     * @param url
     *      URL to classify.
     * @return
     *      The album ID, the user name, {@code <user>-<album>} for an album
     *      inside an account, or the comma-separated image IDs. The value
     *      never contains path separators.
     * @throws MalformedURLException
     *      If the URL matches none of the known imgur formats.
     */
    @Override
    public String getGID(URL url) throws MalformedURLException {
        String external = url.toExternalForm();

        Matcher m = ALBUM_PATTERN.matcher(external);
        if (m.matches()) {
            // Imgur album
            albumType = ALBUM_TYPE.ALBUM;
            String gid = m.group(m.groupCount());
            this.url = new URL("http://imgur.com/a/" + gid);
            return gid;
        }

        m = USER_PATTERN.matcher(external);
        if (m.matches()) {
            // Root imgur account
            albumType = ALBUM_TYPE.USER;
            return m.group(m.groupCount());
        }

        m = USER_ALBUM_PATTERN.matcher(external);
        if (m.matches()) {
            // Imgur account album: keep both user and album in the ID, but
            // never the raw URL (it would end up in a directory name).
            albumType = ALBUM_TYPE.USER_ALBUM;
            return m.group(1) + "-" + m.group(2);
        }

        m = IMAGES_PATTERN.matcher(external);
        if (m.matches()) {
            // Series of imgur images
            albumType = ALBUM_TYPE.SERIES_OF_IMAGES;
            return m.group(m.groupCount());
        }

        throw new MalformedURLException("Unexpected URL format: " + url.toExternalForm());
    }

}

View File

@ -1,18 +1,17 @@
package com.rarchives.ripme.utils;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.commons.configuration.Configuration;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.apache.log4j.Logger;
public class Utils {
public static final String RIP_DIRECTORY = "rips";
private static final Logger logger = Logger.getLogger(Utils.class);
public static File getWorkingDirectory() throws IOException {
String path = new File(".").getCanonicalPath() + File.separator;
@ -23,25 +22,41 @@ public class Utils {
}
return workingDir;
}
/**
 * Looks up a string value in "rip.properties".
 *
 * @param key
 *      Configuration key to read.
 * @return
 *      The configured value, or null when the properties file cannot be
 *      loaded (the failure is printed to standard error).
 */
public static String getConfigString(String key) {
    Configuration properties;
    try {
        properties = new PropertiesConfiguration("rip.properties");
    } catch (ConfigurationException e) {
        System.err.println(e);
        return null;
    }
    String configured = properties.getString(key);
    return configured;
}
public static void downloadFile(String url, File saveAs) throws IOException {
Response response = Jsoup.connect(url)
.ignoreContentType(true)
.execute();
FileOutputStream out = (new FileOutputStream(saveAs));
out.write(response.bodyAsBytes());
out.close();
/**
 * Looks up a string value in "config/rip.properties".
 *
 * @param key
 *      Configuration key to read.
 * @param defaultValue
 *      Value to return when the key is absent or the properties file
 *      cannot be loaded.
 * @return
 *      The configured value, or defaultValue.
 */
public static String getConfigString(String key, String defaultValue) {
    String value = defaultValue;
    try {
        Configuration config = new PropertiesConfiguration("config/rip.properties");
        // Pass the default through so a missing key does not yield null.
        value = config.getString(key, defaultValue);
    } catch (ConfigurationException e) {
        logger.error("Failed to get configuration value for " + key
                + ", using default '" + value + "'", e);
    }
    return value;
}
/**
 * Looks up an integer value in "./config/rip.properties".
 *
 * @param key
 *      Configuration key to read.
 * @param defaultValue
 *      Value to return when the key is absent or reading the
 *      configuration fails for any reason.
 * @return
 *      The configured value, or defaultValue.
 */
public static int getConfigInteger(String key, int defaultValue) {
    try {
        File propertiesFile = new File("./config/rip.properties");
        Configuration config = new PropertiesConfiguration(propertiesFile);
        return config.getInt(key, defaultValue);
    } catch (Exception e) {
        logger.error("Failed to get configuration value for " + key
                + ", using default '" + defaultValue + "'");
        return defaultValue;
    }
}
/**
 * Looks up a boolean value in "./config/rip.properties".
 *
 * @param key
 *      Configuration key to read.
 * @param defaultValue
 *      Value to return when the key is absent or reading the
 *      configuration fails for any reason.
 * @return
 *      The configured value, or defaultValue.
 */
public static boolean getConfigBoolean(String key, boolean defaultValue) {
    try {
        File propertiesFile = new File("./config/rip.properties");
        Configuration config = new PropertiesConfiguration(propertiesFile);
        return config.getBoolean(key, defaultValue);
    } catch (Exception e) {
        logger.error("Failed to get configuration value for " + key
                + ", using default '" + defaultValue + "'");
        return defaultValue;
    }
}
}