Download via stream instead of Jsoup for #80

This commit is contained in:
4pr0n 2014-06-25 01:14:05 -07:00
parent f8f3067099
commit 104cbead5a

View File

@@ -1,15 +1,18 @@
package com.rarchives.ripme.ripper; package com.rarchives.ripme.ripper;
import java.io.BufferedInputStream;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL; import java.net.URL;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.HttpStatusException; import org.jsoup.HttpStatusException;
import com.rarchives.ripme.ui.RipStatusMessage.STATUS; import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
@@ -33,7 +36,6 @@ public class DownloadFileThread extends Thread {
private int retries; private int retries;
private final int TIMEOUT; private final int TIMEOUT;
private final int MAX_BODY_SIZE;
public DownloadFileThread(URL url, File saveAs, AbstractRipper observer) { public DownloadFileThread(URL url, File saveAs, AbstractRipper observer) {
super(); super();
@@ -43,7 +45,6 @@ public class DownloadFileThread extends Thread {
this.observer = observer; this.observer = observer;
this.retries = Utils.getConfigInteger("download.retries", 1); this.retries = Utils.getConfigInteger("download.retries", 1);
this.TIMEOUT = Utils.getConfigInteger("download.timeout", 60000); this.TIMEOUT = Utils.getConfigInteger("download.timeout", 60000);
this.MAX_BODY_SIZE = Utils.getConfigInteger("download.max_bytes", 1024 * 1024 * 100);
} }
public void setReferrer(String referrer) { public void setReferrer(String referrer) {
@@ -77,44 +78,67 @@ public class DownloadFileThread extends Thread {
int tries = 0; // Number of attempts to download int tries = 0; // Number of attempts to download
do { do {
tries += 1;
InputStream bis = null; OutputStream fos = null;
try { try {
logger.info(" Downloading file: " + url + (tries > 0 ? " Retry #" + tries : "")); logger.info(" Downloading file: " + url + (tries > 0 ? " Retry #" + tries : ""));
observer.sendUpdate(STATUS.DOWNLOAD_STARTED, url.toExternalForm()); observer.sendUpdate(STATUS.DOWNLOAD_STARTED, url.toExternalForm());
tries += 1;
Response response; // Setup HTTP request
response = Jsoup.connect(url.toExternalForm()) HttpURLConnection huc = (HttpURLConnection) this.url.openConnection();
.ignoreContentType(true) huc.setConnectTimeout(TIMEOUT);
.userAgent(AbstractRipper.USER_AGENT) huc.setRequestProperty("accept", "*/*");
.header("accept", "*/*") huc.setRequestProperty("Referer", referrer); // Sic
.timeout(TIMEOUT) huc.setRequestProperty("User-agent", AbstractRipper.USER_AGENT);
.maxBodySize(MAX_BODY_SIZE) String cookie = "";
.cookies(cookies) for (String key : cookies.keySet()) {
.referrer(referrer) if (!cookie.equals("")) {
.execute(); cookie += "; ";
if (response.statusCode() != 200) { }
logger.error("[!] Non-OK status code " + response.statusCode() + " while downloading from " + url); cookie += key + "=" + cookies.get(key);
observer.downloadErrored(url, "Non-OK status code " + response.statusCode() + " while downloading " + url.toExternalForm());
return;
} }
byte[] bytes = response.bodyAsBytes(); huc.setRequestProperty("Cookie", cookie);
if (bytes.length == 503 && url.getHost().endsWith("imgur.com")) { huc.connect();
int statusCode = huc.getResponseCode();
if (statusCode / 100 == 4) { // 4xx errors
logger.error("[!] Non-retriable status code " + statusCode + " while downloading from " + url);
observer.downloadErrored(url, "Non-retriable status code " + statusCode + " while downloading " + url.toExternalForm());
return; // Not retriable, drop out.
}
if (statusCode / 100 == 5) { // 5xx errors
observer.downloadErrored(url, "Retriable status code " + statusCode + " while downloading " + url.toExternalForm());
// Throw exception so download can be retried
throw new IOException("Retriable status code " + statusCode);
}
if (huc.getContentLength() == 503 && url.getHost().endsWith("imgur.com")) {
// Imgur image with 503 bytes is "404" // Imgur image with 503 bytes is "404"
logger.error("[!] Imgur image is 404 (503 bytes long): " + url); logger.error("[!] Imgur image is 404 (503 bytes long): " + url);
observer.downloadErrored(url, "Imgur image is 404: " + url.toExternalForm()); observer.downloadErrored(url, "Imgur image is 404: " + url.toExternalForm());
return; return;
} }
FileOutputStream out = new FileOutputStream(saveAs);
out.write(response.bodyAsBytes()); // Save file
out.close(); bis = new BufferedInputStream(huc.getInputStream());
fos = new FileOutputStream(saveAs);
IOUtils.copy(bis, fos);
break; // Download successful: break out of infinite loop break; // Download successful: break out of infinite loop
} catch (HttpStatusException hse) { } catch (HttpStatusException hse) {
logger.error("[!] HTTP status " + hse.getStatusCode() + " while downloading from " + url); logger.error("[!] HTTP status " + hse.getStatusCode() + " while downloading from " + url);
observer.downloadErrored(url, "HTTP status code " + hse.getStatusCode() + " while downloading " + url.toExternalForm());
if (hse.getStatusCode() == 404 && Utils.getConfigBoolean("errors.skip404", false)) { if (hse.getStatusCode() == 404 && Utils.getConfigBoolean("errors.skip404", false)) {
observer.downloadErrored(url, "HTTP status code " + hse.getStatusCode() + " while downloading " + url.toExternalForm());
return; return;
} }
} catch (IOException e) { } catch (IOException e) {
logger.error("[!] Exception while downloading file: " + url + " - " + e.getMessage(), e); logger.error("[!] Exception while downloading file: " + url + " - " + e.getMessage(), e);
} finally {
// Close any open streams
try {
if (bis != null) { bis.close(); }
} catch (IOException e) { }
try {
if (fos != null) { fos.close(); }
} catch (IOException e) { }
} }
if (tries > this.retries) { if (tries > this.retries) {
logger.error("[!] Exceeded maximum retries (" + this.retries + ") for URL " + url); logger.error("[!] Exceeded maximum retries (" + this.retries + ") for URL " + url);