Flickr ripper iterates over pages #8

This commit is contained in:
4pr0n 2014-04-24 20:49:18 -07:00
parent 14976f3169
commit b8974cac47

View File

@ -3,12 +3,16 @@ package com.rarchives.ripme.ripper.rippers;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
@ -16,6 +20,7 @@ import org.jsoup.select.Elements;
import com.rarchives.ripme.ripper.AlbumRipper; import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.ripper.DownloadThreadPool; import com.rarchives.ripme.ripper.DownloadThreadPool;
import com.rarchives.ripme.utils.Base64;
import com.rarchives.ripme.utils.Utils; import com.rarchives.ripme.utils.Utils;
public class FlickrRipper extends AlbumRipper { public class FlickrRipper extends AlbumRipper {
@ -94,39 +99,72 @@ public class FlickrRipper extends AlbumRipper {
@Override @Override
public void rip() throws IOException { public void rip() throws IOException {
//Map<String,String> cookies = signinToFlickr();
Set<String> attempted = new HashSet<String>(); Set<String> attempted = new HashSet<String>();
int index = 0; int index = 0, page = 1;
logger.info(" Retrieving " + this.url.toExternalForm()); String nextURL = this.url.toExternalForm();
if (albumDoc == null) { while (true) {
albumDoc = Jsoup.connect(this.url.toExternalForm()).get(); if (isStopped()) {
} break;
for (Element thumb : albumDoc.select("a[data-track=photo-click]")) {
String imageTitle = null;
if (thumb.hasAttr("title")) {
imageTitle = thumb.attr("title");
} }
String imagePage = thumb.attr("href"); logger.info(" Retrieving " + nextURL);
if (imagePage.startsWith("/")) { if (albumDoc == null) {
imagePage = "http://www.flickr.com" + imagePage; albumDoc = Jsoup.connect(nextURL)
.get();
} }
if (imagePage.contains("/in/")) { for (Element thumb : albumDoc.select("a[data-track=photo-click]")) {
imagePage = imagePage.substring(0, imagePage.indexOf("/in/") + 1); String imageTitle = null;
} if (thumb.hasAttr("title")) {
if (!imagePage.endsWith("/")) { imageTitle = thumb.attr("title");
imagePage += "/"; }
} String imagePage = thumb.attr("href");
imagePage += "sizes/o/"; if (imagePage.startsWith("/")) {
imagePage = "http://www.flickr.com" + imagePage;
}
if (imagePage.contains("/in/")) {
imagePage = imagePage.substring(0, imagePage.indexOf("/in/") + 1);
}
if (!imagePage.endsWith("/")) {
imagePage += "/";
}
imagePage += "sizes/o/";
// Check for duplicates // Check for duplicates
if (attempted.contains(imagePage)) { if (attempted.contains(imagePage)) {
continue; continue;
} }
attempted.add(imagePage); attempted.add(imagePage);
index += 1; index += 1;
// Add image page to threadpool to grab the image & download it // Add image page to threadpool to grab the image & download it
FlickrImageThread mit = new FlickrImageThread(new URL(imagePage), imageTitle, index); FlickrImageThread mit = new FlickrImageThread(new URL(imagePage), imageTitle, index);
flickrThreadPool.addThread(mit); flickrThreadPool.addThread(mit);
}
// Find how many pages there are
int lastPage = 0;
for (Element apage : albumDoc.select("a[data-track^=page-]")) {
String lastPageStr = apage.attr("data-track").replace("page-", "");
lastPage = Integer.parseInt(lastPageStr);
}
// If we're at the last page, stop.
if (page >= lastPage) {
break;
}
// Load the next page
page++;
albumDoc = null;
nextURL = this.url.toExternalForm();
if (!nextURL.endsWith("/")) {
nextURL += "/";
}
nextURL += "page" + page + "/";
// Wait a bit
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
logger.error("Interrupted while waiting to load next page " + nextURL, e);
break;
}
} }
flickrThreadPool.waitForThreads(); flickrThreadPool.waitForThreads();
waitForThreads(); waitForThreads();
@ -136,6 +174,36 @@ public class FlickrRipper extends AlbumRipper {
return url.getHost().endsWith(DOMAIN); return url.getHost().endsWith(DOMAIN);
} }
/**
* Login to Flickr.
* @return Cookies for logged-in session
* @throws IOException
*/
@SuppressWarnings("unused")
private Map<String,String> signinToFlickr() throws IOException {
Response resp = Jsoup.connect("http://www.flickr.com/signin/")
.userAgent(USER_AGENT)
.followRedirects(true)
.method(Method.GET)
.execute();
Document doc = resp.parse();
Map<String,String> postData = new HashMap<String,String>();
for (Element input : doc.select("input[type=hidden]")) {
postData.put(input.attr("name"), input.attr("value"));
}
postData.put("passwd_raw", "");
postData.put(".save", "");
postData.put("login", new String(Base64.decode("bGVmYWtlZGVmYWtl")));
postData.put("passwd", new String(Base64.decode("MUZha2V5ZmFrZQ==")));
String action = doc.select("form[method=post]").get(0).attr("action");
resp = Jsoup.connect(action)
.cookies(resp.cookies())
.data(postData)
.method(Method.POST)
.execute();
return resp.cookies();
}
/** /**
* Helper class to find and download images found on "image" pages * Helper class to find and download images found on "image" pages
*/ */