Added twitter support.
This commit is contained in:
parent
0fc42d844b
commit
4a47cc650e
@ -0,0 +1,282 @@
|
||||
package com.rarchives.ripme.ripper.rippers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONObject;
|
||||
import org.json.JSONTokener;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import com.rarchives.ripme.ripper.AbstractRipper;
|
||||
import com.rarchives.ripme.utils.Utils;
|
||||
|
||||
public class TwitterRipper extends AbstractRipper {
|
||||
|
||||
private static final String DOMAIN = "twitter.com",
|
||||
HOST = "twitter";
|
||||
private static final Logger logger = Logger.getLogger(TwitterRipper.class);
|
||||
|
||||
private static final int MAX_REQUESTS = 2;
|
||||
private static final int WAIT_TIME = 2000;
|
||||
|
||||
// Base 64 of consumer key : consumer secret
|
||||
private String authKey;
|
||||
private String accessToken;
|
||||
|
||||
private enum ALBUM_TYPE {
|
||||
ACCOUNT,
|
||||
SEARCH
|
||||
}
|
||||
private ALBUM_TYPE albumType;
|
||||
private String searchText, accountName;
|
||||
|
||||
public TwitterRipper(URL url) throws IOException {
|
||||
super(url);
|
||||
authKey = Utils.getConfigString("twitter.auth", null);
|
||||
if (authKey == null) {
|
||||
throw new IOException("Could not find twitter authentication key in configuration");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean canRip(URL url) {
|
||||
return url.getHost().endsWith(DOMAIN);
|
||||
}
|
||||
|
||||
@Override
|
||||
public URL sanitizeURL(URL url) throws MalformedURLException {
|
||||
// https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd
|
||||
Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([a-zA-Z0-9%]{1,}).*$");
|
||||
Matcher m = p.matcher(url.toExternalForm());
|
||||
if (m.matches()) {
|
||||
albumType = ALBUM_TYPE.SEARCH;
|
||||
searchText = m.group(2);
|
||||
return url;
|
||||
}
|
||||
p = Pattern.compile("^https?://(m\\.)?twitter\\.com/([a-zA-Z0-9]{1,}).*$");
|
||||
m = p.matcher(url.toExternalForm());
|
||||
if (m.matches()) {
|
||||
albumType = ALBUM_TYPE.ACCOUNT;
|
||||
accountName = m.group(2);
|
||||
return url;
|
||||
}
|
||||
throw new MalformedURLException("Expected username or search string in url: " + url);
|
||||
}
|
||||
|
||||
private void getAccessToken() throws IOException {
|
||||
Document doc = Jsoup.connect("https://api.twitter.com/oauth2/token")
|
||||
.ignoreContentType(true)
|
||||
.header("Authorization", "Basic " + authKey)
|
||||
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
|
||||
.header("User-agent", "ripe and zipe")
|
||||
.data("grant_type", "client_credentials")
|
||||
.post();
|
||||
String body = doc.body().html().replaceAll(""", "\"");
|
||||
try {
|
||||
JSONObject json = new JSONObject(body);
|
||||
accessToken = json.getString("access_token");
|
||||
return;
|
||||
} catch (JSONException e) {
|
||||
// Fall through
|
||||
throw new IOException("Failure while parsing JSON: " + body, e);
|
||||
}
|
||||
}
|
||||
|
||||
private void checkRateLimits(String resource, String api) throws IOException {
|
||||
Document doc = Jsoup.connect("https://api.twitter.com/1.1/application/rate_limit_status.json?resources=" + resource)
|
||||
.ignoreContentType(true)
|
||||
.header("Authorization", "Bearer " + accessToken)
|
||||
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
|
||||
.header("User-agent", "ripe and zipe")
|
||||
.get();
|
||||
String body = doc.body().html().replaceAll(""", "\"");
|
||||
try {
|
||||
JSONObject json = new JSONObject(body);
|
||||
JSONObject stats = json.getJSONObject("resources")
|
||||
.getJSONObject(resource)
|
||||
.getJSONObject(api);
|
||||
int remaining = stats.getInt("remaining");
|
||||
logger.info(" Twitter " + resource + " calls remaining: " + remaining);
|
||||
if (remaining < 20) {
|
||||
logger.error("Twitter API calls exhausted: " + stats.toString());
|
||||
throw new IOException("Less than 20 API calls remaining; not enough to rip.");
|
||||
}
|
||||
} catch (JSONException e) {
|
||||
logger.error("JSONException: ", e);
|
||||
throw new IOException("Error while parsing JSON: " + body, e);
|
||||
}
|
||||
}
|
||||
|
||||
private String getApiURL(String maxID) {
|
||||
String req = "";
|
||||
switch (albumType) {
|
||||
case ACCOUNT:
|
||||
req = "https://api.twitter.com/1.1/statuses/user_timeline.json"
|
||||
+ "?screen_name=" + this.accountName
|
||||
+ "&include_entities=true"
|
||||
+ "&exclude_replies=true"
|
||||
+ "&trim_user=true"
|
||||
+ "&include_rts=false"
|
||||
+ "&count=" + 200;
|
||||
break;
|
||||
case SEARCH:
|
||||
req = "https://api.twitter.com/1.1/search/tweets.json"
|
||||
+ "?q=" + this.searchText
|
||||
+ "&include_entities=true"
|
||||
+ "&result_type=recent"
|
||||
+ "&count=100";
|
||||
break;
|
||||
}
|
||||
if (maxID != null) {
|
||||
req += "&max_id=" + maxID;
|
||||
}
|
||||
return req;
|
||||
}
|
||||
|
||||
private List<JSONObject> getTweets(String url) throws IOException {
|
||||
List<JSONObject> tweets = new ArrayList<JSONObject>();
|
||||
logger.info(" Retrieving " + url);
|
||||
Document doc = Jsoup.connect(url)
|
||||
.ignoreContentType(true)
|
||||
.header("Authorization", "Bearer " + accessToken)
|
||||
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
|
||||
.header("User-agent", "ripe and zipe")
|
||||
.get();
|
||||
String body = doc.body().html().replaceAll(""", "\"");
|
||||
Object jsonObj = new JSONTokener(body).nextValue();
|
||||
JSONArray statuses;
|
||||
if (jsonObj instanceof JSONObject) {
|
||||
JSONObject json = (JSONObject) jsonObj;
|
||||
if (json.has("errors")) {
|
||||
String msg = json.getJSONObject("errors").getString("message");
|
||||
throw new IOException("Twitter responded with errors: " + msg);
|
||||
}
|
||||
statuses = json.getJSONArray("statuses");
|
||||
} else {
|
||||
statuses = (JSONArray) jsonObj;
|
||||
}
|
||||
for (int i = 0; i < statuses.length(); i++) {
|
||||
tweets.add((JSONObject) statuses.get(i));
|
||||
}
|
||||
return tweets;
|
||||
}
|
||||
|
||||
private void parseTweet(JSONObject tweet) throws MalformedURLException {
|
||||
if (!tweet.has("entities")) {
|
||||
logger.error("XXX Tweet doesn't have entitites");
|
||||
return;
|
||||
}
|
||||
|
||||
JSONObject entities = tweet.getJSONObject("entities");
|
||||
|
||||
if (entities.has("media")) {
|
||||
JSONArray medias = entities.getJSONArray("media");
|
||||
String url;
|
||||
JSONObject media;
|
||||
for (int i = 0; i < medias.length(); i++) {
|
||||
media = (JSONObject) medias.get(i);
|
||||
url = media.getString("media_url");
|
||||
if (url.contains(".twimg.com/")) {
|
||||
url += ":large";
|
||||
}
|
||||
addURLToDownload(new URL(url));
|
||||
}
|
||||
}
|
||||
|
||||
if (entities.has("urls")) {
|
||||
JSONArray urls = entities.getJSONArray("urls");
|
||||
JSONObject url;
|
||||
for (int i = 0; i < urls.length(); i++) {
|
||||
url = (JSONObject) urls.get(i);
|
||||
if (url.get("expanded_url") != null) {
|
||||
handleTweetedURL(url.getString("url"));
|
||||
} else {
|
||||
handleTweetedURL(url.getString("expanded_url"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void handleTweetedURL(String url) {
|
||||
logger.error("[!] Need to handle URL: " + url);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void rip() throws IOException {
|
||||
getAccessToken();
|
||||
|
||||
switch (albumType) {
|
||||
case ACCOUNT:
|
||||
checkRateLimits("statuses", "/statuses/user_timeline");
|
||||
break;
|
||||
case SEARCH:
|
||||
checkRateLimits("search", "/search/tweets");
|
||||
break;
|
||||
}
|
||||
|
||||
String maxID = null;
|
||||
for (int i = 0; i < MAX_REQUESTS; i++) {
|
||||
List<JSONObject> tweets = getTweets(getApiURL(maxID));
|
||||
if (tweets.size() == 0) {
|
||||
logger.info(" No more tweets found.");
|
||||
break;
|
||||
}
|
||||
for (JSONObject tweet : tweets) {
|
||||
maxID = tweet.getString("id_str");
|
||||
parseTweet(tweet);
|
||||
}
|
||||
|
||||
try {
|
||||
Thread.sleep(WAIT_TIME);
|
||||
} catch (InterruptedException e) {
|
||||
logger.error("[!] Interrupted while waiting to load more results", e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
waitForThreads();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getHost() {
|
||||
return HOST;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getGID(URL url) throws MalformedURLException {
|
||||
switch (albumType) {
|
||||
case ACCOUNT:
|
||||
return "account_" + accountName;
|
||||
case SEARCH:
|
||||
StringBuilder gid = new StringBuilder();
|
||||
for (int i = 0; i < searchText.length(); i++) {
|
||||
char c = searchText.charAt(i);
|
||||
// Ignore URL-encoded chars
|
||||
if (c == '%') {
|
||||
gid.append('_');
|
||||
i += 2;
|
||||
continue;
|
||||
// Ignore non-alphanumeric chars
|
||||
} else if (
|
||||
(c >= 'a' && c <= 'z')
|
||||
|| (c >= 'A' && c <= 'Z')
|
||||
|| (c >= '0' && c <= '9')
|
||||
) {
|
||||
gid.append(c);
|
||||
}
|
||||
}
|
||||
return "search_" + gid.toString();
|
||||
}
|
||||
throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
|
||||
}
|
||||
|
||||
}
|
@ -1,3 +1,4 @@
|
||||
threads.size = 5
|
||||
file.overwrite = false
|
||||
download.retries = 3
|
||||
twitter.auth = VW9Ybjdjb1pkd2J0U3kwTUh2VXVnOm9GTzVQVzNqM29LQU1xVGhnS3pFZzhKbGVqbXU0c2lHQ3JrUFNNZm8=
|
@ -0,0 +1,28 @@
|
||||
package com.rarchives.ripme.tst.ripper.rippers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.rarchives.ripme.ripper.rippers.TwitterRipper;
|
||||
|
||||
public class TwitterRipperTest extends RippersTest {
|
||||
|
||||
public void testTwitterAlbums() throws IOException {
|
||||
List<URL> contentURLs = new ArrayList<URL>();
|
||||
//contentURLs.add(new URL("https://twitter.com/danngamber01/media"));
|
||||
contentURLs.add(new URL("https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd"));
|
||||
for (URL url : contentURLs) {
|
||||
try {
|
||||
TwitterRipper ripper = new TwitterRipper(url);
|
||||
ripper.rip();
|
||||
assert(ripper.getWorkingDir().listFiles().length > 1);
|
||||
deleteDir(ripper.getWorkingDir());
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
fail("Error while ripping URL " + url + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user