Added twitter support.

2014-03-05 04:56:13 -08:00 · 2014-03-05 04:56:13 -08:00 · 4a47cc650e
commit 4a47cc650e
parent 0fc42d844b
3 changed files with 312 additions and 1 deletions
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/TwitterRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/TwitterRipper.java
@ -0,0 +1,282 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.json.JSONTokener;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import com.rarchives.ripme.ripper.AbstractRipper;
+import com.rarchives.ripme.utils.Utils;
+
+public class TwitterRipper extends AbstractRipper {
+
+    /** Domain name that {@link #canRip(URL)} matches against the URL host. */
+    private static final String DOMAIN = "twitter.com";
+    /** Short identifier returned by {@link #getHost()}. */
+    private static final String HOST = "twitter";
+    private static final Logger logger = Logger.getLogger(TwitterRipper.class);
+
+    /** Upper bound on the number of paged API requests issued by {@link #rip()}. */
+    private static final int MAX_REQUESTS = 2;
+    /** Pause between paged API requests; passed to Thread.sleep, so milliseconds. */
+    private static final int WAIT_TIME = 2000;
+
+    /** Base 64 of consumer key : consumer secret (read from the "twitter.auth" config entry). */
+    private String authKey;
+    /** Value of "access_token" returned by the OAuth2 token endpoint. */
+    private String accessToken;
+
+    /** Kind of album being ripped, decided from the shape of the URL. */
+    private enum ALBUM_TYPE {
+        ACCOUNT,
+        SEARCH
+    }
+    private ALBUM_TYPE albumType;
+    /** Query text captured from a search URL; assigned only for SEARCH albums. */
+    private String searchText;
+    /** Screen name captured from an account URL; assigned only for ACCOUNT albums. */
+    private String accountName;
+    
+    /**
+     * Creates a ripper for the given Twitter URL and loads the API
+     * authentication key from the "twitter.auth" configuration entry.
+     *
+     * @param url album URL handed to the parent ripper
+     * @throws IOException if no authentication key is configured
+     */
+    public TwitterRipper(URL url) throws IOException {
+        super(url);
+        String configuredKey = Utils.getConfigString("twitter.auth", null);
+        if (configuredKey == null) {
+            throw new IOException("Could not find twitter authentication key in configuration");
+        }
+        this.authKey = configuredKey;
+    }
+
+    /**
+     * Reports whether this ripper handles the given URL.
+     *
+     * @param url URL to test
+     * @return true when the host is twitter.com itself or one of its subdomains
+     */
+    @Override
+    public boolean canRip(URL url) {
+        String host = url.getHost();
+        // A bare endsWith(DOMAIN) would also accept look-alike hosts such as
+        // "nottwitter.com", so require an exact match or a dot-separated suffix.
+        return host.equals(DOMAIN) || host.endsWith("." + DOMAIN);
+    }
+
+    // e.g. https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd
+    // Group 2 is the whole (still URL-encoded) value of q, up to the next '&' or '#'.
+    private static final Pattern SEARCH_URL_PATTERN =
+            Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([^&#]+).*$");
+    // e.g. https://twitter.com/purr_bunny
+    // Group 2 is the screen name; underscores are part of the name.
+    private static final Pattern ACCOUNT_URL_PATTERN =
+            Pattern.compile("^https?://(m\\.)?twitter\\.com/([a-zA-Z0-9_]+).*$");
+
+    /**
+     * Classifies the URL as a search or an account album and records the
+     * captured search text or account name on this instance.
+     *
+     * Side effects: assigns {@code albumType} and either {@code searchText}
+     * or {@code accountName}.
+     *
+     * @param url URL to inspect
+     * @return the same URL, unchanged
+     * @throws MalformedURLException if the URL is neither a search nor an account URL
+     */
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        String external = url.toExternalForm();
+
+        // Search must be tried first: "/search" would also match the account pattern.
+        Matcher m = SEARCH_URL_PATTERN.matcher(external);
+        if (m.matches()) {
+            albumType = ALBUM_TYPE.SEARCH;
+            searchText = m.group(2);
+            return url;
+        }
+
+        m = ACCOUNT_URL_PATTERN.matcher(external);
+        if (m.matches()) {
+            albumType = ALBUM_TYPE.ACCOUNT;
+            accountName = m.group(2);
+            return url;
+        }
+
+        throw new MalformedURLException("Expected username or search string in url: " + url);
+    }
+    
+    /**
+     * Requests an application-only bearer token from the OAuth2 token
+     * endpoint and stores it in {@code accessToken}.
+     *
+     * @throws IOException if the request fails or the response is not JSON
+     *                     containing an "access_token" string
+     */
+    private void getAccessToken() throws IOException {
+        // Read the raw response body rather than parsing it as an HTML
+        // document: the endpoint returns JSON, and an HTML round trip
+        // (plus un-escaping of &quot;) can rewrite the payload.
+        String body = Jsoup.connect("https://api.twitter.com/oauth2/token")
+                           .ignoreContentType(true)
+                           .header("Authorization", "Basic " + authKey)
+                           .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
+                           .header("User-agent", "ripe and zipe")
+                           .data("grant_type", "client_credentials")
+                           .method(org.jsoup.Connection.Method.POST)
+                           .execute()
+                           .body();
+        try {
+            accessToken = new JSONObject(body).getString("access_token");
+        } catch (JSONException e) {
+            throw new IOException("Failure while parsing JSON: " + body, e);
+        }
+    }
+    
+    /**
+     * Queries the rate-limit status endpoint and refuses to continue when
+     * fewer than 20 calls remain for the given API.
+     *
+     * @param resource value sent as the "resources" query parameter and used
+     *                 as the key under "resources" in the response
+     * @param api      key of the endpoint entry under that resource,
+     *                 e.g. "/search/tweets"
+     * @throws IOException if the request fails, the response cannot be
+     *                     parsed, or fewer than 20 calls remain
+     */
+    private void checkRateLimits(String resource, String api) throws IOException {
+        // Read the raw response body rather than parsing it as an HTML
+        // document: the endpoint returns JSON, and an HTML round trip
+        // (plus un-escaping of &quot;) can rewrite the payload.
+        String body = Jsoup.connect("https://api.twitter.com/1.1/application/rate_limit_status.json?resources=" + resource)
+                           .ignoreContentType(true)
+                           .header("Authorization", "Bearer " + accessToken)
+                           .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
+                           .header("User-agent", "ripe and zipe")
+                           .execute()
+                           .body();
+        try {
+            JSONObject stats = new JSONObject(body)
+                                   .getJSONObject("resources")
+                                   .getJSONObject(resource)
+                                   .getJSONObject(api);
+            int remaining = stats.getInt("remaining");
+            logger.info("    Twitter " + resource + " calls remaining: " + remaining);
+            if (remaining < 20) {
+                logger.error("Twitter API calls exhausted: " + stats.toString());
+                throw new IOException("Less than 20 API calls remaining; not enough to rip.");
+            }
+        } catch (JSONException e) {
+            logger.error("JSONException: ", e);
+            throw new IOException("Error while parsing JSON: " + body, e);
+        }
+    }
+    
+    /**
+     * Builds the API request URL for the current album type.
+     *
+     * @param maxID value appended as the "max_id" parameter, or null to omit it
+     * @return the request URL for the user timeline (ACCOUNT) or the tweet
+     *         search (SEARCH) endpoint
+     */
+    private String getApiURL(String maxID) {
+        StringBuilder request = new StringBuilder();
+        switch (albumType) {
+        case SEARCH:
+            request.append("https://api.twitter.com/1.1/search/tweets.json")
+                   .append("?q=").append(this.searchText)
+                   .append("&include_entities=true")
+                   .append("&result_type=recent")
+                   .append("&count=100");
+            break;
+        case ACCOUNT:
+            request.append("https://api.twitter.com/1.1/statuses/user_timeline.json")
+                   .append("?screen_name=").append(this.accountName)
+                   .append("&include_entities=true")
+                   .append("&exclude_replies=true")
+                   .append("&trim_user=true")
+                   .append("&include_rts=false")
+                   .append("&count=").append(200);
+            break;
+        }
+        if (maxID != null) {
+            request.append("&max_id=").append(maxID);
+        }
+        return request.toString();
+    }
+    
+    /**
+     * Fetches one page of tweets from the given API URL.
+     *
+     * The response may be either a JSON object holding a "statuses" array or
+     * a bare JSON array of tweets; both shapes are accepted.
+     *
+     * @param url fully built API request URL
+     * @return the tweets in the response, in response order (possibly empty)
+     * @throws IOException if the request fails, the response reports errors,
+     *                     or the response is not the expected JSON
+     */
+    private List<JSONObject> getTweets(String url) throws IOException {
+        logger.info("   Retrieving " + url);
+        // Read the raw response body rather than parsing it as an HTML
+        // document: the endpoint returns JSON, and an HTML round trip
+        // (plus un-escaping of &quot;) can rewrite the payload.
+        String body = Jsoup.connect(url)
+                           .ignoreContentType(true)
+                           .header("Authorization", "Bearer " + accessToken)
+                           .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
+                           .header("User-agent", "ripe and zipe")
+                           .execute()
+                           .body();
+
+        List<JSONObject> tweets = new ArrayList<JSONObject>();
+        try {
+            Object parsed = new JSONTokener(body).nextValue();
+            JSONArray statuses;
+            if (parsed instanceof JSONObject) {
+                JSONObject json = (JSONObject) parsed;
+                if (json.has("errors")) {
+                    // The API reports errors as an array of {message, code}
+                    // objects; a single object is tolerated as well.
+                    Object errors = json.get("errors");
+                    String msg;
+                    if (errors instanceof JSONArray && ((JSONArray) errors).length() > 0) {
+                        msg = ((JSONArray) errors).getJSONObject(0).optString("message", errors.toString());
+                    } else if (errors instanceof JSONObject) {
+                        msg = ((JSONObject) errors).optString("message", errors.toString());
+                    } else {
+                        msg = String.valueOf(errors);
+                    }
+                    throw new IOException("Twitter responded with errors: " + msg);
+                }
+                statuses = json.getJSONArray("statuses");
+            } else if (parsed instanceof JSONArray) {
+                statuses = (JSONArray) parsed;
+            } else {
+                throw new IOException("Unexpected response from Twitter: " + body);
+            }
+            for (int i = 0; i < statuses.length(); i++) {
+                tweets.add(statuses.getJSONObject(i));
+            }
+        } catch (JSONException e) {
+            // Surface malformed responses as IOException, as the other API helpers do.
+            throw new IOException("Error while parsing JSON: " + body, e);
+        }
+        return tweets;
+    }
+
+    /**
+     * Extracts downloadable content from a single tweet.
+     *
+     * Every entry under entities.media is queued for download (with ":large"
+     * appended for twimg.com URLs); every entry under entities.urls is passed
+     * to {@link #handleTweetedURL(String)}.
+     *
+     * @param tweet tweet JSON object as returned by the API
+     * @throws MalformedURLException if a media URL cannot be parsed
+     */
+    private void parseTweet(JSONObject tweet) throws MalformedURLException {
+        if (!tweet.has("entities")) {
+            logger.error("XXX Tweet doesn't have entitites");
+            return;
+        }
+
+        JSONObject entities = tweet.getJSONObject("entities");
+
+        if (entities.has("media")) {
+            JSONArray medias = entities.getJSONArray("media");
+            for (int i = 0; i < medias.length(); i++) {
+                String mediaUrl = medias.getJSONObject(i).getString("media_url");
+                if (mediaUrl.contains(".twimg.com/")) {
+                    // Ask the image host for the large rendition.
+                    mediaUrl += ":large";
+                }
+                addURLToDownload(new URL(mediaUrl));
+            }
+        }
+
+        if (entities.has("urls")) {
+            JSONArray urls = entities.getJSONArray("urls");
+            for (int i = 0; i < urls.length(); i++) {
+                JSONObject link = urls.getJSONObject(i);
+                // Prefer the expanded URL when it is present and non-null;
+                // otherwise fall back to the shortened one. isNull() is true
+                // for both a missing key and an explicit JSON null.
+                String key = link.isNull("expanded_url") ? "url" : "expanded_url";
+                handleTweetedURL(link.getString(key));
+            }
+        }
+    }
+    
+    /**
+     * Placeholder for links found in a tweet: the URL is only logged, not
+     * downloaded.
+     *
+     * @param url link taken from the tweet's "urls" entities
+     */
+    private void handleTweetedURL(String url) {
+        String notice = "[!] Need to handle URL: " + url;
+        logger.error(notice);
+    }
+
+    /**
+     * Runs the rip: obtains a bearer token, checks the rate limit for the
+     * relevant endpoint, then pages through up to MAX_REQUESTS result pages,
+     * handing each tweet to parseTweet, and finally waits for the download
+     * threads.
+     *
+     * @throws IOException if authentication, the rate-limit check, or a page
+     *                     request fails
+     */
+    @Override
+    public void rip() throws IOException {
+        getAccessToken();
+
+        switch (albumType) {
+        case ACCOUNT:
+            checkRateLimits("statuses", "/statuses/user_timeline");
+            break;
+        case SEARCH:
+            checkRateLimits("search", "/search/tweets");
+            break;
+        }
+
+        String maxID = null;
+        for (int i = 0; i < MAX_REQUESTS; i++) {
+            List<JSONObject> tweets = getTweets(getApiURL(maxID));
+            if (tweets.isEmpty()) {
+                logger.info("   No more tweets found.");
+                break;
+            }
+            String lastID = null;
+            for (JSONObject tweet : tweets) {
+                lastID = tweet.getString("id_str");
+                parseTweet(tweet);
+            }
+            // max_id is inclusive, so ask for tweets strictly older than the
+            // last one seen; otherwise the next page repeats that tweet.
+            maxID = idBefore(lastID);
+
+            if (i + 1 >= MAX_REQUESTS) {
+                // No further request follows, so there is nothing to wait for.
+                break;
+            }
+            try {
+                Thread.sleep(WAIT_TIME);
+            } catch (InterruptedException e) {
+                logger.error("[!] Interrupted while waiting to load more results", e);
+                // Restore the flag so callers can observe the interruption.
+                // NOTE(review): waitForThreads() below will see the flag set;
+                // confirm that is the desired shutdown behaviour.
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+
+        waitForThreads();
+    }
+
+    /**
+     * Returns the decimal ID immediately below the given tweet ID, or the ID
+     * unchanged when it is not a parseable number.
+     */
+    private static String idBefore(String tweetID) {
+        try {
+            return Long.toString(Long.parseLong(tweetID) - 1);
+        } catch (NumberFormatException e) {
+            return tweetID;
+        }
+    }
+
+    /**
+     * Returns the short host identifier for this ripper.
+     *
+     * @return the HOST constant ("twitter")
+     */
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    /**
+     * Builds the album identifier from state recorded by sanitizeURL.
+     *
+     * Account albums yield "account_" plus the account name. Search albums
+     * yield "search_" plus the search text with each percent-encoded triplet
+     * replaced by '_' and every other non-alphanumeric character dropped.
+     *
+     * @param url URL being ripped; used only in the error message
+     * @return the album identifier
+     * @throws MalformedURLException if the album type has not been determined
+     */
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        if (albumType == null) {
+            // Switching on a null enum raises NullPointerException, which
+            // would bypass the intended MalformedURLException.
+            throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
+        }
+        switch (albumType) {
+        case ACCOUNT:
+            return "account_" + accountName;
+        case SEARCH:
+            StringBuilder gid = new StringBuilder();
+            for (int i = 0; i < searchText.length(); i++) {
+                char c = searchText.charAt(i);
+                if (c == '%') {
+                    // Collapse a %XX escape into one underscore and skip its two hex digits.
+                    gid.append('_');
+                    i += 2;
+                } else if ((c >= 'a' && c <= 'z')
+                        || (c >= 'A' && c <= 'Z')
+                        || (c >= '0' && c <= '9')) {
+                    gid.append(c);
+                }
+                // Any other character is ignored.
+            }
+            return "search_" + gid.toString();
+        default:
+            throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
+        }
+    }
+
+}
--- a/src/main/resources/rip.properties
+++ b/src/main/resources/rip.properties
@ -1,3 +1,4 @@
 threads.size = 5
 file.overwrite = false
 download.retries = 3
+twitter.auth = VW9Ybjdjb1pkd2J0U3kwTUh2VXVnOm9GTzVQVzNqM29LQU1xVGhnS3pFZzhKbGVqbXU0c2lHQ3JrUFNNZm8=
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java
@ -0,0 +1,28 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.TwitterRipper;
+
+/**
+ * Integration test for {@link TwitterRipper}: rips each listed URL and
+ * checks that more than one file was produced in the working directory.
+ */
+public class TwitterRipperTest extends RippersTest {
+
+    /**
+     * Rips every URL in the list and fails if a rip throws or leaves at most
+     * one file in the ripper's working directory.
+     */
+    public void testTwitterAlbums() throws IOException {
+        List<URL> contentURLs = new ArrayList<URL>();
+        //contentURLs.add(new URL("https://twitter.com/danngamber01/media"));
+        contentURLs.add(new URL("https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd"));
+        for (URL url : contentURLs) {
+            try {
+                TwitterRipper ripper = new TwitterRipper(url);
+                ripper.rip();
+                // listFiles() returns null when the directory is missing or unreadable.
+                java.io.File[] files = ripper.getWorkingDir().listFiles();
+                int fileCount = (files == null) ? 0 : files.length;
+                // Clean up before checking so a failed check leaves nothing behind.
+                deleteDir(ripper.getWorkingDir());
+                // Use fail() rather than the assert keyword, which is a no-op
+                // unless the JVM runs with -ea.
+                if (fileCount <= 1) {
+                    fail("Expected more than one file for " + url + " but found " + fileCount);
+                }
+            } catch (Exception e) {
+                e.printStackTrace();
+                fail("Error while ripping URL " + url + ": " + e.getMessage());
+            }
+        }
+    }
+}