From 545bfce7c9f8b8845c617954b65cd934d7af1a75 Mon Sep 17 00:00:00 2001
From: cyian-1756
Date: Thu, 15 Mar 2018 13:18:29 -0400
Subject: [PATCH] Instagram ripper can now get all pages of a profile

---
 .../ripme/ripper/rippers/InstagramRipper.java | 93 ++++++++++++++-----
 1 file changed, 69 insertions(+), 24 deletions(-)

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
index e78834ba..93de0e8c 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
@@ -1,8 +1,11 @@
 package com.rarchives.ripme.ripper.rippers;
 
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.net.URLConnection;
 import java.time.*;
 import java.time.format.DateTimeFormatter;
 import java.util.ArrayList;
@@ -25,6 +28,7 @@ import com.rarchives.ripme.utils.Utils;
 public class InstagramRipper extends AbstractHTMLRipper {
 
     String nextPageID = "";
+    private String qHash;
     private String userID;
 
@@ -136,7 +140,21 @@ public class InstagramRipper extends AbstractHTMLRipper {
         throw new MalformedURLException("Unable to find user in " + url);
     }
 
+    private String stripHTMLTags(String t) {
+        t = t.replaceAll("<html>\n" +
+                " <head></head>\n" +
+                " <body>", "");
+        t = t.replaceAll("</body>\n" +
+                "</html>", "");
+        return t;
+    }
+
     private JSONObject getJSONFromPage(Document firstPage) throws IOException {
+        // Check if this page is HTML + JSON or just JSON
+        if (!firstPage.html().contains("window._sharedData =")) {
+            return new JSONObject(stripHTMLTags(firstPage.html()));
+        }
         String jsonText = "";
         try {
             for (Element script : firstPage.select("script[type=text/javascript]")) {
@@ -153,8 +171,10 @@ public class InstagramRipper extends AbstractHTMLRipper {
 
     @Override
     public Document getFirstPage() throws IOException {
-        userID = getGID(url);
-        return Http.url(url).get();
+        Document p = Http.url(url).get();
+        // Get the query hash so we can download the next page
+        qHash = getQHash(p);
+        return p;
     }
 
     private String getVideoFromPage(String videoID) {
@@ -210,14 +230,15 @@ public class InstagramRipper extends AbstractHTMLRipper {
 
         if (!url.toExternalForm().contains("/p/")) {
             JSONArray datas = new JSONArray();
+            // This first try only works on data from the first page
             try {
                 JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
+                userID = profilePage.getJSONObject(0).getString("logging_page_id").replaceAll("profilePage_", "");
                 datas = profilePage.getJSONObject(0).getJSONObject("graphql").getJSONObject("user")
                         .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
             } catch (JSONException e) {
-                // Handle hashtag pages
-                datas = json.getJSONObject("entry_data").getJSONArray("TagPage").getJSONObject(0)
-                        .getJSONObject("tag").getJSONObject("media").getJSONArray("nodes");
+                datas = json.getJSONObject("data").getJSONObject("user")
+                        .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
             }
             for (int i = 0; i < datas.length(); i++) {
                 JSONObject data = (JSONObject) datas.get(i);
@@ -281,14 +302,11 @@ public class InstagramRipper extends AbstractHTMLRipper {
                 // Sleep for a while to avoid a ban
                 sleep(2500);
                 if (url.toExternalForm().substring(url.toExternalForm().length() - 1).equals("/")) {
-                    toreturn = Http.url(url.toExternalForm() + "?max_id=" + nextPageID).get();
+                    toreturn = Http.url(url.toExternalForm() + "?max_id=" + nextPageID).ignoreContentType().get();
                 } else {
-                    toreturn = Http.url(url.toExternalForm() + "/?max_id=" + nextPageID).get();
+                    toreturn = Http.url(url.toExternalForm() + "/?max_id=" + nextPageID).ignoreContentType().get();
                 }
                 logger.info(toreturn.html());
-                if (!hasImage(toreturn)) {
-                    throw new IOException("No more pages");
-                }
                 return toreturn;
 
             } catch (IOException e) {
@@ -299,8 +317,9 @@ public class InstagramRipper extends AbstractHTMLRipper {
             try {
                 // Sleep for a while to avoid a ban
                 sleep(2500);
-                toreturn = Http.url("https://www.instagram.com/" + userID + "/?max_id=" + nextPageID).get();
-                if (!hasImage(toreturn)) {
+                toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables="
+                        + "{\"id\":\"" + userID + "\",\"first\":100,\"after\":\"" + nextPageID + "\"}").ignoreContentType().get();
+                if (!pageHasImages(toreturn)) {
                     throw new IOException("No more pages");
                 }
                 return toreturn;
@@ -317,20 +336,46 @@ public class InstagramRipper extends AbstractHTMLRipper {
         addURLToDownload(url);
     }
 
-    private boolean hasImage(Document doc) {
-        try {
-            JSONObject json = getJSONFromPage(doc);
-            JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
-            JSONArray datas = profilePage.getJSONObject(0).getJSONObject("graphql").getJSONObject("user")
-                    .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
-            logger.info(datas.length());
-            if (datas.length() == 0) {
-                return false;
-            }
-            return true;
-        } catch (IOException e) {
+    private boolean pageHasImages(Document doc) {
+        JSONObject json = new JSONObject(stripHTMLTags(doc.html()));
+        int numberOfImages = json.getJSONObject("data").getJSONObject("user")
+                .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length();
+        if (numberOfImages == 0) {
             return false;
         }
+        return true;
+    }
+
+    private String getQHash(Document doc) {
+        String jsFileURL = "https://www.instagram.com" + doc.select("link[rel=preload]").attr("href");
+        StringBuilder sb = new StringBuilder();
+        Document jsPage;
+        try {
+            // We can't use Jsoup here because it won't download a non-HTML file larger than 1MB,
+            // even if maxBodySize is set to 0
+            URLConnection connection = new URL(jsFileURL).openConnection();
+            BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
+            String line;
+            while ((line = in.readLine()) != null) {
+                sb.append(line);
+            }
+            in.close();
+
+        } catch (MalformedURLException e) {
+            logger.info("Unable to get query_hash; " + jsFileURL + " is a malformed URL");
+            return null;
+        } catch (IOException e) {
+            logger.info("Unable to get query_hash");
+            logger.info(e.getMessage());
+            return null;
+        }
+        Pattern jsP = Pattern.compile("o},queryId:.([a-zA-Z0-9]+).");
+        Matcher m = jsP.matcher(sb.toString());
+        if (m.find()) {
+            return m.group(1);
+        }
+        logger.info("Could not find query_hash on " + jsFileURL);
+        return null;
+    }
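
Note on the pagination scheme above: it has two moving parts. getQHash() regexes the query_hash out of the JavaScript bundle referenced by the page's link[rel=preload] tag, and the next-page code then calls the /graphql/query/ endpoint with that hash, the profile's id, and the end_cursor from the previous page. Below is a minimal standalone sketch of that flow; the JS fragment, profile id, and cursor are made-up placeholders, and only the regex and the URL shape come from the patch.

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class QueryHashSketch {
        public static void main(String[] args) {
            // Hypothetical fragment of the minified JS bundle that the ripper
            // downloads from the page's link[rel=preload] href.
            String js = "...o},queryId:\"abc123DEF456\",...";

            // Same pattern the patch compiles; '.' loosely matches the quote
            // characters around the hash in the minified source.
            Pattern jsP = Pattern.compile("o},queryId:.([a-zA-Z0-9]+).");
            Matcher m = jsP.matcher(js);
            String qHash = m.find() ? m.group(1) : null;

            // The next-page request: query_hash selects the stored GraphQL
            // query, and "variables" carries the user id, the page size (100),
            // and the end_cursor returned by the previous page.
            String userID = "1234567";    // placeholder profile id
            String nextPageID = "AQBzzz"; // placeholder end_cursor
            String nextPageURL = "https://www.instagram.com/graphql/query/?query_hash=" + qHash
                    + "&variables={\"id\":\"" + userID + "\",\"first\":100,\"after\":\"" + nextPageID + "\"}";
            System.out.println(nextPageURL);
        }
    }

Because that endpoint returns raw JSON rather than HTML, the patch also chains ignoreContentType() onto the Http calls (Jsoup otherwise refuses non-HTML content types) and uses stripHTMLTags() to remove the <html>/<body> wrapper Jsoup adds before handing the body to org.json.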