From 617d1b9a14296d5b71f3d4bffd1101516019f302 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Sun, 20 May 2018 10:21:45 -0400 Subject: [PATCH] Fixed instagram ripper using a total hack --- .../ripme/ripper/rippers/InstagramRipper.java | 42 +++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index efc4cb40..a3f9c2d2 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -23,6 +23,7 @@ import com.rarchives.ripme.ripper.AbstractHTMLRipper; import com.rarchives.ripme.utils.Http; import org.jsoup.Connection; +import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import com.rarchives.ripme.ui.RipStatusMessage; @@ -39,6 +40,9 @@ public class InstagramRipper extends AbstractHTMLRipper { private String userID; private String rhx_gis = null; private String csrftoken; + // Run into a weird issue with Jsoup cutting some json pages in half, this is a workaround + // see https://github.com/RipMeApp/ripme/issues/601 + private String workAroundJsonString; @@ -242,6 +246,10 @@ public class InstagramRipper extends AbstractHTMLRipper { json = getJSONFromPage(doc); } catch (IOException e) { logger.warn("Unable to exact json from page"); + } catch (JSONException e) { + // If there's a json error it's almost certainly because our json string got cut off while being turned + // into a doc, so we try this workaround + json = new JSONObject(workAroundJsonString); } // get the rhx_gis value so we can get the next page later on @@ -377,8 +385,8 @@ public class InstagramRipper extends AbstractHTMLRipper { String vars = "{\"id\":\"" + userID + "\",\"first\":50,\"after\":\"" + nextPageID + "\"}"; String ig_gis = getIGGis(vars); logger.info(ig_gis); - toreturn = 
Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" + vars - ).header("x-instagram-gis", ig_gis).cookies(cookies).ignoreContentType().get(); + + toreturn = getPage("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" + vars, ig_gis); if (!pageHasImages(toreturn)) { throw new IOException("No more pages"); } @@ -397,7 +405,7 @@ public class InstagramRipper extends AbstractHTMLRipper { } private boolean pageHasImages(Document doc) { - JSONObject json = new JSONObject(stripHTMLTags(doc.html())); + JSONObject json = new JSONObject(workAroundJsonString); int numberOfImages = json.getJSONObject("data").getJSONObject("user") .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length(); if (numberOfImages == 0) { @@ -406,6 +414,34 @@ public class InstagramRipper extends AbstractHTMLRipper { return true; } + private Document getPage(String url, String ig_gis) { + StringBuilder sb = new StringBuilder(); + try { + // We can't use Jsoup here because it won't download a non-html file larger than 1 MB + // even if you set maxBodySize to 0 + URLConnection connection = new URL(url).openConnection(); + connection.setRequestProperty("User-Agent", USER_AGENT); + connection.setRequestProperty("x-instagram-gis", ig_gis); + BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream())); + String line; + while ((line = in.readLine()) != null) { + sb.append(line); + + } + in.close(); + workAroundJsonString = sb.toString(); + return Jsoup.parse(sb.toString()); + + } catch (MalformedURLException e) { + logger.info("Unable to get query_hash, " + url + " is a malformed URL"); + return null; + } catch (IOException e) { + logger.info("Unable to get query_hash"); + logger.info(e.getMessage()); + return null; + } + } + private String getQHash(Document doc) { String jsFileURL = "https://www.instagram.com" + doc.select("link[rel=preload]").attr("href"); StringBuilder sb = new StringBuilder();