diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index efc4cb40..12227418 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -19,10 +19,11 @@ import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; -import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.ripper.AbstractJSONRipper; import com.rarchives.ripme.utils.Http; import org.jsoup.Connection; +import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import com.rarchives.ripme.ui.RipStatusMessage; @@ -30,7 +31,7 @@ import com.rarchives.ripme.utils.Utils; import java.util.HashMap; -public class InstagramRipper extends AbstractHTMLRipper { +public class InstagramRipper extends AbstractJSONRipper { String nextPageID = ""; private String qHash; private boolean rippingTag = false; @@ -39,6 +40,9 @@ public class InstagramRipper extends AbstractHTMLRipper { private String userID; private String rhx_gis = null; private String csrftoken; + // Run into a weird issue with Jsoup cutting some json pages in half, this is a work around + // see https://github.com/RipMeApp/ripme/issues/601 + private String workAroundJsonString; @@ -73,11 +77,9 @@ public class InstagramRipper extends AbstractHTMLRipper { return url.replaceAll("/[A-Z0-9]{8}/", "/"); } - private List getPostsFromSinglePage(Document Doc) { + private List getPostsFromSinglePage(JSONObject json) { List imageURLs = new ArrayList<>(); JSONArray datas; - try { - JSONObject json = getJSONFromPage(Doc); if (json.getJSONObject("entry_data").getJSONArray("PostPage") .getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media") .has("edge_sidecar_to_children")) { @@ -103,10 +105,6 @@ public class InstagramRipper extends AbstractHTMLRipper { } } return imageURLs; - } catch (IOException e) { - logger.error("Unable to get JSON from page " + url.toExternalForm()); - return null; - } } @Override @@ -184,14 +182,14 @@ public class InstagramRipper extends AbstractHTMLRipper { } @Override - public Document getFirstPage() throws IOException { + public JSONObject getFirstPage() throws IOException { Connection.Response resp = Http.url(url).response(); logger.info(resp.cookies()); csrftoken = resp.cookie("csrftoken"); Document p = resp.parse(); // Get the query hash so we can download the next page qHash = getQHash(p); - return p; + return getJSONFromPage(p); } private String getVideoFromPage(String videoID) { @@ -235,14 +233,8 @@ public class InstagramRipper extends AbstractHTMLRipper { } @Override - public List getURLsFromPage(Document doc) { + public List getURLsFromJSON(JSONObject json) { List imageURLs = new ArrayList<>(); - JSONObject json = new JSONObject(); - try { - json = getJSONFromPage(doc); - } catch (IOException e) { - logger.warn("Unable to exact json from page"); - } // get the rhx_gis value so we can get the next page later on if (rhx_gis == null) { @@ -282,7 +274,7 @@ public class InstagramRipper extends AbstractHTMLRipper { if (data.getString("__typename").equals("GraphSidecar")) { try { Document slideShowDoc = Http.url(new URL("https://www.instagram.com/p/" + data.getString("shortcode"))).get(); - List toAdd = getPostsFromSinglePage(slideShowDoc); + List toAdd = getPostsFromSinglePage(getJSONFromPage(slideShowDoc)); for (int slideShowInt = 0; slideShowInt < toAdd.size(); slideShowInt++) { addURLToDownload(new URL(toAdd.get(slideShowInt)), image_date + data.getString("shortcode")); } @@ -321,7 +313,7 @@ public class InstagramRipper extends AbstractHTMLRipper { } else { // We're ripping from a single page logger.info("Ripping from single page"); - imageURLs = getPostsFromSinglePage(doc); + imageURLs = getPostsFromSinglePage(json); } return imageURLs; @@ -348,8 +340,8 @@ public class InstagramRipper extends AbstractHTMLRipper { } @Override - public Document getNextPage(Document doc) throws IOException { - Document toreturn; + public JSONObject getNextPage(JSONObject json) throws IOException { + JSONObject toreturn; java.util.Map cookies = new HashMap(); // This shouldn't be hardcoded and will break one day cookies.put("ig_pr", "1"); @@ -360,10 +352,13 @@ public class InstagramRipper extends AbstractHTMLRipper { sleep(2500); String vars = "{\"tag_name\":\"" + tagName + "\",\"first\":4,\"after\":\"" + nextPageID + "\"}"; String ig_gis = getIGGis(vars); - toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash + - "&variables=" + vars).header("x-instagram-gis", ig_gis).cookies(cookies).ignoreContentType().get(); + toreturn = getPage("https://www.instagram.com/graphql/query/?query_hash=" + qHash + + "&variables=" + vars, ig_gis); // Sleep for a while to avoid a ban - logger.info(toreturn.html()); + logger.info(toreturn); + if (!pageHasImages(toreturn)) { + throw new IOException("No more pages"); + } return toreturn; } catch (IOException e) { @@ -377,8 +372,8 @@ public class InstagramRipper extends AbstractHTMLRipper { String vars = "{\"id\":\"" + userID + "\",\"first\":50,\"after\":\"" + nextPageID + "\"}"; String ig_gis = getIGGis(vars); logger.info(ig_gis); - toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" + vars - ).header("x-instagram-gis", ig_gis).cookies(cookies).ignoreContentType().get(); + + toreturn = getPage("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" + vars, ig_gis); if (!pageHasImages(toreturn)) { throw new IOException("No more pages"); } @@ -396,8 +391,7 @@ public class InstagramRipper extends AbstractHTMLRipper { addURLToDownload(url); } - private boolean pageHasImages(Document doc) { - JSONObject json = new JSONObject(stripHTMLTags(doc.html())); + private boolean pageHasImages(JSONObject json) { int numberOfImages = json.getJSONObject("data").getJSONObject("user") .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length(); if (numberOfImages == 0) { @@ -406,6 +400,34 @@ public class InstagramRipper extends AbstractHTMLRipper { return true; } + private JSONObject getPage(String url, String ig_gis) { + StringBuilder sb = new StringBuilder(); + try { + // We can't use Jsoup here because it won't download a non-html file larger than a MB + // even if you set maxBodySize to 0 + URLConnection connection = new URL(url).openConnection(); + connection.setRequestProperty("User-Agent", USER_AGENT); + connection.setRequestProperty("x-instagram-gis", ig_gis); + BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream())); + String line; + while ((line = in.readLine()) != null) { + sb.append(line); + + } + in.close(); + workAroundJsonString = sb.toString(); + return new JSONObject(sb.toString()); + + } catch (MalformedURLException e) { + logger.info("Unable to get query_hash, " + url + " is a malformed URL"); + return null; + } catch (IOException e) { + logger.info("Unable to get query_hash"); + logger.info(e.getMessage()); + return null; + } + } + private String getQHash(Document doc) { String jsFileURL = "https://www.instagram.com" + doc.select("link[rel=preload]").attr("href"); StringBuilder sb = new StringBuilder();