From a3b533922b4d62a8cc5ff421e0b7b9d182adb71d Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Tue, 7 Nov 2017 22:24:32 -0500 Subject: [PATCH 1/6] Instagram can now rip from single pages --- .../ripme/ripper/rippers/InstagramRipper.java | 61 ++++++++++++------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 92cb97a4..6ce96e10 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -15,6 +15,11 @@ import org.json.JSONObject; import com.rarchives.ripme.ripper.AbstractJSONRipper; import com.rarchives.ripme.utils.Http; +import org.jsoup.Connection.Response; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + public class InstagramRipper extends AbstractJSONRipper { private String userID; @@ -73,11 +78,21 @@ public class InstagramRipper extends AbstractJSONRipper { public JSONObject getFirstPage() throws IOException { userID = getUserID(url); - String baseURL = "http://instagram.com/" + userID + "/media"; + String jsonText = ""; try { - return Http.url(baseURL).getJSON(); + Document firstPage = Http.url("http://instagram.com/" + userID).get(); + for (Element script : firstPage.select("script[type=text/javascript]")) { + logger.info("Found script"); + + if (script.data().contains("window._sharedData = ")) { + jsonText = script.data().replaceAll("window._sharedData = ", ""); + jsonText = jsonText.replaceAll("};", "}"); + } + } + logger.debug(jsonText); + return new JSONObject(jsonText); } catch (JSONException e) { - throw new IOException("Could not get instagram user via: " + baseURL); + throw new IOException("Could not get instagram user"); } } @@ -152,28 +167,30 @@ public class InstagramRipper extends AbstractJSONRipper { @Override public List 
getURLsFromJSON(JSONObject json) { List imageURLs = new ArrayList<>(); - JSONArray datas = json.getJSONArray("items"); + JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); + JSONArray datas = profilePage.getJSONObject(0).getJSONObject("user").getJSONObject("media").getJSONArray("nodes"); for (int i = 0; i < datas.length(); i++) { JSONObject data = (JSONObject) datas.get(i); + imageURLs.add(getOriginalUrl(data.getString("thumbnail_src"))); - String dataType = data.getString("type"); - if (dataType.equals("carousel")) { - JSONArray carouselMedias = data.getJSONArray("carousel_media"); - for (int carouselIndex = 0; carouselIndex < carouselMedias.length(); carouselIndex++) { - JSONObject carouselMedia = (JSONObject) carouselMedias.get(carouselIndex); - String imageURL = getMedia(carouselMedia); - if (!imageURL.equals("")) { - imageURL = getOriginalUrl(imageURL); - imageURLs.add(imageURL); - } - } - } else { - String imageURL = getMedia(data); - if (!imageURL.equals("")) { - imageURL = getOriginalUrl(imageURL); - imageURLs.add(imageURL); - } - } +// String dataType = data.getString("type"); +// if (dataType.equals("carousel")) { +// JSONArray carouselMedias = data.getJSONArray("carousel_media"); +// for (int carouselIndex = 0; carouselIndex < carouselMedias.length(); carouselIndex++) { +// JSONObject carouselMedia = (JSONObject) carouselMedias.get(carouselIndex); +// String imageURL = getMedia(carouselMedia); +// if (!imageURL.equals("")) { +// imageURL = getOriginalUrl(imageURL); +// imageURLs.add(imageURL); +// } +// } +// } else { +// String imageURL = getMedia(data); +// if (!imageURL.equals("")) { +// imageURL = getOriginalUrl(imageURL); +// imageURLs.add(imageURL); +// } +// } if (isThisATest()) { break; From dbdedd7d5aa75b559a05186529cb17be8f2ad6f0 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Tue, 7 Nov 2017 23:05:08 -0500 Subject: [PATCH 2/6] IG ripper can now rip images from all pages --- 
.../ripme/ripper/rippers/InstagramRipper.java | 79 +++++++++++-------- 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 6ce96e10..5d3f6e8b 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -73,54 +73,55 @@ public class InstagramRipper extends AbstractJSONRipper { throw new IOException("Unable to find userID at " + this.url); } - - @Override - public JSONObject getFirstPage() throws IOException { - userID = getUserID(url); - + private JSONObject getJSONFromPage(String url) throws IOException { String jsonText = ""; try { - Document firstPage = Http.url("http://instagram.com/" + userID).get(); + Document firstPage = Http.url(url).get(); for (Element script : firstPage.select("script[type=text/javascript]")) { logger.info("Found script"); if (script.data().contains("window._sharedData = ")) { - jsonText = script.data().replaceAll("window._sharedData = ", ""); - jsonText = jsonText.replaceAll("};", "}"); + jsonText = script.data().replaceAll("window._sharedData = ", ""); + jsonText = jsonText.replaceAll("};", "}"); } } - logger.debug(jsonText); return new JSONObject(jsonText); } catch (JSONException e) { - throw new IOException("Could not get instagram user"); + throw new IOException("Could not get JSON from page " + url); } } @Override - public JSONObject getNextPage(JSONObject json) throws IOException { - - boolean nextPageAvailable; - try { - nextPageAvailable = json.getBoolean("more_available"); - } catch (Exception e) { - throw new IOException("No additional pages found"); - } - - if (nextPageAvailable) { - JSONArray items = json.getJSONArray("items"); - JSONObject last_item = items.getJSONObject(items.length() - 1); - String nextMaxID = last_item.getString("id"); - - String baseURL = 
"http://instagram.com/" + userID + "/media/?max_id=" + nextMaxID; - logger.info("Loading " + baseURL); - sleep(1000); - - return Http.url(baseURL).getJSON(); - } else { - throw new IOException("No more images found"); - } + public JSONObject getFirstPage() throws IOException { + userID = getUserID(url); + return getJSONFromPage("http://instagram.com/" + userID); } +// @Override +// public JSONObject getNextPage(JSONObject json) throws IOException { +// +// boolean nextPageAvailable; +// try { +// nextPageAvailable = json.getBoolean("more_available"); +// } catch (Exception e) { +// throw new IOException("No additional pages found"); +// } +// +// if (nextPageAvailable) { +// JSONArray items = json.getJSONArray("items"); +// JSONObject last_item = items.getJSONObject(items.length() - 1); +// String nextMaxID = last_item.getString("id"); +// +// String baseURL = "http://instagram.com/" + userID + "/?max_id=" + nextMaxID; +// logger.info("Loading " + baseURL); +// sleep(1000); +// +// return Http.url(baseURL).getJSON(); +// } else { +// throw new IOException("No more images found"); +// } +// } + private String getOriginalUrl(String imageURL) { imageURL = imageURL.replaceAll("scontent.cdninstagram.com/hphotos-", "igcdn-photos-d-a.akamaihd.net/hphotos-ak-"); imageURL = imageURL.replaceAll("p150x150/", ""); @@ -166,12 +167,18 @@ public class InstagramRipper extends AbstractJSONRipper { @Override public List getURLsFromJSON(JSONObject json) { + String nextPageID = ""; List imageURLs = new ArrayList<>(); JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); JSONArray datas = profilePage.getJSONObject(0).getJSONObject("user").getJSONObject("media").getJSONArray("nodes"); for (int i = 0; i < datas.length(); i++) { JSONObject data = (JSONObject) datas.get(i); - imageURLs.add(getOriginalUrl(data.getString("thumbnail_src"))); + try { + addURLToDownload(new URL(getOriginalUrl(data.getString("thumbnail_src")))); + } catch (MalformedURLException e) 
{ + return imageURLs; + } + nextPageID = data.getString("id"); // String dataType = data.getString("type"); // if (dataType.equals("carousel")) { @@ -196,6 +203,12 @@ public class InstagramRipper extends AbstractJSONRipper { break; } } + if (!nextPageID.equals("")) { + try { + sleep(1000); + getURLsFromJSON(getJSONFromPage("https://www.instagram.com/annabellpeaksxx/?max_id=" + nextPageID)); + } catch (IOException e){ return imageURLs;} + } return imageURLs; } From b5d09cdc977a08cf1f5707896fe88702acfd4122 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Tue, 7 Nov 2017 23:40:14 -0500 Subject: [PATCH 3/6] IG ripper can now rip videos --- .../ripme/ripper/rippers/InstagramRipper.java | 81 ++++--------------- 1 file changed, 17 insertions(+), 64 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 5d3f6e8b..93bdcf41 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -15,10 +15,8 @@ import org.json.JSONObject; import com.rarchives.ripme.ripper.AbstractJSONRipper; import com.rarchives.ripme.utils.Http; -import org.jsoup.Connection.Response; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; public class InstagramRipper extends AbstractJSONRipper { @@ -97,30 +95,15 @@ public class InstagramRipper extends AbstractJSONRipper { return getJSONFromPage("http://instagram.com/" + userID); } -// @Override -// public JSONObject getNextPage(JSONObject json) throws IOException { -// -// boolean nextPageAvailable; -// try { -// nextPageAvailable = json.getBoolean("more_available"); -// } catch (Exception e) { -// throw new IOException("No additional pages found"); -// } -// -// if (nextPageAvailable) { -// JSONArray items = json.getJSONArray("items"); -// JSONObject last_item = 
items.getJSONObject(items.length() - 1); -// String nextMaxID = last_item.getString("id"); -// -// String baseURL = "http://instagram.com/" + userID + "/?max_id=" + nextMaxID; -// logger.info("Loading " + baseURL); -// sleep(1000); -// -// return Http.url(baseURL).getJSON(); -// } else { -// throw new IOException("No more images found"); -// } -// } + private String getVideoFromPage(String videoID) { + try { + Document doc = Http.url("https://www.instagram.com/p/" + videoID).get(); + return doc.select("meta[property=og:video]").attr("content"); + } catch (IOException e) { + logger.warn("Unable to get page " + "https://www.instagram.com/p/" + videoID); + } + return ""; + } private String getOriginalUrl(String imageURL) { imageURL = imageURL.replaceAll("scontent.cdninstagram.com/hphotos-", "igcdn-photos-d-a.akamaihd.net/hphotos-ak-"); @@ -148,23 +131,6 @@ public class InstagramRipper extends AbstractJSONRipper { return imageURL; } - private String getMedia(JSONObject data) { - String imageURL = ""; - JSONObject mediaObject; - if (data.has("videos")) { - mediaObject = data.getJSONObject("videos"); - if (!mediaObject.isNull("standard_resolution")) { - imageURL = mediaObject.getJSONObject("standard_resolution").getString("url"); - } - } else if (data.has("images")) { - mediaObject = data.getJSONObject("images"); - if (!mediaObject.isNull("standard_resolution")) { - imageURL = mediaObject.getJSONObject("standard_resolution").getString("url"); - } - } - return imageURL; - } - @Override public List getURLsFromJSON(JSONObject json) { String nextPageID = ""; @@ -174,30 +140,16 @@ public class InstagramRipper extends AbstractJSONRipper { for (int i = 0; i < datas.length(); i++) { JSONObject data = (JSONObject) datas.get(i); try { - addURLToDownload(new URL(getOriginalUrl(data.getString("thumbnail_src")))); + if (!data.getBoolean("is_video")) { + addURLToDownload(new URL(getOriginalUrl(data.getString("thumbnail_src")))); + } else { + addURLToDownload(new 
URL(getVideoFromPage(data.getString("code")))); + } } catch (MalformedURLException e) { return imageURLs; } nextPageID = data.getString("id"); -// String dataType = data.getString("type"); -// if (dataType.equals("carousel")) { -// JSONArray carouselMedias = data.getJSONArray("carousel_media"); -// for (int carouselIndex = 0; carouselIndex < carouselMedias.length(); carouselIndex++) { -// JSONObject carouselMedia = (JSONObject) carouselMedias.get(carouselIndex); -// String imageURL = getMedia(carouselMedia); -// if (!imageURL.equals("")) { -// imageURL = getOriginalUrl(imageURL); -// imageURLs.add(imageURL); -// } -// } -// } else { -// String imageURL = getMedia(data); -// if (!imageURL.equals("")) { -// imageURL = getOriginalUrl(imageURL); -// imageURLs.add(imageURL); -// } -// } if (isThisATest()) { break; @@ -205,8 +157,9 @@ public class InstagramRipper extends AbstractJSONRipper { } if (!nextPageID.equals("")) { try { - sleep(1000); - getURLsFromJSON(getJSONFromPage("https://www.instagram.com/annabellpeaksxx/?max_id=" + nextPageID)); + // Sleep for a while to avoid a ban + sleep(2500); + getURLsFromJSON(getJSONFromPage("https://www.instagram.com/" + userID + "/?max_id=" + nextPageID)); } catch (IOException e){ return imageURLs;} } return imageURLs; From 86f7c622131e3cae2f2a79dbbc70238c7960a2ac Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Tue, 7 Nov 2017 23:48:08 -0500 Subject: [PATCH 4/6] IG ripper no longer errors out on finish --- .../com/rarchives/ripme/ripper/rippers/InstagramRipper.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 93bdcf41..56730a10 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -141,6 +141,11 @@ public class InstagramRipper extends AbstractJSONRipper { JSONObject 
data = (JSONObject) datas.get(i); try { if (!data.getBoolean("is_video")) { + if (imageURLs.size() == 0) { + // We add this one item to the array because otherwise + // the ripper will error out because we returned an empty array + imageURLs.add(data.getString("thumbnail_src")); + } addURLToDownload(new URL(getOriginalUrl(data.getString("thumbnail_src")))); } else { addURLToDownload(new URL(getVideoFromPage(data.getString("code")))); From dfab4f6f34e20c005e294292c7eb7899b7db6de5 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Wed, 8 Nov 2017 00:19:55 -0500 Subject: [PATCH 5/6] removed test case for private account --- .../com/rarchives/ripme/ripper/rippers/InstagramRipper.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 56730a10..5da394f2 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -76,8 +76,6 @@ public class InstagramRipper extends AbstractJSONRipper { try { Document firstPage = Http.url(url).get(); for (Element script : firstPage.select("script[type=text/javascript]")) { - logger.info("Found script"); - if (script.data().contains("window._sharedData = ")) { jsonText = script.data().replaceAll("window._sharedData = ", ""); jsonText = jsonText.replaceAll("};", "}"); @@ -160,7 +158,7 @@ public class InstagramRipper extends AbstractJSONRipper { break; } } - if (!nextPageID.equals("")) { + if (!nextPageID.equals("") && !isThisATest()) { try { // Sleep for a while to avoid a ban sleep(2500); From 60ac8bb38fda2b10d61a265e713e5b68a4f24ce5 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Wed, 8 Nov 2017 00:21:06 -0500 Subject: [PATCH 6/6] removed test case for private account --- .../rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java | 1 - 1 file changed, 1 deletion(-) diff
--git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java index 9ec7dc71..6db2d8ca 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java @@ -15,7 +15,6 @@ public class InstagramRipperTest extends RippersTest { Map testURLs = new HashMap<>(); testURLs.put(new URL("http://instagram.com/Test_User"), "Test_User"); testURLs.put(new URL("http://instagram.com/_test_user_"), "_test_user_"); - testURLs.put(new URL("http://instagram.com/-test-user-"), "-test-user-"); for (URL url : testURLs.keySet()) { InstagramRipper ripper = new InstagramRipper(url); ripper.setup();