From 7f57768f3da3fb73bf26c235df4d3de8d7944346 Mon Sep 17 00:00:00 2001 From: cyian-1756 <devnull64@vfemail.net> Date: Sun, 12 Nov 2017 03:15:56 -0500 Subject: [PATCH 1/3] instagram ripper now puts the date the image was posted in the saved file name --- .../ripme/ripper/rippers/InstagramRipper.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 5da394f2..1824ba8f 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -3,6 +3,8 @@ package com.rarchives.ripme.ripper.rippers; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import java.time.*; +import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; @@ -18,6 +20,7 @@ import com.rarchives.ripme.utils.Http; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; + public class InstagramRipper extends AbstractJSONRipper { private String userID; @@ -137,6 +140,9 @@ public class InstagramRipper extends AbstractJSONRipper { JSONArray datas = profilePage.getJSONObject(0).getJSONObject("user").getJSONObject("media").getJSONArray("nodes"); for (int i = 0; i < datas.length(); i++) { JSONObject data = (JSONObject) datas.get(i); + Long epoch = data.getLong("date"); + Instant instant = Instant.ofEpochSecond(epoch); + String image_date = DateTimeFormatter.ofPattern("yyyy_MM_dd_hh:mm_").format(ZonedDateTime.ofInstant(instant, ZoneOffset.UTC)); try { if (!data.getBoolean("is_video")) { if (imageURLs.size() == 0) { @@ -144,9 +150,9 @@ public class InstagramRipper extends AbstractJSONRipper { // the ripper will error out because we returned an empty array imageURLs.add(data.getString("thumbnail_src")); } - addURLToDownload(new URL(getOriginalUrl(data.getString("thumbnail_src")))); + addURLToDownload(new URL(getOriginalUrl(data.getString("thumbnail_src"))), image_date); } else { - addURLToDownload(new URL(getVideoFromPage(data.getString("code")))); + addURLToDownload(new URL(getVideoFromPage(data.getString("code"))), image_date); } } catch (MalformedURLException e) { return imageURLs; From d1e021f0efc75c3642e06bc1dc45d713b9a1b995 Mon Sep 17 00:00:00 2001 From: cyian-1756 <devnull64@vfemail.net> Date: Sun, 12 Nov 2017 04:43:26 -0500 Subject: [PATCH 2/3] Added single post support to the instagram ripper --- .../ripme/ripper/rippers/InstagramRipper.java | 151 ++++++++++++------ 1 file changed, 99 insertions(+), 52 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 1824ba8f..db987535 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -14,14 +14,14 @@ import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; -import com.rarchives.ripme.ripper.AbstractJSONRipper; +import com.rarchives.ripme.ripper.AbstractHTMLRipper; import com.rarchives.ripme.utils.Http; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -public class InstagramRipper extends AbstractJSONRipper { +public class InstagramRipper extends AbstractHTMLRipper { private String userID; @@ -45,24 +45,36 @@ public class InstagramRipper extends AbstractJSONRipper { @Override public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^https?://instagram.com/([^/]+)"); + Pattern p = Pattern.compile("^https?://instagram.com/([^/]+)/?"); Matcher m = p.matcher(url.toExternalForm()); if (m.matches()) { return m.group(1); } + + p = Pattern.compile("^https?://www.instagram.com/([^/]+)/?"); + m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); + } + + p = Pattern.compile("^https?://www.instagram.com/p/[a-zA-Z0-9_-]+/\\?taken-by=([^/]+)/?"); + m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); + } throw new MalformedURLException("Unable to find user in " + url); } - @Override - public URL sanitizeURL(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^.*instagram\\.com/([a-zA-Z0-9\\-_.]+).*$"); - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - return new URL("http://instagram.com/" + m.group(1)); - } - - throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url); - } +// @Override +// public URL sanitizeURL(URL url) throws MalformedURLException { +// Pattern p = Pattern.compile("^.*instagram\\.com/([a-zA-Z0-9\\-_.]+).*$"); +// Matcher m = p.matcher(url.toExternalForm()); +// if (m.matches()) { +// return new URL("http://instagram.com/" + m.group(1)); +// } +// +// throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url); +// } private String getUserID(URL url) throws IOException { @@ -72,12 +84,23 @@ public class InstagramRipper extends AbstractJSONRipper { return m.group(1); } + p = Pattern.compile("^https?://www.instagram.com/([^/]+)/?"); + m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); + } + + p = Pattern.compile("^https?://(www.)?instagram.com/p/[a-zA-Z0-9_-]+/\\?taken-by=([^/]+)"); + m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); + } + throw new IOException("Unable to find userID at " + this.url); } - private JSONObject getJSONFromPage(String url) throws IOException { + private JSONObject getJSONFromPage(Document firstPage) throws IOException { String jsonText = ""; try { - Document firstPage = Http.url(url).get(); for (Element script : firstPage.select("script[type=text/javascript]")) { if (script.data().contains("window._sharedData = ")) { jsonText = script.data().replaceAll("window._sharedData = ", ""); @@ -86,14 +109,14 @@ public class InstagramRipper extends AbstractJSONRipper { } return new JSONObject(jsonText); } catch (JSONException e) { - throw new IOException("Could not get JSON from page " + url); + throw new IOException("Could not get JSON from page"); } } @Override - public JSONObject getFirstPage() throws IOException { + public Document getFirstPage() throws IOException { userID = getUserID(url); - return getJSONFromPage("http://instagram.com/" + userID); + return Http.url(url).get(); } private String getVideoFromPage(String videoID) { @@ -133,43 +156,67 @@ public class InstagramRipper extends AbstractJSONRipper { } @Override - public List<String> getURLsFromJSON(JSONObject json) { + public List<String> getURLsFromPage(Document doc) { String nextPageID = ""; List<String> imageURLs = new ArrayList<>(); - JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); - JSONArray datas = profilePage.getJSONObject(0).getJSONObject("user").getJSONObject("media").getJSONArray("nodes"); - for (int i = 0; i < datas.length(); i++) { - JSONObject data = (JSONObject) datas.get(i); - Long epoch = data.getLong("date"); - Instant instant = Instant.ofEpochSecond(epoch); - String image_date = DateTimeFormatter.ofPattern("yyyy_MM_dd_hh:mm_").format(ZonedDateTime.ofInstant(instant, ZoneOffset.UTC)); - try { - if (!data.getBoolean("is_video")) { - if (imageURLs.size() == 0) { - // We add this one item to the array because either wise - // the ripper will error out because we returned an empty array - imageURLs.add(data.getString("thumbnail_src")); - } - addURLToDownload(new URL(getOriginalUrl(data.getString("thumbnail_src"))), image_date); - } else { - addURLToDownload(new URL(getVideoFromPage(data.getString("code"))), image_date); - } - } catch (MalformedURLException e) { - return imageURLs; - } - nextPageID = data.getString("id"); - - - if (isThisATest()) { - break; - } + JSONObject json = new JSONObject(); + try { + json = getJSONFromPage(doc); + } catch (IOException e) { + logger.warn("Unable to exact json from page"); } - if (!nextPageID.equals("") && !isThisATest()) { - try { - // Sleep for a while to avoid a ban - sleep(2500); - getURLsFromJSON(getJSONFromPage("https://www.instagram.com/" + userID + "/?max_id=" + nextPageID)); - } catch (IOException e){ return imageURLs;} + + Pattern p = Pattern.compile("^.*instagram\\.com/([a-zA-Z0-9\\-_.]+)/?"); + Matcher m = p.matcher(url.toExternalForm()); + if (m.matches()) { + JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); + JSONArray datas = profilePage.getJSONObject(0).getJSONObject("user").getJSONObject("media").getJSONArray("nodes"); + for (int i = 0; i < datas.length(); i++) { + JSONObject data = (JSONObject) datas.get(i); + Long epoch = data.getLong("date"); + Instant instant = Instant.ofEpochSecond(epoch); + String image_date = DateTimeFormatter.ofPattern("yyyy_MM_dd_hh:mm_").format(ZonedDateTime.ofInstant(instant, ZoneOffset.UTC)); + try { + if (!data.getBoolean("is_video")) { + if (imageURLs.size() == 0) { + // We add this one item to the array because either wise + // the ripper will error out because we returned an empty array + imageURLs.add(data.getString("thumbnail_src")); + } + addURLToDownload(new URL(getOriginalUrl(data.getString("thumbnail_src"))), image_date); + } else { + addURLToDownload(new URL(getVideoFromPage(data.getString("code"))), image_date); + } + } catch (MalformedURLException e) { + return imageURLs; + } + nextPageID = data.getString("id"); + + + if (isThisATest()) { + break; + } + } + // Rip the next page + if (!nextPageID.equals("") && !isThisATest()) { + try { + // Sleep for a while to avoid a ban + sleep(2500); + getURLsFromPage(Http.url("https://www.instagram.com/" + userID + "/?max_id=" + nextPageID).get()); + } catch (IOException e) { + return imageURLs; + } + } + } else { // We're ripping from a single page + if (!doc.select("meta[property=og:video]").attr("content").equals("")) { + String videoURL = doc.select("meta[property=og:video]").attr("content"); + // We're ripping a page with a video on it + imageURLs.add(videoURL); + } else { + // We're ripping a picture + imageURLs.add(doc.select("meta[property=og:image]").attr("content")); + } + } return imageURLs; } From 07f180ee703144b38fefab52ea6bf16a64a5e47c Mon Sep 17 00:00:00 2001 From: cyian-1756 <devnull64@vfemail.net> Date: Sun, 12 Nov 2017 05:00:39 -0500 Subject: [PATCH 3/3] Removed repetitive func --- .../ripme/ripper/rippers/InstagramRipper.java | 36 ++----------------- 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index db987535..657c6ee2 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -65,39 +65,6 @@ public class InstagramRipper extends AbstractHTMLRipper { throw new MalformedURLException("Unable to find user in " + url); } -// @Override -// public URL sanitizeURL(URL url) throws MalformedURLException { -// Pattern p = Pattern.compile("^.*instagram\\.com/([a-zA-Z0-9\\-_.]+).*$"); -// Matcher m = p.matcher(url.toExternalForm()); -// if (m.matches()) { -// return new URL("http://instagram.com/" + m.group(1)); -// } -// -// throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url); -// } - - private String getUserID(URL url) throws IOException { - - Pattern p = Pattern.compile("^https?://instagram\\.com/([^/]+)"); - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - return m.group(1); - } - - p = Pattern.compile("^https?://www.instagram.com/([^/]+)/?"); - m = p.matcher(url.toExternalForm()); - if (m.matches()) { - return m.group(1); - } - - p = Pattern.compile("^https?://(www.)?instagram.com/p/[a-zA-Z0-9_-]+/\\?taken-by=([^/]+)"); - m = p.matcher(url.toExternalForm()); - if (m.matches()) { - return m.group(1); - } - - throw new IOException("Unable to find userID at " + this.url); - } private JSONObject getJSONFromPage(Document firstPage) throws IOException { String jsonText = ""; try { @@ -115,7 +82,7 @@ public class InstagramRipper extends AbstractHTMLRipper { @Override public Document getFirstPage() throws IOException { - userID = getUserID(url); + userID = getGID(url); return Http.url(url).get(); } @@ -131,6 +98,7 @@ public class InstagramRipper extends AbstractHTMLRipper { private String getOriginalUrl(String imageURL) { imageURL = imageURL.replaceAll("scontent.cdninstagram.com/hphotos-", "igcdn-photos-d-a.akamaihd.net/hphotos-ak-"); + // TODO replace this with a single regex imageURL = imageURL.replaceAll("p150x150/", ""); imageURL = imageURL.replaceAll("p320x320/", ""); imageURL = imageURL.replaceAll("p480x480/", "");