From 9d9cf61961fd9022155efc57c8f8b647ee103196 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Tue, 21 Nov 2017 16:07:25 -0500 Subject: [PATCH] Added support for ripping single pages from instagram (#239) * Added support for ripping from single pages * Removed whitespace * Instagram folder naming improvements * Added GID tests for instagram single pages * Added some album download tests for instagram * Commented out flaky unit test --- .../ripme/ripper/rippers/InstagramRipper.java | 66 +++++++++++++++---- .../ripper/rippers/InstagramRipperTest.java | 12 ++-- 2 files changed, 61 insertions(+), 17 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 564227f6..337b658d 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -50,6 +50,42 @@ public class InstagramRipper extends AbstractHTMLRipper { return san_url; } + private List getPostsFromSinglePage(Document Doc) { + List imageURLs = new ArrayList<>(); + JSONArray datas; + try { + JSONObject json = getJSONFromPage(Doc); + if (json.getJSONObject("entry_data").getJSONArray("PostPage") + .getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media") + .has("edge_sidecar_to_children")) { + datas = json.getJSONObject("entry_data").getJSONArray("PostPage") + .getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media") + .getJSONObject("edge_sidecar_to_children").getJSONArray("edges"); + for (int i = 0; i < datas.length(); i++) { + JSONObject data = (JSONObject) datas.get(i); + data = data.getJSONObject("node"); + if (data.has("is_video") && data.getBoolean("is_video")) { + imageURLs.add(data.getString("video_url")); + } else { + imageURLs.add(data.getString("display_url")); + } + } + } else { + JSONObject data = json.getJSONObject("entry_data").getJSONArray("PostPage") + 
.getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media"); + if (data.getBoolean("is_video")) { + imageURLs.add(data.getString("video_url")); + } else { + imageURLs.add(data.getString("display_url")); + } + } + return imageURLs; + } catch (IOException e) { + logger.error("Unable to get JSON from page " + url.toExternalForm()); + return null; + } + } + @Override public String getGID(URL url) throws MalformedURLException { Pattern p = Pattern.compile("^https?://instagram.com/([^/]+)/?"); @@ -64,7 +100,19 @@ public class InstagramRipper extends AbstractHTMLRipper { return m.group(1); } - p = Pattern.compile("^https?://www.instagram.com/p/[a-zA-Z0-9_-]+/\\?taken-by=([^/]+)/?"); + p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/\\?taken-by=([^/]+)/?"); + m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(2) + "_" + m.group(1); + } + + p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/?"); + m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); + } + + p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/?(?:\\?hl=\\S*)?/?"); m = p.matcher(url.toExternalForm()); if (m.matches()) { return m.group(1); @@ -148,9 +196,8 @@ public class InstagramRipper extends AbstractHTMLRipper { logger.warn("Unable to exact json from page"); } - Pattern p = Pattern.compile("^.*instagram.com/p/([a-zA-Z0-9\\-_.]+)/?"); - Matcher m = p.matcher(url.toExternalForm()); - if (!m.matches()) { + + if (!url.toExternalForm().contains("/p/")) { JSONArray datas = new JSONArray(); try { JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); @@ -216,16 +263,9 @@ public class InstagramRipper extends AbstractHTMLRipper { } } else { // We're ripping from a single page logger.info("Ripping from single page"); - if (!doc.select("meta[property=og:video]").attr("content").equals("")) { - String videoURL = doc.select("meta[property=og:video]").attr("content"); - // 
We're ripping a page with a video on it - imageURLs.add(videoURL); - } else { - // We're ripping a picture - imageURLs.add(doc.select("meta[property=og:image]").attr("content")); - } - + imageURLs = getPostsFromSinglePage(doc); } + return imageURLs; } diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java index fd989b92..cf9c05e2 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java @@ -15,6 +15,10 @@ public class InstagramRipperTest extends RippersTest { Map testURLs = new HashMap<>(); testURLs.put(new URL("http://instagram.com/Test_User"), "Test_User"); testURLs.put(new URL("http://instagram.com/_test_user_"), "_test_user_"); + testURLs.put(new URL("https://www.instagram.com/p/BZ4egP7njW5/?hl=en"), "BZ4egP7njW5"); + testURLs.put(new URL("https://www.instagram.com/p/BZ4egP7njW5"), "BZ4egP7njW5"); + testURLs.put(new URL("https://www.instagram.com/p/BaNPpaHn2zU/?taken-by=hilaryduff"), "hilaryduff_BaNPpaHn2zU"); + testURLs.put(new URL("https://www.instagram.com/p/BaNPpaHn2zU/"), "BaNPpaHn2zU"); for (URL url : testURLs.keySet()) { InstagramRipper ripper = new InstagramRipper(url); ripper.setup(); @@ -23,15 +27,15 @@ public class InstagramRipperTest extends RippersTest { } } - /* public void testInstagramAlbums() throws IOException { List contentURLs = new ArrayList<>(); - contentURLs.add(new URL("http://instagram.com/anacheri")); + // This unit test is a bit flaky + //contentURLs.add(new URL("https://www.instagram.com/Test_User/")); + contentURLs.add(new URL("https://www.instagram.com/p/BZ4egP7njW5/?hl=en")); + contentURLs.add(new URL("https://www.instagram.com/p/BaNPpaHn2zU/")); for (URL url : contentURLs) { InstagramRipper ripper = new InstagramRipper(url); testRipper(ripper); } } - */ - }