Added support for ripping single pages from instagram (#239)

* Added support for ripping from single pages

* Removed whitespace

* Instagram folder naming improvements

* Added GID tests for instagram single pages

* Added some album download tests for instagram

* Commented out flaky unit test
This commit is contained in:
cyian-1756 2017-11-21 16:07:25 -05:00 committed by metaprime
parent 8840a2baa1
commit 9d9cf61961
2 changed files with 61 additions and 17 deletions

View File

@ -50,6 +50,42 @@ public class InstagramRipper extends AbstractHTMLRipper {
return san_url; return san_url;
} }
/**
 * Collects the media URLs found on a single Instagram post page.
 *
 * <p>The page's embedded JSON is read via {@code getJSONFromPage}. If the post's
 * {@code shortcode_media} object has an {@code edge_sidecar_to_children} entry, one URL is
 * collected per child node; otherwise a single URL is taken from {@code shortcode_media}
 * itself.
 *
 * @param doc the parsed HTML document of the post page
 * @return the media URLs in page order; an empty list (never {@code null}) when the JSON
 *         could not be read from the page
 */
private List<String> getPostsFromSinglePage(Document doc) {
    List<String> imageURLs = new ArrayList<>();
    try {
        JSONObject json = getJSONFromPage(doc);
        // Resolve the post's media object once instead of re-walking the JSON tree per use.
        JSONObject media = json.getJSONObject("entry_data").getJSONArray("PostPage")
                .getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media");
        if (media.has("edge_sidecar_to_children")) {
            JSONArray edges = media.getJSONObject("edge_sidecar_to_children").getJSONArray("edges");
            for (int i = 0; i < edges.length(); i++) {
                imageURLs.add(getMediaURLFromNode(edges.getJSONObject(i).getJSONObject("node")));
            }
        } else {
            imageURLs.add(getMediaURLFromNode(media));
        }
    } catch (IOException e) {
        // Return the (empty) list rather than null so callers can safely inspect the result.
        logger.error("Unable to get JSON from page " + url.toExternalForm(), e);
    }
    return imageURLs;
}

/**
 * Picks the downloadable URL from a media node: {@code video_url} when the node is flagged
 * {@code is_video}, otherwise {@code display_url}. A node without an {@code is_video} key is
 * treated as an image.
 */
private String getMediaURLFromNode(JSONObject node) {
    boolean isVideo = node.has("is_video") && node.getBoolean("is_video");
    return node.getString(isVideo ? "video_url" : "display_url");
}
@Override @Override
public String getGID(URL url) throws MalformedURLException { public String getGID(URL url) throws MalformedURLException {
Pattern p = Pattern.compile("^https?://instagram.com/([^/]+)/?"); Pattern p = Pattern.compile("^https?://instagram.com/([^/]+)/?");
@ -64,7 +100,19 @@ public class InstagramRipper extends AbstractHTMLRipper {
return m.group(1); return m.group(1);
} }
p = Pattern.compile("^https?://www.instagram.com/p/[a-zA-Z0-9_-]+/\\?taken-by=([^/]+)/?"); p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/\\?taken-by=([^/]+)/?");
m = p.matcher(url.toExternalForm());
if (m.matches()) {
return m.group(2) + "_" + m.group(1);
}
p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/?");
m = p.matcher(url.toExternalForm());
if (m.matches()) {
return m.group(1);
}
p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/?(?:\\?hl=\\S*)?/?");
m = p.matcher(url.toExternalForm()); m = p.matcher(url.toExternalForm());
if (m.matches()) { if (m.matches()) {
return m.group(1); return m.group(1);
@ -148,9 +196,8 @@ public class InstagramRipper extends AbstractHTMLRipper {
logger.warn("Unable to exact json from page"); logger.warn("Unable to exact json from page");
} }
Pattern p = Pattern.compile("^.*instagram.com/p/([a-zA-Z0-9\\-_.]+)/?");
Matcher m = p.matcher(url.toExternalForm()); if (!url.toExternalForm().contains("/p/")) {
if (!m.matches()) {
JSONArray datas = new JSONArray(); JSONArray datas = new JSONArray();
try { try {
JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
@ -216,16 +263,9 @@ public class InstagramRipper extends AbstractHTMLRipper {
} }
} else { // We're ripping from a single page } else { // We're ripping from a single page
logger.info("Ripping from single page"); logger.info("Ripping from single page");
if (!doc.select("meta[property=og:video]").attr("content").equals("")) { imageURLs = getPostsFromSinglePage(doc);
String videoURL = doc.select("meta[property=og:video]").attr("content");
// We're ripping a page with a video on it
imageURLs.add(videoURL);
} else {
// We're ripping a picture
imageURLs.add(doc.select("meta[property=og:image]").attr("content"));
}
} }
return imageURLs; return imageURLs;
} }

View File

@ -15,6 +15,10 @@ public class InstagramRipperTest extends RippersTest {
Map<URL, String> testURLs = new HashMap<>(); Map<URL, String> testURLs = new HashMap<>();
testURLs.put(new URL("http://instagram.com/Test_User"), "Test_User"); testURLs.put(new URL("http://instagram.com/Test_User"), "Test_User");
testURLs.put(new URL("http://instagram.com/_test_user_"), "_test_user_"); testURLs.put(new URL("http://instagram.com/_test_user_"), "_test_user_");
testURLs.put(new URL("https://www.instagram.com/p/BZ4egP7njW5/?hl=en"), "BZ4egP7njW5");
testURLs.put(new URL("https://www.instagram.com/p/BZ4egP7njW5"), "BZ4egP7njW5");
testURLs.put(new URL("https://www.instagram.com/p/BaNPpaHn2zU/?taken-by=hilaryduff"), "hilaryduff_BaNPpaHn2zU");
testURLs.put(new URL("https://www.instagram.com/p/BaNPpaHn2zU/"), "BaNPpaHn2zU");
for (URL url : testURLs.keySet()) { for (URL url : testURLs.keySet()) {
InstagramRipper ripper = new InstagramRipper(url); InstagramRipper ripper = new InstagramRipper(url);
ripper.setup(); ripper.setup();
@ -23,15 +27,15 @@ public class InstagramRipperTest extends RippersTest {
} }
} }
/*
public void testInstagramAlbums() throws IOException { public void testInstagramAlbums() throws IOException {
List<URL> contentURLs = new ArrayList<>(); List<URL> contentURLs = new ArrayList<>();
contentURLs.add(new URL("http://instagram.com/anacheri")); // This unit test is a bit flaky
//contentURLs.add(new URL("https://www.instagram.com/Test_User/"));
contentURLs.add(new URL("https://www.instagram.com/p/BZ4egP7njW5/?hl=en"));
contentURLs.add(new URL("https://www.instagram.com/p/BaNPpaHn2zU/"));
for (URL url : contentURLs) { for (URL url : contentURLs) {
InstagramRipper ripper = new InstagramRipper(url); InstagramRipper ripper = new InstagramRipper(url);
testRipper(ripper); testRipper(ripper);
} }
} }
*/
} }