Added support for ripping single pages from instagram (#239)
* Added support for ripping from single pages * Removed whitespace * Instagram folder naming improvements * Added GID tests for instagram single pages * Added some album download tests for instagram * Commented out flaky unit test
This commit is contained in:
parent
8840a2baa1
commit
9d9cf61961
@ -50,6 +50,42 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
return san_url;
|
return san_url;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private List<String> getPostsFromSinglePage(Document Doc) {
|
||||||
|
List<String> imageURLs = new ArrayList<>();
|
||||||
|
JSONArray datas;
|
||||||
|
try {
|
||||||
|
JSONObject json = getJSONFromPage(Doc);
|
||||||
|
if (json.getJSONObject("entry_data").getJSONArray("PostPage")
|
||||||
|
.getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media")
|
||||||
|
.has("edge_sidecar_to_children")) {
|
||||||
|
datas = json.getJSONObject("entry_data").getJSONArray("PostPage")
|
||||||
|
.getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media")
|
||||||
|
.getJSONObject("edge_sidecar_to_children").getJSONArray("edges");
|
||||||
|
for (int i = 0; i < datas.length(); i++) {
|
||||||
|
JSONObject data = (JSONObject) datas.get(i);
|
||||||
|
data = data.getJSONObject("node");
|
||||||
|
if (data.has("is_video") && data.getBoolean("is_video")) {
|
||||||
|
imageURLs.add(data.getString("video_url"));
|
||||||
|
} else {
|
||||||
|
imageURLs.add(data.getString("display_url"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
JSONObject data = json.getJSONObject("entry_data").getJSONArray("PostPage")
|
||||||
|
.getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media");
|
||||||
|
if (data.getBoolean("is_video")) {
|
||||||
|
imageURLs.add(data.getString("video_url"));
|
||||||
|
} else {
|
||||||
|
imageURLs.add(data.getString("display_url"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return imageURLs;
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.error("Unable to get JSON from page " + url.toExternalForm());
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getGID(URL url) throws MalformedURLException {
|
public String getGID(URL url) throws MalformedURLException {
|
||||||
Pattern p = Pattern.compile("^https?://instagram.com/([^/]+)/?");
|
Pattern p = Pattern.compile("^https?://instagram.com/([^/]+)/?");
|
||||||
@ -64,7 +100,19 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
return m.group(1);
|
return m.group(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
p = Pattern.compile("^https?://www.instagram.com/p/[a-zA-Z0-9_-]+/\\?taken-by=([^/]+)/?");
|
p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/\\?taken-by=([^/]+)/?");
|
||||||
|
m = p.matcher(url.toExternalForm());
|
||||||
|
if (m.matches()) {
|
||||||
|
return m.group(2) + "_" + m.group(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/?");
|
||||||
|
m = p.matcher(url.toExternalForm());
|
||||||
|
if (m.matches()) {
|
||||||
|
return m.group(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
p = Pattern.compile("^https?://www.instagram.com/p/([a-zA-Z0-9_-]+)/?(?:\\?hl=\\S*)?/?");
|
||||||
m = p.matcher(url.toExternalForm());
|
m = p.matcher(url.toExternalForm());
|
||||||
if (m.matches()) {
|
if (m.matches()) {
|
||||||
return m.group(1);
|
return m.group(1);
|
||||||
@ -148,9 +196,8 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
logger.warn("Unable to exact json from page");
|
logger.warn("Unable to exact json from page");
|
||||||
}
|
}
|
||||||
|
|
||||||
Pattern p = Pattern.compile("^.*instagram.com/p/([a-zA-Z0-9\\-_.]+)/?");
|
|
||||||
Matcher m = p.matcher(url.toExternalForm());
|
if (!url.toExternalForm().contains("/p/")) {
|
||||||
if (!m.matches()) {
|
|
||||||
JSONArray datas = new JSONArray();
|
JSONArray datas = new JSONArray();
|
||||||
try {
|
try {
|
||||||
JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
|
JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
|
||||||
@ -216,16 +263,9 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
}
|
}
|
||||||
} else { // We're ripping from a single page
|
} else { // We're ripping from a single page
|
||||||
logger.info("Ripping from single page");
|
logger.info("Ripping from single page");
|
||||||
if (!doc.select("meta[property=og:video]").attr("content").equals("")) {
|
imageURLs = getPostsFromSinglePage(doc);
|
||||||
String videoURL = doc.select("meta[property=og:video]").attr("content");
|
|
||||||
// We're ripping a page with a video on it
|
|
||||||
imageURLs.add(videoURL);
|
|
||||||
} else {
|
|
||||||
// We're ripping a picture
|
|
||||||
imageURLs.add(doc.select("meta[property=og:image]").attr("content"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
return imageURLs;
|
return imageURLs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,6 +15,10 @@ public class InstagramRipperTest extends RippersTest {
|
|||||||
Map<URL, String> testURLs = new HashMap<>();
|
Map<URL, String> testURLs = new HashMap<>();
|
||||||
testURLs.put(new URL("http://instagram.com/Test_User"), "Test_User");
|
testURLs.put(new URL("http://instagram.com/Test_User"), "Test_User");
|
||||||
testURLs.put(new URL("http://instagram.com/_test_user_"), "_test_user_");
|
testURLs.put(new URL("http://instagram.com/_test_user_"), "_test_user_");
|
||||||
|
testURLs.put(new URL("https://www.instagram.com/p/BZ4egP7njW5/?hl=en"), "BZ4egP7njW5");
|
||||||
|
testURLs.put(new URL("https://www.instagram.com/p/BZ4egP7njW5"), "BZ4egP7njW5");
|
||||||
|
testURLs.put(new URL("https://www.instagram.com/p/BaNPpaHn2zU/?taken-by=hilaryduff"), "hilaryduff_BaNPpaHn2zU");
|
||||||
|
testURLs.put(new URL("https://www.instagram.com/p/BaNPpaHn2zU/"), "BaNPpaHn2zU");
|
||||||
for (URL url : testURLs.keySet()) {
|
for (URL url : testURLs.keySet()) {
|
||||||
InstagramRipper ripper = new InstagramRipper(url);
|
InstagramRipper ripper = new InstagramRipper(url);
|
||||||
ripper.setup();
|
ripper.setup();
|
||||||
@ -23,15 +27,15 @@ public class InstagramRipperTest extends RippersTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
public void testInstagramAlbums() throws IOException {
|
public void testInstagramAlbums() throws IOException {
|
||||||
List<URL> contentURLs = new ArrayList<>();
|
List<URL> contentURLs = new ArrayList<>();
|
||||||
contentURLs.add(new URL("http://instagram.com/anacheri"));
|
// This unit test is a bit flaky
|
||||||
|
//contentURLs.add(new URL("https://www.instagram.com/Test_User/"));
|
||||||
|
contentURLs.add(new URL("https://www.instagram.com/p/BZ4egP7njW5/?hl=en"));
|
||||||
|
contentURLs.add(new URL("https://www.instagram.com/p/BaNPpaHn2zU/"));
|
||||||
for (URL url : contentURLs) {
|
for (URL url : contentURLs) {
|
||||||
InstagramRipper ripper = new InstagramRipper(url);
|
InstagramRipper ripper = new InstagramRipper(url);
|
||||||
testRipper(ripper);
|
testRipper(ripper);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user