Merge pull request #222 from cyian-1756/ig_hashtag

Added support for ripping from Instagram (IG) hashtag pages
This commit is contained in:
cyian-1756 2017-11-18 09:16:21 -05:00 committed by GitHub
commit ecf72e517e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -69,6 +69,13 @@ public class InstagramRipper extends AbstractHTMLRipper {
if (m.matches()) { if (m.matches()) {
return m.group(1); return m.group(1);
} }
p = Pattern.compile("^https?://www.instagram.com/explore/tags/([^/]+)/?");
m = p.matcher(url.toExternalForm());
if (m.matches()) {
return m.group(1);
}
throw new MalformedURLException("Unable to find user in " + url); throw new MalformedURLException("Unable to find user in " + url);
} }
@ -141,11 +148,18 @@ public class InstagramRipper extends AbstractHTMLRipper {
logger.warn("Unable to exact json from page"); logger.warn("Unable to exact json from page");
} }
Pattern p = Pattern.compile("^.*instagram\\.com/([a-zA-Z0-9\\-_.]+)/?"); Pattern p = Pattern.compile("^.*instagram.com/p/([a-zA-Z0-9\\-_.]+)/?");
Matcher m = p.matcher(url.toExternalForm()); Matcher m = p.matcher(url.toExternalForm());
if (m.matches()) { if (!m.matches()) {
JSONArray datas = new JSONArray();
try {
JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
JSONArray datas = profilePage.getJSONObject(0).getJSONObject("user").getJSONObject("media").getJSONArray("nodes"); datas = profilePage.getJSONObject(0).getJSONObject("user").getJSONObject("media").getJSONArray("nodes");
} catch (JSONException e) {
// Handle hashtag pages
datas = json.getJSONObject("entry_data").getJSONArray("TagPage").getJSONObject(0)
.getJSONObject("tag").getJSONObject("media").getJSONArray("nodes");
}
for (int i = 0; i < datas.length(); i++) { for (int i = 0; i < datas.length(); i++) {
JSONObject data = (JSONObject) datas.get(i); JSONObject data = (JSONObject) datas.get(i);
Long epoch = data.getLong("date"); Long epoch = data.getLong("date");
@ -175,6 +189,21 @@ public class InstagramRipper extends AbstractHTMLRipper {
} }
// Rip the next page // Rip the next page
if (!nextPageID.equals("") && !isThisATest()) { if (!nextPageID.equals("") && !isThisATest()) {
if (url.toExternalForm().contains("/tags/")) {
try {
// Sleep for a while to avoid a ban
sleep(2500);
if (url.toExternalForm().substring(url.toExternalForm().length() - 1).equals("/")) {
getURLsFromPage(Http.url(url.toExternalForm() + "?max_id=" + nextPageID).get());
} else {
getURLsFromPage(Http.url(url.toExternalForm() + "/?max_id=" + nextPageID).get());
}
} catch (IOException e) {
return imageURLs;
}
}
try { try {
// Sleep for a while to avoid a ban // Sleep for a while to avoid a ban
sleep(2500); sleep(2500);
@ -182,8 +211,11 @@ public class InstagramRipper extends AbstractHTMLRipper {
} catch (IOException e) { } catch (IOException e) {
return imageURLs; return imageURLs;
} }
} else {
logger.warn("Can't get net page");
} }
} else { // We're ripping from a single page } else { // We're ripping from a single page
logger.info("Ripping from single page");
if (!doc.select("meta[property=og:video]").attr("content").equals("")) { if (!doc.select("meta[property=og:video]").attr("content").equals("")) {
String videoURL = doc.select("meta[property=og:video]").attr("content"); String videoURL = doc.select("meta[property=og:video]").attr("content");
// We're ripping a page with a video on it // We're ripping a page with a video on it