Merge pull request #478 from cyian-1756/igFix

Instagram ripper can now rip from tags; fixed JSON parsing issues
This commit is contained in:
cyian-1756 2018-03-17 16:08:39 -04:00 committed by GitHub
commit 2beecc7829
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -29,6 +29,8 @@ import com.rarchives.ripme.utils.Utils;
public class InstagramRipper extends AbstractHTMLRipper { public class InstagramRipper extends AbstractHTMLRipper {
String nextPageID = ""; String nextPageID = "";
private String qHash; private String qHash;
private boolean rippingTag = false;
private String tagName;
private String userID; private String userID;
@ -131,11 +133,13 @@ public class InstagramRipper extends AbstractHTMLRipper {
return m.group(1); return m.group(1);
} }
// p = Pattern.compile("^https?://www.instagram.com/explore/tags/([^/]+)/?"); p = Pattern.compile("^https?://www.instagram.com/explore/tags/([^/]+)/?");
// m = p.matcher(url.toExternalForm()); m = p.matcher(url.toExternalForm());
// if (m.matches()) { if (m.matches()) {
// return m.group(1); rippingTag = true;
// } tagName = m.group(1);
return m.group(1);
}
throw new MalformedURLException("Unable to find user in " + url); throw new MalformedURLException("Unable to find user in " + url);
} }
@ -146,6 +150,8 @@ public class InstagramRipper extends AbstractHTMLRipper {
" <body>", ""); " <body>", "");
t.replaceAll("</body>\n" + t.replaceAll("</body>\n" +
"</html>", ""); "</html>", "");
t = t.replaceAll("\n", "");
t = t.replaceAll("=\"\"", "");
return t; return t;
} }
@ -230,15 +236,26 @@ public class InstagramRipper extends AbstractHTMLRipper {
if (!url.toExternalForm().contains("/p/")) { if (!url.toExternalForm().contains("/p/")) {
JSONArray datas = new JSONArray(); JSONArray datas = new JSONArray();
// This first try only works on data from the first page if (!rippingTag) {
try { // This first try only works on data from the first page
JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); try {
userID = profilePage.getJSONObject(0).getString("logging_page_id").replaceAll("profilePage_", ""); JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
datas = profilePage.getJSONObject(0).getJSONObject("graphql").getJSONObject("user") userID = profilePage.getJSONObject(0).getString("logging_page_id").replaceAll("profilePage_", "");
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges"); datas = profilePage.getJSONObject(0).getJSONObject("graphql").getJSONObject("user")
} catch (JSONException e) { .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
datas = json.getJSONObject("data").getJSONObject("user") } catch (JSONException e) {
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges"); datas = json.getJSONObject("data").getJSONObject("user")
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
}
} else {
try {
JSONArray tagPage = json.getJSONObject("entry_data").getJSONArray("TagPage");
datas = tagPage.getJSONObject(0).getJSONObject("graphql").getJSONObject("hashtag")
.getJSONObject("edge_hashtag_to_media").getJSONArray("edges");
} catch (JSONException e) {
datas = json.getJSONObject("data").getJSONObject("hashtag").getJSONObject("edge_hashtag_to_media")
.getJSONArray("edges");
}
} }
for (int i = 0; i < datas.length(); i++) { for (int i = 0; i < datas.length(); i++) {
JSONObject data = (JSONObject) datas.get(i); JSONObject data = (JSONObject) datas.get(i);
@ -246,17 +263,20 @@ public class InstagramRipper extends AbstractHTMLRipper {
Long epoch = data.getLong("taken_at_timestamp"); Long epoch = data.getLong("taken_at_timestamp");
Instant instant = Instant.ofEpochSecond(epoch); Instant instant = Instant.ofEpochSecond(epoch);
String image_date = DateTimeFormatter.ofPattern("yyyy_MM_dd_hh:mm_").format(ZonedDateTime.ofInstant(instant, ZoneOffset.UTC)); String image_date = DateTimeFormatter.ofPattern("yyyy_MM_dd_hh:mm_").format(ZonedDateTime.ofInstant(instant, ZoneOffset.UTC));
if (data.getString("__typename").equals("GraphSidecar")) { // It looks like tag pages don't have the __typename key
try { if (!rippingTag) {
Document slideShowDoc = Http.url(new URL("https://www.instagram.com/p/" + data.getString("shortcode"))).get(); if (data.getString("__typename").equals("GraphSidecar")) {
List<String> toAdd = getPostsFromSinglePage(slideShowDoc); try {
for (int slideShowInt=0; slideShowInt<toAdd.size(); slideShowInt++) { Document slideShowDoc = Http.url(new URL("https://www.instagram.com/p/" + data.getString("shortcode"))).get();
addURLToDownload(new URL(toAdd.get(slideShowInt)), image_date + data.getString("shortcode")); List<String> toAdd = getPostsFromSinglePage(slideShowDoc);
for (int slideShowInt = 0; slideShowInt < toAdd.size(); slideShowInt++) {
addURLToDownload(new URL(toAdd.get(slideShowInt)), image_date + data.getString("shortcode"));
}
} catch (MalformedURLException e) {
logger.error("Unable to download slide show, URL was malformed");
} catch (IOException e) {
logger.error("Unable to download slide show");
} }
} catch (MalformedURLException e) {
logger.error("Unable to download slide show, URL was malformed");
} catch (IOException e) {
logger.error("Unable to download slide show");
} }
} }
try { try {
@ -297,15 +317,12 @@ public class InstagramRipper extends AbstractHTMLRipper {
public Document getNextPage(Document doc) throws IOException { public Document getNextPage(Document doc) throws IOException {
Document toreturn; Document toreturn;
if (!nextPageID.equals("") && !isThisATest()) { if (!nextPageID.equals("") && !isThisATest()) {
if (url.toExternalForm().contains("/tags/")) { if (rippingTag) {
try { try {
// Sleep for a while to avoid a ban
sleep(2500); sleep(2500);
if (url.toExternalForm().substring(url.toExternalForm().length() - 1).equals("/")) { toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash +
toreturn = Http.url(url.toExternalForm() + "?max_id=" + nextPageID).ignoreContentType().get(); "&variables={\"tag_name\":\"" + tagName + "\",\"first\":4,\"after\":\"" + nextPageID + "\"}").ignoreContentType().get();
} else { // Sleep for a while to avoid a ban
toreturn = Http.url(url.toExternalForm() + "/?max_id=" + nextPageID).ignoreContentType().get();
}
logger.info(toreturn.html()); logger.info(toreturn.html());
return toreturn; return toreturn;
@ -337,6 +354,7 @@ public class InstagramRipper extends AbstractHTMLRipper {
} }
private boolean pageHasImages(Document doc) { private boolean pageHasImages(Document doc) {
logger.info("BAD DATA: " + stripHTMLTags(doc.html()));
JSONObject json = new JSONObject(stripHTMLTags(doc.html())); JSONObject json = new JSONObject(stripHTMLTags(doc.html()));
int numberOfImages = json.getJSONObject("data").getJSONObject("user") int numberOfImages = json.getJSONObject("data").getJSONObject("user")
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length(); .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length();
@ -369,10 +387,18 @@ public class InstagramRipper extends AbstractHTMLRipper {
logger.info(e.getMessage()); logger.info(e.getMessage());
return null; return null;
} }
Pattern jsP = Pattern.compile("o},queryId:.([a-zA-Z0-9]+)."); if (!rippingTag) {
Matcher m = jsP.matcher(sb.toString()); Pattern jsP = Pattern.compile("o},queryId:.([a-zA-Z0-9]+).");
if (m.find()) { Matcher m = jsP.matcher(sb.toString());
return m.group(1); if (m.find()) {
return m.group(1);
}
} else {
Pattern jsP = Pattern.compile("return e.tagMedia.byTagName.get\\(t\\).pagination},queryId:.([a-zA-Z0-9]+).");
Matcher m = jsP.matcher(sb.toString());
if (m.find()) {
return m.group(1);
}
} }
logger.info("Could not find query_hash on " + jsFileURL); logger.info("Could not find query_hash on " + jsFileURL);
return null; return null;