Instagram ripper can now get all pages of a profile

This commit is contained in:
cyian-1756 2018-03-15 13:18:29 -04:00
parent 7356a13da1
commit 545bfce7c9

View File

@ -1,8 +1,11 @@
package com.rarchives.ripme.ripper.rippers; package com.rarchives.ripme.ripper.rippers;
import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.net.URLConnection;
import java.time.*; import java.time.*;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
import java.util.ArrayList; import java.util.ArrayList;
@ -25,6 +28,7 @@ import com.rarchives.ripme.utils.Utils;
public class InstagramRipper extends AbstractHTMLRipper { public class InstagramRipper extends AbstractHTMLRipper {
String nextPageID = ""; String nextPageID = "";
private String qHash;
private String userID; private String userID;
@ -136,7 +140,21 @@ public class InstagramRipper extends AbstractHTMLRipper {
throw new MalformedURLException("Unable to find user in " + url); throw new MalformedURLException("Unable to find user in " + url);
} }
/**
 * Strips the wrapper tags Jsoup adds when it parses a bare JSON response
 * as HTML, leaving only the JSON text so it can be fed to JSONObject.
 *
 * @param t the rendered page HTML containing embedded JSON
 * @return t with the surrounding html/head/body tags removed
 */
private String stripHTMLTags(String t) {
    t = t.replaceAll("<html>\n" +
            " <head></head>\n" +
            " <body>", "");
    // BUG FIX: String.replaceAll returns a new String; the original code
    // discarded this result, so the closing tags were never removed.
    t = t.replaceAll("</body>\n" +
            "</html>", "");
    return t;
}
private JSONObject getJSONFromPage(Document firstPage) throws IOException { private JSONObject getJSONFromPage(Document firstPage) throws IOException {
// Check if this page is HTML + JSON or just JSON
if (!firstPage.html().contains("window._sharedData =")) {
return new JSONObject(stripHTMLTags(firstPage.html()));
}
String jsonText = ""; String jsonText = "";
try { try {
for (Element script : firstPage.select("script[type=text/javascript]")) { for (Element script : firstPage.select("script[type=text/javascript]")) {
@ -153,8 +171,10 @@ public class InstagramRipper extends AbstractHTMLRipper {
@Override @Override
public Document getFirstPage() throws IOException { public Document getFirstPage() throws IOException {
userID = getGID(url); Document p = Http.url(url).get();
return Http.url(url).get(); // Get the query hash so we can download the next page
qHash = getQHash(p);
return p;
} }
private String getVideoFromPage(String videoID) { private String getVideoFromPage(String videoID) {
@ -210,14 +230,15 @@ public class InstagramRipper extends AbstractHTMLRipper {
if (!url.toExternalForm().contains("/p/")) { if (!url.toExternalForm().contains("/p/")) {
JSONArray datas = new JSONArray(); JSONArray datas = new JSONArray();
// This first try only works on data from the first page
try { try {
JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
userID = profilePage.getJSONObject(0).getString("logging_page_id").replaceAll("profilePage_", "");
datas = profilePage.getJSONObject(0).getJSONObject("graphql").getJSONObject("user") datas = profilePage.getJSONObject(0).getJSONObject("graphql").getJSONObject("user")
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges"); .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
} catch (JSONException e) { } catch (JSONException e) {
// Handle hashtag pages datas = json.getJSONObject("data").getJSONObject("user")
datas = json.getJSONObject("entry_data").getJSONArray("TagPage").getJSONObject(0) .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
.getJSONObject("tag").getJSONObject("media").getJSONArray("nodes");
} }
for (int i = 0; i < datas.length(); i++) { for (int i = 0; i < datas.length(); i++) {
JSONObject data = (JSONObject) datas.get(i); JSONObject data = (JSONObject) datas.get(i);
@ -281,14 +302,11 @@ public class InstagramRipper extends AbstractHTMLRipper {
// Sleep for a while to avoid a ban // Sleep for a while to avoid a ban
sleep(2500); sleep(2500);
if (url.toExternalForm().substring(url.toExternalForm().length() - 1).equals("/")) { if (url.toExternalForm().substring(url.toExternalForm().length() - 1).equals("/")) {
toreturn = Http.url(url.toExternalForm() + "?max_id=" + nextPageID).get(); toreturn = Http.url(url.toExternalForm() + "?max_id=" + nextPageID).ignoreContentType().get();
} else { } else {
toreturn = Http.url(url.toExternalForm() + "/?max_id=" + nextPageID).get(); toreturn = Http.url(url.toExternalForm() + "/?max_id=" + nextPageID).ignoreContentType().get();
} }
logger.info(toreturn.html()); logger.info(toreturn.html());
if (!hasImage(toreturn)) {
throw new IOException("No more pages");
}
return toreturn; return toreturn;
} catch (IOException e) { } catch (IOException e) {
@ -299,8 +317,9 @@ public class InstagramRipper extends AbstractHTMLRipper {
try { try {
// Sleep for a while to avoid a ban // Sleep for a while to avoid a ban
sleep(2500); sleep(2500);
toreturn = Http.url("https://www.instagram.com/" + userID + "/?max_id=" + nextPageID).get(); toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" +
if (!hasImage(toreturn)) { "{\"id\":\"" + userID + "\",\"first\":100,\"after\":\"" + nextPageID + "\"}").ignoreContentType().get();
if (!pageHasImages(toreturn)) {
throw new IOException("No more pages"); throw new IOException("No more pages");
} }
return toreturn; return toreturn;
@ -317,20 +336,46 @@ public class InstagramRipper extends AbstractHTMLRipper {
addURLToDownload(url); addURLToDownload(url);
} }
private boolean hasImage(Document doc) { private boolean pageHasImages(Document doc) {
try { JSONObject json = new JSONObject(stripHTMLTags(doc.html()));
JSONObject json = getJSONFromPage(doc); int numberOfImages = json.getJSONObject("data").getJSONObject("user")
JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage"); .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length();
JSONArray datas = profilePage.getJSONObject(0).getJSONObject("graphql").getJSONObject("user") if (numberOfImages == 0) {
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
logger.info(datas.length());
if (datas.length() == 0) {
return false; return false;
} }
return true; return true;
}
private String getQHash(Document doc) {
String jsFileURL = "https://www.instagram.com" + doc.select("link[rel=preload]").attr("href");
StringBuilder sb = new StringBuilder();
Document jsPage;
try {
// We can't use Jsoup here because it won't download a non-html file larger than a MB
// even if you set maxBodySize to 0
URLConnection connection = new URL(jsFileURL).openConnection();
BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
String line;
while ((line = in.readLine()) != null) {
sb.append(line);
}
in.close();
} catch (MalformedURLException e) {
logger.info("Unable to get query_hash, " + jsFileURL + " is a malformed URL");
return null;
} catch (IOException e) { } catch (IOException e) {
return false; logger.info("Unable to get query_hash");
logger.info(e.getMessage());
return null;
} }
Pattern jsP = Pattern.compile("o},queryId:.([a-zA-Z0-9]+).");
Matcher m = jsP.matcher(sb.toString());
if (m.find()) {
return m.group(1);
}
logger.info("Could not find query_hash on " + jsFileURL);
return null;
} }