Merge pull request #504 from cyian-1756/ig-nextpage-fix

Ig nextpage fix
This commit is contained in:
cyian-1756 2018-04-10 18:51:22 -04:00 committed by GitHub
commit d5e89d90c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -3,6 +3,7 @@ package com.rarchives.ripme.ripper.rippers;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.net.URLConnection; import java.net.URLConnection;
@ -12,6 +13,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.security.*;
import org.json.JSONArray; import org.json.JSONArray;
import org.json.JSONException; import org.json.JSONException;
@ -20,6 +22,7 @@ import org.json.JSONObject;
import com.rarchives.ripme.ripper.AbstractHTMLRipper; import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Http; import com.rarchives.ripme.utils.Http;
import org.jsoup.Connection;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import com.rarchives.ripme.ui.RipStatusMessage; import com.rarchives.ripme.ui.RipStatusMessage;
@ -34,6 +37,10 @@ public class InstagramRipper extends AbstractHTMLRipper {
private String tagName; private String tagName;
private String userID; private String userID;
private String rhx_gis = null;
private String csrftoken;
public InstagramRipper(URL url) throws IOException { public InstagramRipper(URL url) throws IOException {
super(url); super(url);
@ -178,7 +185,10 @@ public class InstagramRipper extends AbstractHTMLRipper {
@Override @Override
public Document getFirstPage() throws IOException { public Document getFirstPage() throws IOException {
Document p = Http.url(url).get(); Connection.Response resp = Http.url(url).response();
logger.info(resp.cookies());
csrftoken = resp.cookie("csrftoken");
Document p = resp.parse();
// Get the query hash so we can download the next page // Get the query hash so we can download the next page
qHash = getQHash(p); qHash = getQHash(p);
return p; return p;
@ -234,7 +244,10 @@ public class InstagramRipper extends AbstractHTMLRipper {
logger.warn("Unable to exact json from page"); logger.warn("Unable to exact json from page");
} }
// get the rhx_gis value so we can get the next page later on
if (rhx_gis == null) {
rhx_gis = json.getString("rhx_gis");
}
if (!url.toExternalForm().contains("/p/")) { if (!url.toExternalForm().contains("/p/")) {
JSONArray datas = new JSONArray(); JSONArray datas = new JSONArray();
if (!rippingTag) { if (!rippingTag) {
@ -314,18 +327,41 @@ public class InstagramRipper extends AbstractHTMLRipper {
return imageURLs; return imageURLs;
} }
/**
 * Builds the signature sent as the {@code x-instagram-gis} request header.
 *
 * The signature is the lowercase hexadecimal MD5 digest of the string
 * {@code rhx_gis:csrftoken:USER_AGENT:variables}, encoded as UTF-8.
 *
 * @param variables the JSON text passed as the {@code variables} query parameter
 *                  of the request being signed
 * @return the 32-character lowercase hex digest, or {@code null} if the runtime
 *         has no MD5 implementation (the failure is logged before returning)
 */
private String getIGGis(String variables) {
    String stringToMD5 = rhx_gis + ":" + csrftoken + ":" + USER_AGENT + ":" + variables;
    logger.debug("String to md5 is \"" + stringToMD5 + "\"");
    try {
        MessageDigest md = MessageDigest.getInstance("MD5");
        // A Charset constant instead of the "UTF-8" name lookup: no checked
        // UnsupportedEncodingException to handle for an encoding that always exists.
        byte[] hash = md.digest(stringToMD5.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        // Two hex characters per digest byte.
        StringBuilder sb = new StringBuilder(hash.length * 2);
        for (byte b : hash) {
            // Mask to 0..255 so negative bytes do not sign-extend into "ffffffxx".
            sb.append(String.format("%02x", b & 0xFF));
        }
        return sb.toString();
    } catch (NoSuchAlgorithmException e) {
        // Keep the null return for existing callers, but never fail silently.
        logger.error("MD5 digest is unavailable; cannot compute x-instagram-gis", e);
        return null;
    }
}
@Override @Override
public Document getNextPage(Document doc) throws IOException { public Document getNextPage(Document doc) throws IOException {
Document toreturn; Document toreturn;
java.util.Map<String, String> cookies = new HashMap<String, String>(); java.util.Map<String, String> cookies = new HashMap<String, String>();
// This shouldn't be hardcoded and will break one day // This shouldn't be hardcoded and will break one day
cookies.put("ig_pr", "1"); cookies.put("ig_pr", "1");
cookies.put("csrftoken", csrftoken);
if (!nextPageID.equals("") && !isThisATest()) { if (!nextPageID.equals("") && !isThisATest()) {
if (rippingTag) { if (rippingTag) {
try { try {
sleep(2500); sleep(2500);
String vars = "{\"tag_name\":\"" + tagName + "\",\"first\":4,\"after\":\"" + nextPageID + "\"}";
String ig_gis = getIGGis(vars);
toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash + toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash +
"&variables={\"tag_name\":\"" + tagName + "\",\"first\":4,\"after\":\"" + nextPageID + "\"}").cookies(cookies).ignoreContentType().get(); "&variables=" + vars).header("x-instagram-gis", ig_gis).cookies(cookies).ignoreContentType().get();
// Sleep for a while to avoid a ban // Sleep for a while to avoid a ban
logger.info(toreturn.html()); logger.info(toreturn.html());
return toreturn; return toreturn;
@ -338,8 +374,11 @@ public class InstagramRipper extends AbstractHTMLRipper {
try { try {
// Sleep for a while to avoid a ban // Sleep for a while to avoid a ban
sleep(2500); sleep(2500);
toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" + String vars = "{\"id\":\"" + userID + "\",\"first\":100,\"after\":\"" + nextPageID + "\"}";
"{\"id\":\"" + userID + "\",\"first\":100,\"after\":\"" + nextPageID + "\"}").cookies(cookies).ignoreContentType().get(); String ig_gis = getIGGis(vars);
logger.info(ig_gis);
toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" + vars
).header("x-instagram-gis", ig_gis).cookies(cookies).ignoreContentType().get();
if (!pageHasImages(toreturn)) { if (!pageHasImages(toreturn)) {
throw new IOException("No more pages"); throw new IOException("No more pages");
} }
@ -358,7 +397,6 @@ public class InstagramRipper extends AbstractHTMLRipper {
} }
private boolean pageHasImages(Document doc) { private boolean pageHasImages(Document doc) {
logger.info("BAD DATA: " + stripHTMLTags(doc.html()));
JSONObject json = new JSONObject(stripHTMLTags(doc.html())); JSONObject json = new JSONObject(stripHTMLTags(doc.html()));
int numberOfImages = json.getJSONObject("data").getJSONObject("user") int numberOfImages = json.getJSONObject("data").getJSONObject("user")
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length(); .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length();