Merge pull request #605 from cyian-1756/instagramFixes

Fixed instagram ripper
commit 86bbeb465b
Authored by cyian-1756 on 2018-05-20 13:38:54 -04:00; committed by GitHub
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23


@@ -19,10 +19,11 @@ import org.json.JSONArray;
 import org.json.JSONException;
 import org.json.JSONObject;
-import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.ripper.AbstractJSONRipper;
 import com.rarchives.ripme.utils.Http;
 import org.jsoup.Connection;
+import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import com.rarchives.ripme.ui.RipStatusMessage;
@@ -30,7 +31,7 @@ import com.rarchives.ripme.utils.Utils;
 import java.util.HashMap;
-public class InstagramRipper extends AbstractHTMLRipper {
+public class InstagramRipper extends AbstractJSONRipper {
     String nextPageID = "";
     private String qHash;
     private boolean rippingTag = false;
@@ -39,6 +40,9 @@ public class InstagramRipper extends AbstractHTMLRipper {
     private String userID;
     private String rhx_gis = null;
     private String csrftoken;
+    // Ran into a weird issue with Jsoup cutting some JSON pages in half; this is a workaround.
+    // See https://github.com/RipMeApp/ripme/issues/601
+    private String workAroundJsonString;
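
Side note, not part of the diff: Jsoup caps response bodies at 1 MB by default, and per issue #601 even lifting the cap did not reliably stop large JSON responses from coming back truncated. A minimal sketch of the kind of Jsoup fetch this commit works around (jsonUrl is a hypothetical stand-in for the GraphQL query URL; throws IOException):

    // Hypothetical illustration of the Jsoup-based fetch being replaced.
    // maxBodySize(0) is documented to remove the 1 MB default cap, but per
    // RipMeApp/ripme#601 large JSON responses could still arrive cut in half.
    Document doc = Jsoup.connect(jsonUrl)
            .ignoreContentType(true) // the response is JSON, not HTML
            .maxBodySize(0)          // 0 = unlimited; default is 1 MB
            .get();
    String json = doc.body().text();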
@@ -73,11 +77,9 @@ public class InstagramRipper extends AbstractHTMLRipper {
         return url.replaceAll("/[A-Z0-9]{8}/", "/");
     }
-    private List<String> getPostsFromSinglePage(Document Doc) {
+    private List<String> getPostsFromSinglePage(JSONObject json) {
         List<String> imageURLs = new ArrayList<>();
         JSONArray datas;
-        try {
-            JSONObject json = getJSONFromPage(Doc);
         if (json.getJSONObject("entry_data").getJSONArray("PostPage")
                 .getJSONObject(0).getJSONObject("graphql").getJSONObject("shortcode_media")
                 .has("edge_sidecar_to_children")) {
@@ -103,10 +105,6 @@ public class InstagramRipper extends AbstractHTMLRipper {
             }
         }
         return imageURLs;
-        } catch (IOException e) {
-            logger.error("Unable to get JSON from page " + url.toExternalForm());
-            return null;
-        }
     }
     @Override
@@ -184,14 +182,14 @@ public class InstagramRipper extends AbstractHTMLRipper {
     }
     @Override
-    public Document getFirstPage() throws IOException {
+    public JSONObject getFirstPage() throws IOException {
         Connection.Response resp = Http.url(url).response();
         logger.info(resp.cookies());
         csrftoken = resp.cookie("csrftoken");
         Document p = resp.parse();
         // Get the query hash so we can download the next page
         qHash = getQHash(p);
-        return p;
+        return getJSONFromPage(p);
     }
     private String getVideoFromPage(String videoID) {
@@ -235,14 +233,8 @@ public class InstagramRipper extends AbstractHTMLRipper {
     }
     @Override
-    public List<String> getURLsFromPage(Document doc) {
+    public List<String> getURLsFromJSON(JSONObject json) {
         List<String> imageURLs = new ArrayList<>();
-        JSONObject json = new JSONObject();
-        try {
-            json = getJSONFromPage(doc);
-        } catch (IOException e) {
-            logger.warn("Unable to exact json from page");
-        }
         // get the rhx_gis value so we can get the next page later on
         if (rhx_gis == null) {
@@ -282,7 +274,7 @@ public class InstagramRipper extends AbstractHTMLRipper {
                 if (data.getString("__typename").equals("GraphSidecar")) {
                     try {
                         Document slideShowDoc = Http.url(new URL("https://www.instagram.com/p/" + data.getString("shortcode"))).get();
-                        List<String> toAdd = getPostsFromSinglePage(slideShowDoc);
+                        List<String> toAdd = getPostsFromSinglePage(getJSONFromPage(slideShowDoc));
                         for (int slideShowInt = 0; slideShowInt < toAdd.size(); slideShowInt++) {
                             addURLToDownload(new URL(toAdd.get(slideShowInt)), image_date + data.getString("shortcode"));
                         }
@@ -321,7 +313,7 @@ public class InstagramRipper extends AbstractHTMLRipper {
         } else { // We're ripping from a single page
             logger.info("Ripping from single page");
-            imageURLs = getPostsFromSinglePage(doc);
+            imageURLs = getPostsFromSinglePage(json);
         }
         return imageURLs;
@@ -348,8 +340,8 @@ public class InstagramRipper extends AbstractHTMLRipper {
     }
     @Override
-    public Document getNextPage(Document doc) throws IOException {
-        Document toreturn;
+    public JSONObject getNextPage(JSONObject json) throws IOException {
+        JSONObject toreturn;
         java.util.Map<String, String> cookies = new HashMap<String, String>();
         // This shouldn't be hardcoded and will break one day
         cookies.put("ig_pr", "1");
@@ -360,10 +352,13 @@ public class InstagramRipper extends AbstractHTMLRipper {
                 sleep(2500);
                 String vars = "{\"tag_name\":\"" + tagName + "\",\"first\":4,\"after\":\"" + nextPageID + "\"}";
                 String ig_gis = getIGGis(vars);
-                toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash +
-                        "&variables=" + vars).header("x-instagram-gis", ig_gis).cookies(cookies).ignoreContentType().get();
+                toreturn = getPage("https://www.instagram.com/graphql/query/?query_hash=" + qHash +
+                        "&variables=" + vars, ig_gis);
                 // Sleep for a while to avoid a ban
-                logger.info(toreturn.html());
+                logger.info(toreturn);
+                if (!pageHasImages(toreturn)) {
+                    throw new IOException("No more pages");
+                }
                 return toreturn;
             } catch (IOException e) {
@@ -377,8 +372,8 @@ public class InstagramRipper extends AbstractHTMLRipper {
             String vars = "{\"id\":\"" + userID + "\",\"first\":50,\"after\":\"" + nextPageID + "\"}";
             String ig_gis = getIGGis(vars);
             logger.info(ig_gis);
-            toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" + vars
-                    ).header("x-instagram-gis", ig_gis).cookies(cookies).ignoreContentType().get();
+            toreturn = getPage("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" + vars, ig_gis);
             if (!pageHasImages(toreturn)) {
                 throw new IOException("No more pages");
             }
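
For reference: getIGGis is not touched by this diff. At the time, Instagram's x-instagram-gis header was generally computed as the MD5 of rhx_gis + ":" + the variables string; a minimal sketch under that assumption (not necessarily ripme's exact implementation; needs java.security.MessageDigest, java.security.NoSuchAlgorithmException, and java.nio.charset.StandardCharsets):

    // Hypothetical sketch: sign the GraphQL variables with rhx_gis, assuming
    // the md5(rhx_gis + ":" + variables) scheme Instagram used circa 2018.
    private String getIGGis(String variables) {
        try {
            MessageDigest md5 = MessageDigest.getInstance("MD5");
            byte[] hash = md5.digest((rhx_gis + ":" + variables).getBytes(StandardCharsets.UTF_8));
            StringBuilder hex = new StringBuilder();
            for (byte b : hash) {
                hex.append(String.format("%02x", b)); // zero-padded lowercase hex
            }
            return hex.toString();
        } catch (NoSuchAlgorithmException e) {
            return null; // MD5 is available on all standard JVMs
        }
    }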
@@ -396,8 +391,7 @@ public class InstagramRipper extends AbstractHTMLRipper {
         addURLToDownload(url);
     }
-    private boolean pageHasImages(Document doc) {
-        JSONObject json = new JSONObject(stripHTMLTags(doc.html()));
+    private boolean pageHasImages(JSONObject json) {
         int numberOfImages = json.getJSONObject("data").getJSONObject("user")
                 .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length();
         if (numberOfImages == 0) {
@@ -406,6 +400,34 @@ public class InstagramRipper extends AbstractHTMLRipper {
         return true;
     }
+    private JSONObject getPage(String url, String ig_gis) {
+        StringBuilder sb = new StringBuilder();
+        try {
+            // We can't use Jsoup here because it won't download a non-HTML file larger
+            // than 1 MB, even if you set maxBodySize to 0
+            URLConnection connection = new URL(url).openConnection();
+            connection.setRequestProperty("User-Agent", USER_AGENT);
+            connection.setRequestProperty("x-instagram-gis", ig_gis);
+            BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
+            String line;
+            while ((line = in.readLine()) != null) {
+                sb.append(line);
+            }
+            in.close();
+            workAroundJsonString = sb.toString();
+            return new JSONObject(sb.toString());
+        } catch (MalformedURLException e) {
+            logger.info("Unable to get query_hash, " + url + " is a malformed URL");
+            return null;
+        } catch (IOException e) {
+            logger.info("Unable to get query_hash");
+            logger.info(e.getMessage());
+            return null;
+        }
+    }
     private String getQHash(Document doc) {
         String jsFileURL = "https://www.instagram.com" + doc.select("link[rel=preload]").attr("href");
         StringBuilder sb = new StringBuilder();
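
Usage note: with these overrides in place, the ripper is driven by AbstractJSONRipper's paging loop rather than the HTML one. A simplified sketch of how such a loop consumes getFirstPage / getURLsFromJSON / getNextPage (a hypothetical simplification, not ripme's actual base-class code, which also handles threading, status observers, and duplicate-URL tracking):

    // Simplified, hypothetical paging loop for a JSON ripper.
    try {
        JSONObject page = getFirstPage();
        int index = 0;
        while (page != null) {
            for (String imageURL : getURLsFromJSON(page)) {
                downloadURL(new URL(imageURL), ++index); // the ripper's per-URL download hook
            }
            page = getNextPage(page); // throws IOException ("No more pages") when exhausted
        }
    } catch (IOException e) {
        // end of gallery or network failure; stop ripping
    }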