Instagram ripper can now get all pages of a profile
This commit is contained in:
parent
7356a13da1
commit
545bfce7c9
@ -1,8 +1,11 @@
|
|||||||
package com.rarchives.ripme.ripper.rippers;
|
package com.rarchives.ripme.ripper.rippers;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
|
import java.net.URLConnection;
|
||||||
import java.time.*;
|
import java.time.*;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -25,6 +28,7 @@ import com.rarchives.ripme.utils.Utils;
|
|||||||
|
|
||||||
public class InstagramRipper extends AbstractHTMLRipper {
|
public class InstagramRipper extends AbstractHTMLRipper {
|
||||||
String nextPageID = "";
|
String nextPageID = "";
|
||||||
|
private String qHash;
|
||||||
|
|
||||||
private String userID;
|
private String userID;
|
||||||
|
|
||||||
@ -136,7 +140,21 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
throw new MalformedURLException("Unable to find user in " + url);
|
throw new MalformedURLException("Unable to find user in " + url);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String stripHTMLTags(String t) {
|
||||||
|
t = t.replaceAll("<html>\n" +
|
||||||
|
" <head></head>\n" +
|
||||||
|
" <body>", "");
|
||||||
|
t.replaceAll("</body>\n" +
|
||||||
|
"</html>", "");
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private JSONObject getJSONFromPage(Document firstPage) throws IOException {
|
private JSONObject getJSONFromPage(Document firstPage) throws IOException {
|
||||||
|
// Check if this page is HTML + JSON or just JSON
|
||||||
|
if (!firstPage.html().contains("window._sharedData =")) {
|
||||||
|
return new JSONObject(stripHTMLTags(firstPage.html()));
|
||||||
|
}
|
||||||
String jsonText = "";
|
String jsonText = "";
|
||||||
try {
|
try {
|
||||||
for (Element script : firstPage.select("script[type=text/javascript]")) {
|
for (Element script : firstPage.select("script[type=text/javascript]")) {
|
||||||
@ -153,8 +171,10 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Document getFirstPage() throws IOException {
|
public Document getFirstPage() throws IOException {
|
||||||
userID = getGID(url);
|
Document p = Http.url(url).get();
|
||||||
return Http.url(url).get();
|
// Get the query hash so we can download the next page
|
||||||
|
qHash = getQHash(p);
|
||||||
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getVideoFromPage(String videoID) {
|
private String getVideoFromPage(String videoID) {
|
||||||
@ -210,14 +230,15 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
|
|
||||||
if (!url.toExternalForm().contains("/p/")) {
|
if (!url.toExternalForm().contains("/p/")) {
|
||||||
JSONArray datas = new JSONArray();
|
JSONArray datas = new JSONArray();
|
||||||
|
// This first try only works on data from the first page
|
||||||
try {
|
try {
|
||||||
JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
|
JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
|
||||||
|
userID = profilePage.getJSONObject(0).getString("logging_page_id").replaceAll("profilePage_", "");
|
||||||
datas = profilePage.getJSONObject(0).getJSONObject("graphql").getJSONObject("user")
|
datas = profilePage.getJSONObject(0).getJSONObject("graphql").getJSONObject("user")
|
||||||
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
|
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
|
||||||
} catch (JSONException e) {
|
} catch (JSONException e) {
|
||||||
// Handle hashtag pages
|
datas = json.getJSONObject("data").getJSONObject("user")
|
||||||
datas = json.getJSONObject("entry_data").getJSONArray("TagPage").getJSONObject(0)
|
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
|
||||||
.getJSONObject("tag").getJSONObject("media").getJSONArray("nodes");
|
|
||||||
}
|
}
|
||||||
for (int i = 0; i < datas.length(); i++) {
|
for (int i = 0; i < datas.length(); i++) {
|
||||||
JSONObject data = (JSONObject) datas.get(i);
|
JSONObject data = (JSONObject) datas.get(i);
|
||||||
@ -281,14 +302,11 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
// Sleep for a while to avoid a ban
|
// Sleep for a while to avoid a ban
|
||||||
sleep(2500);
|
sleep(2500);
|
||||||
if (url.toExternalForm().substring(url.toExternalForm().length() - 1).equals("/")) {
|
if (url.toExternalForm().substring(url.toExternalForm().length() - 1).equals("/")) {
|
||||||
toreturn = Http.url(url.toExternalForm() + "?max_id=" + nextPageID).get();
|
toreturn = Http.url(url.toExternalForm() + "?max_id=" + nextPageID).ignoreContentType().get();
|
||||||
} else {
|
} else {
|
||||||
toreturn = Http.url(url.toExternalForm() + "/?max_id=" + nextPageID).get();
|
toreturn = Http.url(url.toExternalForm() + "/?max_id=" + nextPageID).ignoreContentType().get();
|
||||||
}
|
}
|
||||||
logger.info(toreturn.html());
|
logger.info(toreturn.html());
|
||||||
if (!hasImage(toreturn)) {
|
|
||||||
throw new IOException("No more pages");
|
|
||||||
}
|
|
||||||
return toreturn;
|
return toreturn;
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
@ -299,8 +317,9 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
try {
|
try {
|
||||||
// Sleep for a while to avoid a ban
|
// Sleep for a while to avoid a ban
|
||||||
sleep(2500);
|
sleep(2500);
|
||||||
toreturn = Http.url("https://www.instagram.com/" + userID + "/?max_id=" + nextPageID).get();
|
toreturn = Http.url("https://www.instagram.com/graphql/query/?query_hash=" + qHash + "&variables=" +
|
||||||
if (!hasImage(toreturn)) {
|
"{\"id\":\"" + userID + "\",\"first\":100,\"after\":\"" + nextPageID + "\"}").ignoreContentType().get();
|
||||||
|
if (!pageHasImages(toreturn)) {
|
||||||
throw new IOException("No more pages");
|
throw new IOException("No more pages");
|
||||||
}
|
}
|
||||||
return toreturn;
|
return toreturn;
|
||||||
@ -317,20 +336,46 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
addURLToDownload(url);
|
addURLToDownload(url);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasImage(Document doc) {
|
private boolean pageHasImages(Document doc) {
|
||||||
try {
|
JSONObject json = new JSONObject(stripHTMLTags(doc.html()));
|
||||||
JSONObject json = getJSONFromPage(doc);
|
int numberOfImages = json.getJSONObject("data").getJSONObject("user")
|
||||||
JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
|
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length();
|
||||||
JSONArray datas = profilePage.getJSONObject(0).getJSONObject("graphql").getJSONObject("user")
|
if (numberOfImages == 0) {
|
||||||
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges");
|
|
||||||
logger.info(datas.length());
|
|
||||||
if (datas.length() == 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
} catch (IOException e) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getQHash(Document doc) {
|
||||||
|
String jsFileURL = "https://www.instagram.com" + doc.select("link[rel=preload]").attr("href");
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
Document jsPage;
|
||||||
|
try {
|
||||||
|
// We can't use Jsoup here because it won't download a non-html file larger than a MB
|
||||||
|
// even if you set maxBodySize to 0
|
||||||
|
URLConnection connection = new URL(jsFileURL).openConnection();
|
||||||
|
BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
|
||||||
|
String line;
|
||||||
|
while ((line = in.readLine()) != null) {
|
||||||
|
sb.append(line);
|
||||||
|
}
|
||||||
|
in.close();
|
||||||
|
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
logger.info("Unable to get query_hash, " + jsFileURL + " is a malformed URL");
|
||||||
|
return null;
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.info("Unable to get query_hash");
|
||||||
|
logger.info(e.getMessage());
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
Pattern jsP = Pattern.compile("o},queryId:.([a-zA-Z0-9]+).");
|
||||||
|
Matcher m = jsP.matcher(sb.toString());
|
||||||
|
if (m.find()) {
|
||||||
|
return m.group(1);
|
||||||
|
}
|
||||||
|
logger.info("Could not find query_hash on " + jsFileURL);
|
||||||
|
return null;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user