Merge pull request #150 from cyian-1756/ig

Fixed instagram ripper
This commit is contained in:
cyian-1756 2017-11-08 00:23:45 -05:00 committed by GitHub
commit 70d0f0535d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 62 deletions

View File

@ -15,6 +15,9 @@ import org.json.JSONObject;
import com.rarchives.ripme.ripper.AbstractJSONRipper; import com.rarchives.ripme.ripper.AbstractJSONRipper;
import com.rarchives.ripme.utils.Http; import com.rarchives.ripme.utils.Http;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class InstagramRipper extends AbstractJSONRipper { public class InstagramRipper extends AbstractJSONRipper {
private String userID; private String userID;
@ -68,42 +71,36 @@ public class InstagramRipper extends AbstractJSONRipper {
throw new IOException("Unable to find userID at " + this.url); throw new IOException("Unable to find userID at " + this.url);
} }
/**
 * Downloads the page at {@code url} and parses the JSON object that one of
 * its inline scripts assigns to {@code window._sharedData}.
 *
 * @param url address of the page to fetch
 * @return the parsed {@code window._sharedData} object
 * @throws IOException if the page cannot be fetched, or if no script yields
 *         parseable JSON (the parse failure is attached as the cause)
 */
private JSONObject getJSONFromPage(String url) throws IOException {
    final String marker = "window._sharedData = ";
    String jsonText = "";
    Document firstPage = Http.url(url).get();
    for (Element script : firstPage.select("script[type=text/javascript]")) {
        String scriptBody = script.data();
        int markerAt = scriptBody.indexOf(marker);
        if (markerAt < 0) {
            continue;
        }
        // Keep only what follows the assignment. Plain string operations are
        // used on purpose: the marker contains '.', which is a wildcard when
        // the text is interpreted as a regular expression.
        String candidate = scriptBody.substring(markerAt + marker.length()).trim();
        // Drop just the statement terminator. Rewriting every "};" in the
        // payload could corrupt string values that happen to contain it.
        if (candidate.endsWith(";")) {
            candidate = candidate.substring(0, candidate.length() - 1);
        }
        // As before, a later matching script overrides an earlier one.
        jsonText = candidate;
    }
    try {
        // An empty jsonText (no matching script) is rejected here as well.
        return new JSONObject(jsonText);
    } catch (JSONException e) {
        throw new IOException("Could not get JSON from page " + url, e);
    }
}
@Override @Override
public JSONObject getFirstPage() throws IOException { public JSONObject getFirstPage() throws IOException {
userID = getUserID(url); userID = getUserID(url);
return getJSONFromPage("http://instagram.com/" + userID);
}
String baseURL = "http://instagram.com/" + userID + "/media"; private String getVideoFromPage(String videoID) {
try { try {
return Http.url(baseURL).getJSON(); Document doc = Http.url("https://www.instagram.com/p/" + videoID).get();
} catch (JSONException e) { return doc.select("meta[property=og:video]").attr("content");
throw new IOException("Could not get instagram user via: " + baseURL); } catch (IOException e) {
} logger.warn("Unable to get page " + "https://www.instagram.com/p/" + videoID);
}
@Override
public JSONObject getNextPage(JSONObject json) throws IOException {
boolean nextPageAvailable;
try {
nextPageAvailable = json.getBoolean("more_available");
} catch (Exception e) {
throw new IOException("No additional pages found");
}
if (nextPageAvailable) {
JSONArray items = json.getJSONArray("items");
JSONObject last_item = items.getJSONObject(items.length() - 1);
String nextMaxID = last_item.getString("id");
String baseURL = "http://instagram.com/" + userID + "/media/?max_id=" + nextMaxID;
logger.info("Loading " + baseURL);
sleep(1000);
return Http.url(baseURL).getJSON();
} else {
throw new IOException("No more images found");
} }
return "";
} }
private String getOriginalUrl(String imageURL) { private String getOriginalUrl(String imageURL) {
@ -132,53 +129,42 @@ public class InstagramRipper extends AbstractJSONRipper {
return imageURL; return imageURL;
} }
/**
 * Looks up the {@code standard_resolution} URL of a media item.
 *
 * The {@code videos} entry takes precedence: when it is present the
 * {@code images} entry is never consulted, even if the video entry has no
 * {@code standard_resolution}.
 *
 * @param data JSON object describing one media item
 * @return the URL string, or an empty string when neither entry is present
 *         or the chosen entry has a null/missing {@code standard_resolution}
 */
private String getMedia(JSONObject data) {
    final String section;
    if (data.has("videos")) {
        section = "videos";
    } else if (data.has("images")) {
        section = "images";
    } else {
        return "";
    }

    JSONObject variants = data.getJSONObject(section);
    if (variants.isNull("standard_resolution")) {
        return "";
    }
    return variants.getJSONObject("standard_resolution").getString("url");
}
@Override @Override
public List<String> getURLsFromJSON(JSONObject json) { public List<String> getURLsFromJSON(JSONObject json) {
String nextPageID = "";
List<String> imageURLs = new ArrayList<>(); List<String> imageURLs = new ArrayList<>();
JSONArray datas = json.getJSONArray("items"); JSONArray profilePage = json.getJSONObject("entry_data").getJSONArray("ProfilePage");
JSONArray datas = profilePage.getJSONObject(0).getJSONObject("user").getJSONObject("media").getJSONArray("nodes");
for (int i = 0; i < datas.length(); i++) { for (int i = 0; i < datas.length(); i++) {
JSONObject data = (JSONObject) datas.get(i); JSONObject data = (JSONObject) datas.get(i);
try {
String dataType = data.getString("type"); if (!data.getBoolean("is_video")) {
if (dataType.equals("carousel")) { if (imageURLs.size() == 0) {
JSONArray carouselMedias = data.getJSONArray("carousel_media"); // We add this one item to the array because otherwise
for (int carouselIndex = 0; carouselIndex < carouselMedias.length(); carouselIndex++) { // the ripper will error out because we returned an empty array
JSONObject carouselMedia = (JSONObject) carouselMedias.get(carouselIndex); imageURLs.add(data.getString("thumbnail_src"));
String imageURL = getMedia(carouselMedia);
if (!imageURL.equals("")) {
imageURL = getOriginalUrl(imageURL);
imageURLs.add(imageURL);
}
} }
addURLToDownload(new URL(getOriginalUrl(data.getString("thumbnail_src"))));
} else { } else {
String imageURL = getMedia(data); addURLToDownload(new URL(getVideoFromPage(data.getString("code"))));
if (!imageURL.equals("")) {
imageURL = getOriginalUrl(imageURL);
imageURLs.add(imageURL);
} }
} catch (MalformedURLException e) {
return imageURLs;
} }
nextPageID = data.getString("id");
if (isThisATest()) { if (isThisATest()) {
break; break;
} }
} }
if (!nextPageID.equals("") && !isThisATest()) {
try {
// Sleep for a while to avoid a ban
sleep(2500);
getURLsFromJSON(getJSONFromPage("https://www.instagram.com/" + userID + "/?max_id=" + nextPageID));
} catch (IOException e){ return imageURLs;}
}
return imageURLs; return imageURLs;
} }

View File

@ -15,7 +15,6 @@ public class InstagramRipperTest extends RippersTest {
Map<URL, String> testURLs = new HashMap<>(); Map<URL, String> testURLs = new HashMap<>();
testURLs.put(new URL("http://instagram.com/Test_User"), "Test_User"); testURLs.put(new URL("http://instagram.com/Test_User"), "Test_User");
testURLs.put(new URL("http://instagram.com/_test_user_"), "_test_user_"); testURLs.put(new URL("http://instagram.com/_test_user_"), "_test_user_");
testURLs.put(new URL("http://instagram.com/-test-user-"), "-test-user-");
for (URL url : testURLs.keySet()) { for (URL url : testURLs.keySet()) {
InstagramRipper ripper = new InstagramRipper(url); InstagramRipper ripper = new InstagramRipper(url);
ripper.setup(); ripper.setup();