Merge pull request #450 from cyian-1756/san-url-history

Added the normalizeUrl func, which allows a ripper to normalize a URL before adding it to the URL history or checking whether it is already in the URL history; the Instagram ripper now uses this func
This commit is contained in:
cyian-1756 2018-03-06 06:47:41 -05:00 committed by GitHub
commit 543d954941
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 22 additions and 0 deletions

View File

@ -61,7 +61,13 @@ public abstract class AbstractRipper
}
}
/**
* Adds a URL to the url history file
* @param downloadedURL URL to add to the history file (it is normalized before being written)
*/
private void writeDownloadedURL(String downloadedURL) throws IOException {
downloadedURL = normalizeUrl(downloadedURL);
BufferedWriter bw = null;
FileWriter fw = null;
try {
@ -86,6 +92,15 @@ public abstract class AbstractRipper
}
}
}
/**
* Normalizes a URL before it is written to, or looked up in, the URL history file.
* Both writeDownloadedURL and hasDownloadedURL pass their URL through this method first.
* This base implementation returns the URL unchanged; a ripper can override it
* (for example, InstagramRipper removes a date signature segment).
* @param url URL to normalize
* @return the normalized URL; here, the same string that was passed in
*/
public String normalizeUrl(String url) {
return url;
}
/**
* Checks to see if Ripme has already downloaded a URL
@ -96,6 +111,7 @@ public abstract class AbstractRipper
*/
private boolean hasDownloadedURL(String url) {
File file = new File(URLHistoryFile);
url = normalizeUrl(url);
try {
Scanner scanner = new Scanner(file);
while (scanner.hasNextLine()) {

View File

@ -52,6 +52,12 @@ public class InstagramRipper extends AbstractHTMLRipper {
return san_url;
}
@Override
/**
 * Normalizes an Instagram URL by removing the date signature from it.
 * Each non-overlapping match of a slash-delimited segment made of exactly
 * eight uppercase letters or digits is collapsed into a single slash.
 * @param url URL to normalize
 * @return the URL with matching segments removed
 */
public String normalizeUrl(String url) {
    // Slash, eight characters from A-Z or 0-9, slash: the date sig segment
    final String dateSigSegment = "/[A-Z0-9]{8}/";
    final String normalized = url.replaceAll(dateSigSegment, "/");
    return normalized;
}
private List<String> getPostsFromSinglePage(Document Doc) {
List<String> imageURLs = new ArrayList<>();
JSONArray datas;