Merge pull request #450 from cyian-1756/san-url-history

Added the normalizeUrl func, which allows a ripper to normalize a URL before adding it to the URL history or checking whether it is already in the URL history. The Instagram ripper now uses this func.
This commit is contained in:
cyian-1756 2018-03-06 06:47:41 -05:00 committed by GitHub
commit 543d954941
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 22 additions and 0 deletions

View File

@ -61,7 +61,13 @@ public abstract class AbstractRipper
} }
} }
/**
* Adds a URL to the url history file
* @param downloadedURL URL to add to the url history file
*/
private void writeDownloadedURL(String downloadedURL) throws IOException { private void writeDownloadedURL(String downloadedURL) throws IOException {
downloadedURL = normalizeUrl(downloadedURL);
BufferedWriter bw = null; BufferedWriter bw = null;
FileWriter fw = null; FileWriter fw = null;
try { try {
@ -87,6 +93,15 @@ public abstract class AbstractRipper
} }
} }
/**
* Normalizes a URL before it is written to, or looked up in, the url history file.
* This default implementation returns the URL unchanged; a ripper can override it
* to strip parts of the URL that should not affect the history comparison.
* @param url URL to normalize
* @return the normalized URL (here, the same string that was passed in)
*/
public String normalizeUrl(String url) {
return url;
}
/** /**
* Checks to see if Ripme has already downloaded a URL * Checks to see if Ripme has already downloaded a URL
* @param url URL to check if downloaded * @param url URL to check if downloaded
@ -96,6 +111,7 @@ public abstract class AbstractRipper
*/ */
private boolean hasDownloadedURL(String url) { private boolean hasDownloadedURL(String url) {
File file = new File(URLHistoryFile); File file = new File(URLHistoryFile);
url = normalizeUrl(url);
try { try {
Scanner scanner = new Scanner(file); Scanner scanner = new Scanner(file);
while (scanner.hasNextLine()) { while (scanner.hasNextLine()) {

View File

@ -52,6 +52,12 @@ public class InstagramRipper extends AbstractHTMLRipper {
return san_url; return san_url;
} }
@Override
public String normalizeUrl(String url) {
    // The date sig shows up as a path segment of exactly eight uppercase
    // letters/digits; collapse that segment (and its slashes) to a single "/".
    final String dateSigSegment = "/[A-Z0-9]{8}/";
    final String withoutDateSig = url.replaceAll(dateSigSegment, "/");
    return withoutDateSig;
}
private List<String> getPostsFromSinglePage(Document Doc) { private List<String> getPostsFromSinglePage(Document Doc) {
List<String> imageURLs = new ArrayList<>(); List<String> imageURLs = new ArrayList<>();
JSONArray datas; JSONArray datas;