Merge pull request #450 from cyian-1756/san-url-history
Added the normalizeUrl func, which allows a ripper to normalize a URL before adding it to the URL history or checking whether it is already in the URL history. The Instagram ripper now uses this func.
This commit is contained in:
commit
543d954941
@ -61,7 +61,13 @@ public abstract class AbstractRipper
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a URL to the url history file
|
||||||
|
* @param downloadedURL URL to add to the url history file
|
||||||
|
*/
|
||||||
private void writeDownloadedURL(String downloadedURL) throws IOException {
|
private void writeDownloadedURL(String downloadedURL) throws IOException {
|
||||||
|
downloadedURL = normalizeUrl(downloadedURL);
|
||||||
BufferedWriter bw = null;
|
BufferedWriter bw = null;
|
||||||
FileWriter fw = null;
|
FileWriter fw = null;
|
||||||
try {
|
try {
|
||||||
@ -86,6 +92,15 @@ public abstract class AbstractRipper
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a URL
|
||||||
|
* @param url URL to normalize
|
||||||
|
*/
|
||||||
|
public String normalizeUrl(String url) {
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks to see if Ripme has already downloaded a URL
|
* Checks to see if Ripme has already downloaded a URL
|
||||||
@ -96,6 +111,7 @@ public abstract class AbstractRipper
|
|||||||
*/
|
*/
|
||||||
private boolean hasDownloadedURL(String url) {
|
private boolean hasDownloadedURL(String url) {
|
||||||
File file = new File(URLHistoryFile);
|
File file = new File(URLHistoryFile);
|
||||||
|
url = normalizeUrl(url);
|
||||||
try {
|
try {
|
||||||
Scanner scanner = new Scanner(file);
|
Scanner scanner = new Scanner(file);
|
||||||
while (scanner.hasNextLine()) {
|
while (scanner.hasNextLine()) {
|
||||||
|
@ -52,6 +52,12 @@ public class InstagramRipper extends AbstractHTMLRipper {
|
|||||||
return san_url;
|
return san_url;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String normalizeUrl(String url) {
|
||||||
|
// Remove the date sig from the url
|
||||||
|
return url.replaceAll("/[A-Z0-9]{8}/", "/");
|
||||||
|
}
|
||||||
|
|
||||||
private List<String> getPostsFromSinglePage(Document Doc) {
|
private List<String> getPostsFromSinglePage(Document Doc) {
|
||||||
List<String> imageURLs = new ArrayList<>();
|
List<String> imageURLs = new ArrayList<>();
|
||||||
JSONArray datas;
|
JSONArray datas;
|
||||||
|
Loading…
Reference in New Issue
Block a user