From e6c43bb48210f55d59725b7158a29adbaa3d977d Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Mon, 5 Mar 2018 13:16:19 -0500 Subject: [PATCH 1/2] Added the normalizeUrl func, which allows a ripper to normalize a url before adding it to the url history or checking whether it is already in the url history --- .../rarchives/ripme/ripper/AbstractRipper.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java index edcff83d..ff6b4102 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java @@ -61,7 +61,13 @@ public abstract class AbstractRipper } } + + /** + * Adds a URL to the url history file + * @param downloadedURL URL to add to the url history file + */ private void writeDownloadedURL(String downloadedURL) throws IOException { + downloadedURL = normalizeUrl(downloadedURL); BufferedWriter bw = null; FileWriter fw = null; try { @@ -86,6 +92,15 @@ public abstract class AbstractRipper } } } + + + /** + * Normalize a URL + * @param url URL to normalize + */ + public String normalizeUrl(String url) { + return url; + } /** * Checks to see if Ripme has already downloaded a URL @@ -96,6 +111,7 @@ public abstract class AbstractRipper */ private boolean hasDownloadedURL(String url) { File file = new File(URLHistoryFile); + url = normalizeUrl(url); try { Scanner scanner = new Scanner(file); while (scanner.hasNextLine()) { From 48ffcf68d37d9bce2e866b4456af8f65dd8f3fdf Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Mon, 5 Mar 2018 13:20:12 -0500 Subject: [PATCH 2/2] Added normalizeUrl --- .../com/rarchives/ripme/ripper/rippers/InstagramRipper.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 076fcfc6..d1f16535 100644 --- 
a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -52,6 +52,12 @@ public class InstagramRipper extends AbstractHTMLRipper { return san_url; } + @Override + public String normalizeUrl(String url) { + // Remove the date sig from the url: each match of a slash-delimited run of exactly eight uppercase letters or digits is replaced by a single slash + return url.replaceAll("/[A-Z0-9]{8}/", "/"); + } + private List getPostsFromSinglePage(Document Doc) { List imageURLs = new ArrayList<>(); JSONArray datas;