From 5c185c05df8789a25c4bd88746c78d8b87a05810 Mon Sep 17 00:00:00 2001
From: cyian-1756 <devnull64@vfemail.net>
Date: Sat, 18 Nov 2017 03:52:17 -0500
Subject: [PATCH] Improved instagram regex and added sanitizeURL

---
 .../rarchives/ripme/ripper/rippers/InstagramRipper.java | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
index c085059c..6a33e71d 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
@@ -43,6 +43,13 @@ public class InstagramRipper extends AbstractHTMLRipper {
         return (url.getHost().endsWith("instagram.com"));
     }
 
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        URL san_url = new URL(url.toExternalForm().replaceAll("\\?hl=\\S*", ""));
+        logger.info("sanitized URL is " + san_url.toExternalForm());
+        return san_url;
+    }
+
     @Override
     public String getGID(URL url) throws MalformedURLException {
         Pattern p = Pattern.compile("^https?://instagram.com/([^/]+)/?");
@@ -51,7 +58,7 @@ public class InstagramRipper extends AbstractHTMLRipper {
             return m.group(1);
         }
 
-        p = Pattern.compile("^https?://www.instagram.com/([^/]+)/?");
+        p = Pattern.compile("^https?://www.instagram.com/([^/]+)/?(?:\\?hl=\\S*)?/?");
         m = p.matcher(url.toExternalForm());
         if (m.matches()) {
             return m.group(1);