Improved Instagram regex and added sanitizeURL

This commit is contained in:
cyian-1756 2017-11-18 03:52:17 -05:00
parent 67db5f3d99
commit 5c185c05df

View File

@@ -43,6 +43,13 @@ public class InstagramRipper extends AbstractHTMLRipper {
return (url.getHost().endsWith("instagram.com"));
}
/**
 * Builds a cleaned copy of the given URL with any {@code ?hl=...} portion removed.
 *
 * <p>Every occurrence of {@code ?hl=} together with the run of non-whitespace
 * characters that follows it is deleted from the URL's external form, and the
 * remaining text is parsed into a new {@link URL}. The result is logged at info level.
 *
 * @param url the URL to clean
 * @return a new URL built from the stripped text
 * @throws MalformedURLException if the stripped text cannot be parsed as a URL
 */
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
    // Drop "?hl=" and all non-whitespace characters that follow it.
    String stripped = url.toExternalForm().replaceAll("\\?hl=\\S*", "");
    URL cleaned = new URL(stripped);
    logger.info("sanitized URL is " + cleaned.toExternalForm());
    return cleaned;
}
@Override
public String getGID(URL url) throws MalformedURLException {
Pattern p = Pattern.compile("^https?://instagram.com/([^/]+)/?");
@@ -51,7 +58,7 @@ public class InstagramRipper extends AbstractHTMLRipper {
return m.group(1);
}
p = Pattern.compile("^https?://www.instagram.com/([^/]+)/?");
p = Pattern.compile("^https?://www.instagram.com/([^/]+)/?(?:\\?hl=\\S*)?/?");
m = p.matcher(url.toExternalForm());
if (m.matches()) {
return m.group(1);