From 2f4793e9e39145f070608328016bf8c10b7a8b35 Mon Sep 17 00:00:00 2001 From: kas-luthor Date: Sat, 14 Jan 2017 22:45:23 +0100 Subject: [PATCH 1/2] Added utility functions for parsing URL queries Rewrote E621Ripper to not use regexes anymore (therefore interacting better with special chars in URLs) --- .../ripme/ripper/rippers/E621Ripper.java | 139 +++++++++++------- .../java/com/rarchives/ripme/utils/Utils.java | 69 +++++++++ 2 files changed, 151 insertions(+), 57 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java index 190320f9..6f6731d9 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java @@ -6,10 +6,12 @@ import com.rarchives.ripme.ripper.DownloadThreadPool; import com.rarchives.ripme.utils.Http; import com.rarchives.ripme.utils.Utils; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; +import java.net.URLDecoder; import java.util.ArrayList; import java.util.List; import java.util.logging.Level; @@ -25,69 +27,85 @@ import org.jsoup.select.Elements; * @author */ public class E621Ripper extends AbstractHTMLRipper{ - private static Pattern gidPattern=null; - private static Pattern gidPattern2=null; - private static Pattern gidPatternPool=null; +public static final int POOL_IMAGES_PER_PAGE = 24; - private DownloadThreadPool e621ThreadPool=new DownloadThreadPool("e621"); + private DownloadThreadPool e621ThreadPool = new DownloadThreadPool("e621"); public E621Ripper(URL url) throws IOException { super(url); } - + @Override public DownloadThreadPool getThreadPool() { return e621ThreadPool; } - + @Override public String getDomain() { return "e621.net"; } - + @Override public String getHost() { return "e621"; } - + @Override public Document getFirstPage() throws IOException { - if(url.getPath().startsWith("/pool/show/")) - return Http.url("https://e621.net/pool/show/"+getTerm(url)).get(); + if (url.getPath().startsWith("/pool/show/")) + return Http.url("https://e621.net/pool/show/" + getTerm(url)).get(); else - return Http.url("https://e621.net/post/index/1/"+getTerm(url)).get(); + return Http.url("https://e621.net/post/index/1/" + getTerm(url)).get(); } - + @Override public List getURLsFromPage(Document page) { - Elements elements=page.select("#post-list .thumb a,#pool-show .thumb a"); - List res=new ArrayList(elements.size()); + Elements elements = page.select("#post-list .thumb a,#pool-show .thumb a"); + List res = new ArrayList(elements.size()); - for(Element e:elements){ - res.add(e.absUrl("href")+"#"+e.child(0).attr("id").substring(1)); + if (page.getElementById("pool-show") != null) { + int index = 0; + + Element e = page.getElementById("paginator"); + if (e != null && (e = e.getElementsByClass("current").first()) != null) + index = (Integer.parseInt(e.text()) - 1) * POOL_IMAGES_PER_PAGE; + + for (Element e_ : elements) + res.add(e_.absUrl("href") + "#" + ++index); + + } else { + for (Element e : elements) + res.add(e.absUrl("href") + "#" + e.child(0).attr("id").substring(1)); } return res; } - + @Override public Document getNextPage(Document page) throws IOException { - for(Element e:page.select("#paginator a")){ - if(e.attr("rel").equals("next")) + for (Element e : page.select("#paginator a")) { + if (e.attr("rel").equals("next")) return Http.url(e.absUrl("href")).get(); } return null; } - + @Override public void downloadURL(final URL url, int index) { e621ThreadPool.addThread(new Thread(new Runnable() { public void run() { try { - Document page=Http.url(url).get(); + Document page = Http.url(url).get(); + Element e = page.getElementById("image"); + + if (e != null) + addURLToDownload(new URL(e.absUrl("src")), Utils.getConfigBoolean("download.save_order", true) ? url.getRef() + "-" : ""); + else if ((e = page.select(".content object>param[name=\"movie\"]").first()) != null) + addURLToDownload(new URL(e.absUrl("value")), Utils.getConfigBoolean("download.save_order", true) ? url.getRef() + "-" : ""); + else + Logger.getLogger(E621Ripper.class.getName()).log(Level.WARNING, "Unsupported media type - please report to program author: " + url.toString()); - addURLToDownload(new URL(page.getElementById("image").absUrl("src")),Utils.getConfigBoolean("download.save_order",true)?url.getRef()+"-":""); } catch (IOException ex) { Logger.getLogger(E621Ripper.class.getName()).log(Level.SEVERE, null, ex); } @@ -95,48 +113,55 @@ public class E621Ripper extends AbstractHTMLRipper{ })); } - private String getTerm(URL url) throws MalformedURLException{ - if(gidPattern==null) - gidPattern=Pattern.compile("^https?://(www\\.)?e621\\.net/post/index/[^/]+/([a-zA-Z0-9$_.+!*'(),%-]+)(/.*)?(#.*)?$"); - if(gidPatternPool==null) - gidPatternPool=Pattern.compile("^https?://(www\\.)?e621\\.net/pool/show/([a-zA-Z0-9$_.+!*'(),%-]+)(\\?.*)?(/.*)?(#.*)?$"); - - Matcher m = gidPattern.matcher(url.toExternalForm()); - if(m.matches()) - return m.group(2); + private String getTerm(URL url) throws MalformedURLException { + String query = url.getQuery(); - m = gidPatternPool.matcher(url.toExternalForm()); - if(m.matches()) - return m.group(2); + if (query != null) + return Utils.parseUrlQuery(query, "tags"); - throw new MalformedURLException("Expected e621.net URL format: e621.net/post/index/1/searchterm - got "+url+" instead"); - } - - @Override - public String getGID(URL url) throws MalformedURLException { - try { - String prefix=""; - if(url.getPath().startsWith("/pool/show/")) - prefix="pool_"; - - return Utils.filesystemSafe(prefix+new URI(getTerm(url)).getPath()); - } catch (URISyntaxException ex) { - Logger.getLogger(PahealRipper.class.getName()).log(Level.SEVERE, null, ex); + if (query == null) { + if ((query = url.getPath()).startsWith("/post/index/")) { + query = query.substring(12); + + int pos = query.indexOf('/'); + if (pos == -1) + return null; + + // skip page number + query = query.substring(pos + 1); + + if (query.endsWith("/")) + query = query.substring(0, query.length() - 1); + + try { + return URLDecoder.decode(query, "UTF-8"); + } catch (UnsupportedEncodingException e) { + // Shouldn't happen since UTF-8 is required to be supported + throw new RuntimeException(e); + } + + } else if (query.startsWith("/pool/show/")) { + query = query.substring(11); + + if (query.endsWith("/")) + query = query.substring(0, query.length() - 1); + + return query; + } } - throw new MalformedURLException("Expected e621.net URL format: e621.net/post/index/1/searchterm - got "+url+" instead"); + return null; } - + @Override - public URL sanitizeURL(URL url) throws MalformedURLException { - if(gidPattern2==null) - gidPattern2=Pattern.compile("^https?://(www\\.)?e621\\.net/post/search\\?tags=([a-zA-Z0-9$_.+!*'(),%-]+)(/.*)?(#.*)?$"); + public String getGID(URL url) throws MalformedURLException { + String prefix = ""; + if (url.getPath().startsWith("/pool/show/")) + prefix = "pool_"; + else + prefix = "term_"; - Matcher m = gidPattern2.matcher(url.toExternalForm()); - if(m.matches()) - return new URL("https://e621.net/post/index/1/"+m.group(2).replace("+","%20")); - - return url; + return Utils.filesystemSafe(prefix + getTerm(url)); } } \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/utils/Utils.java b/src/main/java/com/rarchives/ripme/utils/Utils.java index 946fce54..b4e9a311 100644 --- a/src/main/java/com/rarchives/ripme/utils/Utils.java +++ b/src/main/java/com/rarchives/ripme/utils/Utils.java @@ -3,13 +3,16 @@ package com.rarchives.ripme.utils; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.lang.reflect.Constructor; import java.net.URISyntaxException; import java.net.URL; import java.net.URLDecoder; import java.util.ArrayList; import java.util.Enumeration; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.jar.JarEntry; import java.util.jar.JarFile; @@ -387,4 +390,70 @@ public class Utils { } return result; } + + /** + * Parses an URL query + * + * @param query + * The query part of an URL + * @return The map of all query parameters + */ + public static Map parseUrlQuery(String query) { + Map res = new HashMap(); + + if (query.equals("")) + return res; + + String[] parts = query.split("&"); + int pos; + + try { + for (String part : parts) { + if ((pos = part.indexOf('=')) >= 0) + res.put(URLDecoder.decode(part.substring(0, pos), "UTF-8"), + URLDecoder.decode(part.substring(pos + 1), "UTF-8")); + else + res.put(URLDecoder.decode(part, "UTF-8"), ""); + } + } catch (UnsupportedEncodingException e) { + // Shouldn't happen since UTF-8 is required to be supported + throw new RuntimeException(e); + } + + return res; + } + + /** + * Parses an URL query and returns the requested parameter's value + * + * @param query + * The query part of an URL + * @param key + * The key whose value is requested + * @return The associated value or null if key wasn't found + */ + public static String parseUrlQuery(String query, String key) { + if (query.equals("")) + return null; + + String[] parts = query.split("&"); + int pos; + + try { + for (String part : parts) { + if ((pos = part.indexOf('=')) >= 0) { + if (URLDecoder.decode(part.substring(0, pos), "UTF-8").equals(key)) + return URLDecoder.decode(part.substring(pos + 1), "UTF-8"); + + } else if (URLDecoder.decode(part, "UTF-8").equals(key)) { + return ""; + } + } + } catch (UnsupportedEncodingException e) { + // Shouldn't happen since UTF-8 is required to be supported + throw new RuntimeException(e); + } + + return null; + } } From ab912542268b56ff5bc70e7ab02b2ba152b64a99 Mon Sep 17 00:00:00 2001 From: kas-luthor Date: Wed, 22 Feb 2017 10:31:55 +0100 Subject: [PATCH 2/2] Code style changes --- .../ripme/ripper/rippers/E621Ripper.java | 112 ++++++++------- .../java/com/rarchives/ripme/utils/Utils.java | 133 +++++++++--------- 2 files changed, 131 insertions(+), 114 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java index 6f6731d9..e45d3980 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java @@ -26,71 +26,79 @@ import org.jsoup.select.Elements; * * @author */ -public class E621Ripper extends AbstractHTMLRipper{ -public static final int POOL_IMAGES_PER_PAGE = 24; - +public class E621Ripper extends AbstractHTMLRipper { + public static final int POOL_IMAGES_PER_PAGE = 24; + private DownloadThreadPool e621ThreadPool = new DownloadThreadPool("e621"); - + public E621Ripper(URL url) throws IOException { super(url); } - + @Override public DownloadThreadPool getThreadPool() { return e621ThreadPool; } - + @Override public String getDomain() { return "e621.net"; } - + @Override public String getHost() { return "e621"; } - + @Override public Document getFirstPage() throws IOException { - if (url.getPath().startsWith("/pool/show/")) + if (url.getPath().startsWith("/pool/show/")) { return Http.url("https://e621.net/pool/show/" + getTerm(url)).get(); - else + } else { return Http.url("https://e621.net/post/index/1/" + getTerm(url)).get(); + } } - + @Override public List getURLsFromPage(Document page) { Elements elements = page.select("#post-list .thumb a,#pool-show .thumb a"); List res = new ArrayList(elements.size()); - + if (page.getElementById("pool-show") != null) { int index = 0; - + Element e = page.getElementById("paginator"); - if (e != null && (e = e.getElementsByClass("current").first()) != null) - index = (Integer.parseInt(e.text()) - 1) * POOL_IMAGES_PER_PAGE; - - for (Element e_ : elements) + if (e != null) { + e = e.getElementsByClass("current").first(); + if (e != null) { + index = (Integer.parseInt(e.text()) - 1) * POOL_IMAGES_PER_PAGE; + } + } + + for (Element e_ : elements) { res.add(e_.absUrl("href") + "#" + ++index); - + } + } else { - for (Element e : elements) + for (Element e : elements) { res.add(e.absUrl("href") + "#" + e.child(0).attr("id").substring(1)); + } } - + return res; } - + @Override public Document getNextPage(Document page) throws IOException { for (Element e : page.select("#paginator a")) { - if (e.attr("rel").equals("next")) + if (e.attr("rel").equals("next")) { return Http.url(e.absUrl("href")).get(); + } } - + return null; } - + @Override public void downloadURL(final URL url, int index) { e621ThreadPool.addThread(new Thread(new Runnable() { @@ -98,70 +106,76 @@ public static final int POOL_IMAGES_PER_PAGE = 24; try { Document page = Http.url(url).get(); Element e = page.getElementById("image"); - - if (e != null) + + if (e != null) { addURLToDownload(new URL(e.absUrl("src")), Utils.getConfigBoolean("download.save_order", true) ? url.getRef() + "-" : ""); - else if ((e = page.select(".content object>param[name=\"movie\"]").first()) != null) + } else if ((e = page.select(".content object>param[name=\"movie\"]").first()) != null) { addURLToDownload(new URL(e.absUrl("value")), Utils.getConfigBoolean("download.save_order", true) ? url.getRef() + "-" : ""); - else + } else { Logger.getLogger(E621Ripper.class.getName()).log(Level.WARNING, "Unsupported media type - please report to program author: " + url.toString()); - + } + } catch (IOException ex) { Logger.getLogger(E621Ripper.class.getName()).log(Level.SEVERE, null, ex); } } })); } - + private String getTerm(URL url) throws MalformedURLException { String query = url.getQuery(); - - if (query != null) + + if (query != null) { return Utils.parseUrlQuery(query, "tags"); - + } + if (query == null) { if ((query = url.getPath()).startsWith("/post/index/")) { query = query.substring(12); - + int pos = query.indexOf('/'); - if (pos == -1) + if (pos == -1) { return null; - + } + // skip page number query = query.substring(pos + 1); - - if (query.endsWith("/")) + + if (query.endsWith("/")) { query = query.substring(0, query.length() - 1); - + } + try { return URLDecoder.decode(query, "UTF-8"); } catch (UnsupportedEncodingException e) { // Shouldn't happen since UTF-8 is required to be supported throw new RuntimeException(e); } - + } else if (query.startsWith("/pool/show/")) { query = query.substring(11); - - if (query.endsWith("/")) + + if (query.endsWith("/")) { query = query.substring(0, query.length() - 1); - + } + return query; } } - + return null; } - + @Override public String getGID(URL url) throws MalformedURLException { String prefix = ""; - if (url.getPath().startsWith("/pool/show/")) + if (url.getPath().startsWith("/pool/show/")) { prefix = "pool_"; - else + } else { prefix = "term_"; - + } + return Utils.filesystemSafe(prefix + getTerm(url)); } - + } \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/utils/Utils.java b/src/main/java/com/rarchives/ripme/utils/Utils.java index b4e9a311..f3957360 100644 --- a/src/main/java/com/rarchives/ripme/utils/Utils.java +++ b/src/main/java/com/rarchives/ripme/utils/Utils.java @@ -390,70 +390,73 @@ public class Utils { } return result; } - + /** - * Parses an URL query - * - * @param query - * The query part of an URL - * @return The map of all query parameters - */ - public static Map parseUrlQuery(String query) { - Map res = new HashMap(); - - if (query.equals("")) - return res; - - String[] parts = query.split("&"); - int pos; - - try { - for (String part : parts) { - if ((pos = part.indexOf('=')) >= 0) - res.put(URLDecoder.decode(part.substring(0, pos), "UTF-8"), - URLDecoder.decode(part.substring(pos + 1), "UTF-8")); - else - res.put(URLDecoder.decode(part, "UTF-8"), ""); - } - } catch (UnsupportedEncodingException e) { - // Shouldn't happen since UTF-8 is required to be supported - throw new RuntimeException(e); - } - - return res; - } - - /** - * Parses an URL query and returns the requested parameter's value - * - * @param query - * The query part of an URL - * @param key - * The key whose value is requested - * @return The associated value or null if key wasn't found - */ - public static String parseUrlQuery(String query, String key) { - if (query.equals("")) - return null; - - String[] parts = query.split("&"); - int pos; - - try { - for (String part : parts) { - if ((pos = part.indexOf('=')) >= 0) { - if (URLDecoder.decode(part.substring(0, pos), "UTF-8").equals(key)) - return URLDecoder.decode(part.substring(pos + 1), "UTF-8"); - - } else if (URLDecoder.decode(part, "UTF-8").equals(key)) { - return ""; - } - } - } catch (UnsupportedEncodingException e) { - // Shouldn't happen since UTF-8 is required to be supported - throw new RuntimeException(e); - } - - return null; - } + * Parses an URL query + * + * @param query + * The query part of an URL + * @return The map of all query parameters + */ + public static Map parseUrlQuery(String query) { + Map res = new HashMap(); + + if (query.equals("")){ + return res; + } + + String[] parts = query.split("&"); + int pos; + + try { + for (String part : parts) { + if ((pos = part.indexOf('=')) >= 0){ + res.put(URLDecoder.decode(part.substring(0, pos), "UTF-8"), URLDecoder.decode(part.substring(pos + 1), "UTF-8")); + }else{ + res.put(URLDecoder.decode(part, "UTF-8"), ""); + } + } + } catch (UnsupportedEncodingException e) { + // Shouldn't happen since UTF-8 is required to be supported + throw new RuntimeException(e); + } + + return res; + } + + /** + * Parses an URL query and returns the requested parameter's value + * + * @param query + * The query part of an URL + * @param key + * The key whose value is requested + * @return The associated value or null if key wasn't found + */ + public static String parseUrlQuery(String query, String key) { + if (query.equals("")){ + return null; + } + + String[] parts = query.split("&"); + int pos; + + try { + for (String part : parts) { + if ((pos = part.indexOf('=')) >= 0) { + if (URLDecoder.decode(part.substring(0, pos), "UTF-8").equals(key)){ + return URLDecoder.decode(part.substring(pos + 1), "UTF-8"); + } + + } else if (URLDecoder.decode(part, "UTF-8").equals(key)) { + return ""; + } + } + } catch (UnsupportedEncodingException e) { + // Shouldn't happen since UTF-8 is required to be supported + throw new RuntimeException(e); + } + + return null; + } }