From 2f4793e9e39145f070608328016bf8c10b7a8b35 Mon Sep 17 00:00:00 2001 From: kas-luthor Date: Sat, 14 Jan 2017 22:45:23 +0100 Subject: [PATCH] Added utility functions for parsing URL queries Rewrote E621Ripper to not use regexes anymore (therefore interacting better with special chars in URLs) --- .../ripme/ripper/rippers/E621Ripper.java | 139 +++++++++++------- .../java/com/rarchives/ripme/utils/Utils.java | 69 +++++++++ 2 files changed, 151 insertions(+), 57 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java index 190320f9..6f6731d9 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java @@ -6,10 +6,12 @@ import com.rarchives.ripme.ripper.DownloadThreadPool; import com.rarchives.ripme.utils.Http; import com.rarchives.ripme.utils.Utils; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; +import java.net.URLDecoder; import java.util.ArrayList; import java.util.List; import java.util.logging.Level; @@ -25,69 +27,85 @@ import org.jsoup.select.Elements; * @author */ public class E621Ripper extends AbstractHTMLRipper{ - private static Pattern gidPattern=null; - private static Pattern gidPattern2=null; - private static Pattern gidPatternPool=null; +public static final int POOL_IMAGES_PER_PAGE = 24; - private DownloadThreadPool e621ThreadPool=new DownloadThreadPool("e621"); + private DownloadThreadPool e621ThreadPool = new DownloadThreadPool("e621"); public E621Ripper(URL url) throws IOException { super(url); } - + @Override public DownloadThreadPool getThreadPool() { return e621ThreadPool; } - + @Override public String getDomain() { return "e621.net"; } - + @Override public String getHost() { return "e621"; } - + @Override public Document getFirstPage() throws IOException { - if(url.getPath().startsWith("/pool/show/")) - return Http.url("https://e621.net/pool/show/"+getTerm(url)).get(); + if (url.getPath().startsWith("/pool/show/")) + return Http.url("https://e621.net/pool/show/" + getTerm(url)).get(); else - return Http.url("https://e621.net/post/index/1/"+getTerm(url)).get(); + return Http.url("https://e621.net/post/index/1/" + getTerm(url)).get(); } - + @Override public List getURLsFromPage(Document page) { - Elements elements=page.select("#post-list .thumb a,#pool-show .thumb a"); - List res=new ArrayList(elements.size()); + Elements elements = page.select("#post-list .thumb a,#pool-show .thumb a"); + List res = new ArrayList(elements.size()); - for(Element e:elements){ - res.add(e.absUrl("href")+"#"+e.child(0).attr("id").substring(1)); + if (page.getElementById("pool-show") != null) { + int index = 0; + + Element e = page.getElementById("paginator"); + if (e != null && (e = e.getElementsByClass("current").first()) != null) + index = (Integer.parseInt(e.text()) - 1) * POOL_IMAGES_PER_PAGE; + + for (Element e_ : elements) + res.add(e_.absUrl("href") + "#" + ++index); + + } else { + for (Element e : elements) + res.add(e.absUrl("href") + "#" + e.child(0).attr("id").substring(1)); } return res; } - + @Override public Document getNextPage(Document page) throws IOException { - for(Element e:page.select("#paginator a")){ - if(e.attr("rel").equals("next")) + for (Element e : page.select("#paginator a")) { + if (e.attr("rel").equals("next")) return Http.url(e.absUrl("href")).get(); } return null; } - + @Override public void downloadURL(final URL url, int index) { e621ThreadPool.addThread(new Thread(new Runnable() { public void run() { try { - Document page=Http.url(url).get(); + Document page = Http.url(url).get(); + Element e = page.getElementById("image"); + + if (e != null) + addURLToDownload(new URL(e.absUrl("src")), Utils.getConfigBoolean("download.save_order", true) ? url.getRef() + "-" : ""); + else if ((e = page.select(".content object>param[name=\"movie\"]").first()) != null) + addURLToDownload(new URL(e.absUrl("value")), Utils.getConfigBoolean("download.save_order", true) ? url.getRef() + "-" : ""); + else + Logger.getLogger(E621Ripper.class.getName()).log(Level.WARNING, "Unsupported media type - please report to program author: " + url.toString()); - addURLToDownload(new URL(page.getElementById("image").absUrl("src")),Utils.getConfigBoolean("download.save_order",true)?url.getRef()+"-":""); } catch (IOException ex) { Logger.getLogger(E621Ripper.class.getName()).log(Level.SEVERE, null, ex); } @@ -95,48 +113,55 @@ public class E621Ripper extends AbstractHTMLRipper{ })); } - private String getTerm(URL url) throws MalformedURLException{ - if(gidPattern==null) - gidPattern=Pattern.compile("^https?://(www\\.)?e621\\.net/post/index/[^/]+/([a-zA-Z0-9$_.+!*'(),%-]+)(/.*)?(#.*)?$"); - if(gidPatternPool==null) - gidPatternPool=Pattern.compile("^https?://(www\\.)?e621\\.net/pool/show/([a-zA-Z0-9$_.+!*'(),%-]+)(\\?.*)?(/.*)?(#.*)?$"); - - Matcher m = gidPattern.matcher(url.toExternalForm()); - if(m.matches()) - return m.group(2); + private String getTerm(URL url) throws MalformedURLException { + String query = url.getQuery(); - m = gidPatternPool.matcher(url.toExternalForm()); - if(m.matches()) - return m.group(2); + if (query != null) + return Utils.parseUrlQuery(query, "tags"); - throw new MalformedURLException("Expected e621.net URL format: e621.net/post/index/1/searchterm - got "+url+" instead"); - } - - @Override - public String getGID(URL url) throws MalformedURLException { - try { - String prefix=""; - if(url.getPath().startsWith("/pool/show/")) - prefix="pool_"; - - return Utils.filesystemSafe(prefix+new URI(getTerm(url)).getPath()); - } catch (URISyntaxException ex) { - Logger.getLogger(PahealRipper.class.getName()).log(Level.SEVERE, null, ex); + if (query == null) { + if ((query = url.getPath()).startsWith("/post/index/")) { + query = query.substring(12); + + int pos = query.indexOf('/'); + if (pos == -1) + return null; + + // skip page number + query = query.substring(pos + 1); + + if (query.endsWith("/")) + query = query.substring(0, query.length() - 1); + + try { + return URLDecoder.decode(query, "UTF-8"); + } catch (UnsupportedEncodingException e) { + // Shouldn't happen since UTF-8 is required to be supported + throw new RuntimeException(e); + } + + } else if (query.startsWith("/pool/show/")) { + query = query.substring(11); + + if (query.endsWith("/")) + query = query.substring(0, query.length() - 1); + + return query; + } } - throw new MalformedURLException("Expected e621.net URL format: e621.net/post/index/1/searchterm - got "+url+" instead"); + return null; } - + @Override - public URL sanitizeURL(URL url) throws MalformedURLException { - if(gidPattern2==null) - gidPattern2=Pattern.compile("^https?://(www\\.)?e621\\.net/post/search\\?tags=([a-zA-Z0-9$_.+!*'(),%-]+)(/.*)?(#.*)?$"); + public String getGID(URL url) throws MalformedURLException { + String prefix = ""; + if (url.getPath().startsWith("/pool/show/")) + prefix = "pool_"; + else + prefix = "term_"; - Matcher m = gidPattern2.matcher(url.toExternalForm()); - if(m.matches()) - return new URL("https://e621.net/post/index/1/"+m.group(2).replace("+","%20")); - - return url; + return Utils.filesystemSafe(prefix + getTerm(url)); } } \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/utils/Utils.java b/src/main/java/com/rarchives/ripme/utils/Utils.java index 946fce54..b4e9a311 100644 --- a/src/main/java/com/rarchives/ripme/utils/Utils.java +++ b/src/main/java/com/rarchives/ripme/utils/Utils.java @@ -3,13 +3,16 @@ package com.rarchives.ripme.utils; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.lang.reflect.Constructor; import java.net.URISyntaxException; import java.net.URL; import java.net.URLDecoder; import java.util.ArrayList; import java.util.Enumeration; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.jar.JarEntry; import java.util.jar.JarFile; @@ -387,4 +390,70 @@ public class Utils { } return result; } + + /** + * Parses an URL query + * + * @param query + * The query part of an URL + * @return The map of all query parameters + */ + public static Map parseUrlQuery(String query) { + Map res = new HashMap(); + + if (query.equals("")) + return res; + + String[] parts = query.split("&"); + int pos; + + try { + for (String part : parts) { + if ((pos = part.indexOf('=')) >= 0) + res.put(URLDecoder.decode(part.substring(0, pos), "UTF-8"), + URLDecoder.decode(part.substring(pos + 1), "UTF-8")); + else + res.put(URLDecoder.decode(part, "UTF-8"), ""); + } + } catch (UnsupportedEncodingException e) { + // Shouldn't happen since UTF-8 is required to be supported + throw new RuntimeException(e); + } + + return res; + } + + /** + * Parses an URL query and returns the requested parameter's value + * + * @param query + * The query part of an URL + * @param key + * The key whose value is requested + * @return The associated value or null if key wasn't found + */ + public static String parseUrlQuery(String query, String key) { + if (query.equals("")) + return null; + + String[] parts = query.split("&"); + int pos; + + try { + for (String part : parts) { + if ((pos = part.indexOf('=')) >= 0) { + if (URLDecoder.decode(part.substring(0, pos), "UTF-8").equals(key)) + return URLDecoder.decode(part.substring(pos + 1), "UTF-8"); + + } else if (URLDecoder.decode(part, "UTF-8").equals(key)) { + return ""; + } + } + } catch (UnsupportedEncodingException e) { + // Shouldn't happen since UTF-8 is required to be supported + throw new RuntimeException(e); + } + + return null; + } }