From 6290aa1f35e76232df8d23ec9fd15266d1307248 Mon Sep 17 00:00:00 2001 From: kas-luthor Date: Thu, 26 Nov 2015 19:18:03 +0100 Subject: [PATCH 1/3] added ripper for rule34.paheal.net also implemented a file name limit of 128+4 characters since i got errors for too long filenames thanks to extensive tagging --- .../ripme/ripper/rippers/PahealRipper.java | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java new file mode 100644 index 00000000..16591eb1 --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java @@ -0,0 +1,104 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package com.rarchives.ripme.ripper.rippers; + +import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.utils.Http; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +/** + * + * @author + */ +public class PahealRipper extends AbstractHTMLRipper{ + private static Map cookies=null; + private static Pattern gidPattern=null; + + public static Map getCookies() { + if(cookies==null){ + cookies=new HashMap(1); + cookies.put("ui-tnc-agreed","true"); + } + return cookies; + } + + public PahealRipper(URL url) throws IOException { + super(url); + } + + @Override + public String getDomain() { + return "rule34.paheal.net"; + } + + @Override + public String getHost() { + return "paheal"; + } + + @Override + public Document getFirstPage() throws IOException { + return Http.url("http://rule34.paheal.net/post/list/"+getGID(url)+"/1").cookies(getCookies()).get(); + } + + @Override + public Document getNextPage(Document page) throws IOException { + for(Element e:page.select("#paginator a")){ + if(e.text().toLowerCase().equals("next")) + return Http.url(e.absUrl("href")).cookies(getCookies()).get(); + } + + return null; + } + + @Override + public List getURLsFromPage(Document page) { + Elements elements=page.select(".shm-thumb.thumb>a").not(".shm-thumb-link"); + List res=new ArrayList(elements.size()); + + for(Element e:elements) + res.add(e.attr("href")); + + return res; + } + + @Override + public void downloadURL(URL url, int index) { + String file=url.getFile(); + try { + addURLToDownload(new URL(url.getProtocol(),url.getHost(),url.getPort(),file.substring(0, Math.min(128,file.lastIndexOf('.')))+file.substring(file.lastIndexOf('.')))); + } catch (MalformedURLException ex) { + Logger.getLogger(PahealRipper.class.getName()).log(Level.SEVERE, null, ex); + } + } + + @Override + public String getGID(URL url) throws MalformedURLException { + if(gidPattern==null) + gidPattern=Pattern.compile("^https?://(www\\.)?rule34\\.paheal\\.net/post/list/([a-zA-Z0-9$_.+!*'(),-]+)(/.*)?(#.*)?$"); + + Matcher m = gidPattern.matcher(url.toExternalForm()); + if(m.matches()) + return m.group(2); + + throw new MalformedURLException("Expected paheal.net URL format: rule34.paheal.net/post/list/searchterm - got "+url+" instead"); + } + +} From bbc78ef67f24a222dcdd6adef97b61e4feec1f00 Mon Sep 17 00:00:00 2001 From: kas-luthor Date: Thu, 26 Nov 2015 22:18:24 +0100 Subject: [PATCH 2/3] added ripper for xbooru.com removed license header for paheal ripper added percent sign to url matching for paheal ripper --- .../ripme/ripper/rippers/PahealRipper.java | 8 +- .../ripme/ripper/rippers/XbooruRipper.java | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+), 7 deletions(-) create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/XbooruRipper.java diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java index 16591eb1..520914a8 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java @@ -1,9 +1,3 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ - package com.rarchives.ripme.ripper.rippers; import com.rarchives.ripme.ripper.AbstractHTMLRipper; @@ -92,7 +86,7 @@ public class PahealRipper extends AbstractHTMLRipper{ @Override public String getGID(URL url) throws MalformedURLException { if(gidPattern==null) - gidPattern=Pattern.compile("^https?://(www\\.)?rule34\\.paheal\\.net/post/list/([a-zA-Z0-9$_.+!*'(),-]+)(/.*)?(#.*)?$"); + gidPattern=Pattern.compile("^https?://(www\\.)?rule34\\.paheal\\.net/post/list/([a-zA-Z0-9$_.+!*'(),%-]+)(/.*)?(#.*)?$"); Matcher m = gidPattern.matcher(url.toExternalForm()); if(m.matches()) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/XbooruRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/XbooruRipper.java new file mode 100644 index 00000000..dcdf54f5 --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/XbooruRipper.java @@ -0,0 +1,83 @@ + +package com.rarchives.ripme.ripper.rippers; + +import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.utils.Http; +import com.rarchives.ripme.utils.Utils; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +/** + * + * @author + */ +public class XbooruRipper extends AbstractHTMLRipper{ + private Pattern gidPattern=null; + + public XbooruRipper(URL url) throws IOException { + super(url); + } + + @Override + public String getDomain() { + return "xbooru.com"; + } + + @Override + public String getHost() { + return "xbooru"; + } + + private String getPage(int num) throws MalformedURLException{ + return "http://xbooru.com/index.php?page=dapi&s=post&q=index&pid="+num+"&tags="+getGID(url); + } + + @Override + public Document getFirstPage() throws IOException { + return Http.url(getPage(0)).get(); + } + + @Override + public Document getNextPage(Document doc) throws IOException { + int offset=Integer.parseInt(doc.getElementsByTag("posts").first().attr("offset")); + int num=Integer.parseInt(doc.getElementsByTag("posts").first().attr("count")); + + if(offset+100>num) + return null; + + return Http.url(getPage(offset/100+1)).get(); + } + + @Override + public List getURLsFromPage(Document page) { + List res=new ArrayList(100); + for(Element e:page.getElementsByTag("post")) + res.add(e.attr("file_url")+"#"+e.attr("id")); + return res; + } + + @Override + public void downloadURL(URL url, int index) { + addURLToDownload(url,Utils.getConfigBoolean("download.save_order",true)?url.getRef()+"-":""); + } + + @Override + public String getGID(URL url) throws MalformedURLException { + if(gidPattern==null) + gidPattern=Pattern.compile("^https?://(www\\.)?xbooru\\.com/(index.php)?.*([?&]tags=([a-zA-Z0-9$_.+!*'(),%-]+))(\\&|(#.*)?$)"); + + Matcher m = gidPattern.matcher(url.toExternalForm()); + if(m.matches()) + return m.group(4); + + throw new MalformedURLException("Expected xbooru.com URL format: xbooru.com - got "+url+" instead"); + } + +} From 5951d51c7ed04a9e48ac82d2a65d4f2ce6f5b039 Mon Sep 17 00:00:00 2001 From: kas-luthor Date: Fri, 27 Nov 2015 00:45:14 +0100 Subject: [PATCH 3/3] added ripper for e621.net refactored paheal.net and xbooru.com rippers: - used absUrl() instead of plain attr() - album names (and file names for paheal) now passed through filesystemSafe() --- .../ripme/ripper/rippers/E621Ripper.java | 142 ++++++++++++++++++ .../ripme/ripper/rippers/PahealRipper.java | 44 ++++-- .../ripme/ripper/rippers/XbooruRipper.java | 30 +++- 3 files changed, 198 insertions(+), 18 deletions(-) create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java new file mode 100644 index 00000000..190320f9 --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java @@ -0,0 +1,142 @@ + +package com.rarchives.ripme.ripper.rippers; + +import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.ripper.DownloadThreadPool; +import com.rarchives.ripme.utils.Http; +import com.rarchives.ripme.utils.Utils; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +/** + * + * @author + */ +public class E621Ripper extends AbstractHTMLRipper{ + private static Pattern gidPattern=null; + private static Pattern gidPattern2=null; + private static Pattern gidPatternPool=null; + + private DownloadThreadPool e621ThreadPool=new DownloadThreadPool("e621"); + + public E621Ripper(URL url) throws IOException { + super(url); + } + + @Override + public DownloadThreadPool getThreadPool() { + return e621ThreadPool; + } + + @Override + public String getDomain() { + return "e621.net"; + } + + @Override + public String getHost() { + return "e621"; + } + + @Override + public Document getFirstPage() throws IOException { + if(url.getPath().startsWith("/pool/show/")) + return Http.url("https://e621.net/pool/show/"+getTerm(url)).get(); + else + return Http.url("https://e621.net/post/index/1/"+getTerm(url)).get(); + } + + @Override + public List getURLsFromPage(Document page) { + Elements elements=page.select("#post-list .thumb a,#pool-show .thumb a"); + List res=new ArrayList(elements.size()); + + for(Element e:elements){ + res.add(e.absUrl("href")+"#"+e.child(0).attr("id").substring(1)); + } + + return res; + } + + @Override + public Document getNextPage(Document page) throws IOException { + for(Element e:page.select("#paginator a")){ + if(e.attr("rel").equals("next")) + return Http.url(e.absUrl("href")).get(); + } + + return null; + } + + @Override + public void downloadURL(final URL url, int index) { + e621ThreadPool.addThread(new Thread(new Runnable() { + public void run() { + try { + Document page=Http.url(url).get(); + + addURLToDownload(new URL(page.getElementById("image").absUrl("src")),Utils.getConfigBoolean("download.save_order",true)?url.getRef()+"-":""); + } catch (IOException ex) { + Logger.getLogger(E621Ripper.class.getName()).log(Level.SEVERE, null, ex); + } + } + })); + } + + private String getTerm(URL url) throws MalformedURLException{ + if(gidPattern==null) + gidPattern=Pattern.compile("^https?://(www\\.)?e621\\.net/post/index/[^/]+/([a-zA-Z0-9$_.+!*'(),%-]+)(/.*)?(#.*)?$"); + if(gidPatternPool==null) + gidPatternPool=Pattern.compile("^https?://(www\\.)?e621\\.net/pool/show/([a-zA-Z0-9$_.+!*'(),%-]+)(\\?.*)?(/.*)?(#.*)?$"); + + Matcher m = gidPattern.matcher(url.toExternalForm()); + if(m.matches()) + return m.group(2); + + m = gidPatternPool.matcher(url.toExternalForm()); + if(m.matches()) + return m.group(2); + + throw new MalformedURLException("Expected e621.net URL format: e621.net/post/index/1/searchterm - got "+url+" instead"); + } + + @Override + public String getGID(URL url) throws MalformedURLException { + try { + String prefix=""; + if(url.getPath().startsWith("/pool/show/")) + prefix="pool_"; + + return Utils.filesystemSafe(prefix+new URI(getTerm(url)).getPath()); + } catch (URISyntaxException ex) { + Logger.getLogger(PahealRipper.class.getName()).log(Level.SEVERE, null, ex); + } + + throw new MalformedURLException("Expected e621.net URL format: e621.net/post/index/1/searchterm - got "+url+" instead"); + } + + @Override + public URL sanitizeURL(URL url) throws MalformedURLException { + if(gidPattern2==null) + gidPattern2=Pattern.compile("^https?://(www\\.)?e621\\.net/post/search\\?tags=([a-zA-Z0-9$_.+!*'(),%-]+)(/.*)?(#.*)?$"); + + Matcher m = gidPattern2.matcher(url.toExternalForm()); + if(m.matches()) + return new URL("https://e621.net/post/index/1/"+m.group(2).replace("+","%20")); + + return url; + } + +} \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java index 520914a8..4db47885 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/PahealRipper.java @@ -2,8 +2,12 @@ package com.rarchives.ripme.ripper.rippers; import com.rarchives.ripme.ripper.AbstractHTMLRipper; import com.rarchives.ripme.utils.Http; +import com.rarchives.ripme.utils.Utils; +import java.io.File; import java.io.IOException; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; @@ -25,7 +29,7 @@ public class PahealRipper extends AbstractHTMLRipper{ private static Map cookies=null; private static Pattern gidPattern=null; - public static Map getCookies() { + private static Map getCookies() { if(cookies==null){ cookies=new HashMap(1); cookies.put("ui-tnc-agreed","true"); @@ -49,7 +53,7 @@ public class PahealRipper extends AbstractHTMLRipper{ @Override public Document getFirstPage() throws IOException { - return Http.url("http://rule34.paheal.net/post/list/"+getGID(url)+"/1").cookies(getCookies()).get(); + return Http.url("http://rule34.paheal.net/post/list/"+getTerm(url)+"/1").cookies(getCookies()).get(); } @Override @@ -68,31 +72,51 @@ public class PahealRipper extends AbstractHTMLRipper{ List res=new ArrayList(elements.size()); for(Element e:elements) - res.add(e.attr("href")); + res.add(e.absUrl("href")); return res; } @Override public void downloadURL(URL url, int index) { - String file=url.getFile(); try { - addURLToDownload(new URL(url.getProtocol(),url.getHost(),url.getPort(),file.substring(0, Math.min(128,file.lastIndexOf('.')))+file.substring(file.lastIndexOf('.')))); - } catch (MalformedURLException ex) { + String name=url.getPath(); + String ext=".png"; + + name=name.substring(name.lastIndexOf('/')+1); + if(name.indexOf('.')>=0){ + ext=name.substring(name.lastIndexOf('.')); + name=name.substring(0,name.length()-ext.length()); + } + + addURLToDownload(url,new File(workingDir.getCanonicalPath()+File.separator+Utils.filesystemSafe(new URI(name).getPath())+ext)); + } catch (IOException ex) { + Logger.getLogger(PahealRipper.class.getName()).log(Level.SEVERE, null, ex); + } catch (URISyntaxException ex) { Logger.getLogger(PahealRipper.class.getName()).log(Level.SEVERE, null, ex); } } - - @Override - public String getGID(URL url) throws MalformedURLException { + + private String getTerm(URL url) throws MalformedURLException{ if(gidPattern==null) gidPattern=Pattern.compile("^https?://(www\\.)?rule34\\.paheal\\.net/post/list/([a-zA-Z0-9$_.+!*'(),%-]+)(/.*)?(#.*)?$"); - + Matcher m = gidPattern.matcher(url.toExternalForm()); if(m.matches()) return m.group(2); throw new MalformedURLException("Expected paheal.net URL format: rule34.paheal.net/post/list/searchterm - got "+url+" instead"); } + + @Override + public String getGID(URL url) throws MalformedURLException { + try { + return Utils.filesystemSafe(new URI(getTerm(url)).getPath()); + } catch (URISyntaxException ex) { + Logger.getLogger(PahealRipper.class.getName()).log(Level.SEVERE, null, ex); + } + + throw new MalformedURLException("Expected paheal.net URL format: rule34.paheal.net/post/list/searchterm - got "+url+" instead"); + } } diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/XbooruRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/XbooruRipper.java index dcdf54f5..6d7d70fc 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/XbooruRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/XbooruRipper.java @@ -6,9 +6,13 @@ import com.rarchives.ripme.utils.Http; import com.rarchives.ripme.utils.Utils; import java.io.IOException; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.nodes.Document; @@ -19,7 +23,7 @@ import org.jsoup.nodes.Element; * @author */ public class XbooruRipper extends AbstractHTMLRipper{ - private Pattern gidPattern=null; + private static Pattern gidPattern=null; public XbooruRipper(URL url) throws IOException { super(url); @@ -36,7 +40,7 @@ public class XbooruRipper extends AbstractHTMLRipper{ } private String getPage(int num) throws MalformedURLException{ - return "http://xbooru.com/index.php?page=dapi&s=post&q=index&pid="+num+"&tags="+getGID(url); + return "http://xbooru.com/index.php?page=dapi&s=post&q=index&pid="+num+"&tags="+getTerm(url); } @Override @@ -59,7 +63,7 @@ public class XbooruRipper extends AbstractHTMLRipper{ public List getURLsFromPage(Document page) { List res=new ArrayList(100); for(Element e:page.getElementsByTag("post")) - res.add(e.attr("file_url")+"#"+e.attr("id")); + res.add(e.absUrl("file_url")+"#"+e.attr("id")); return res; } @@ -67,17 +71,27 @@ public class XbooruRipper extends AbstractHTMLRipper{ public void downloadURL(URL url, int index) { addURLToDownload(url,Utils.getConfigBoolean("download.save_order",true)?url.getRef()+"-":""); } - - @Override - public String getGID(URL url) throws MalformedURLException { + + private String getTerm(URL url) throws MalformedURLException{ if(gidPattern==null) gidPattern=Pattern.compile("^https?://(www\\.)?xbooru\\.com/(index.php)?.*([?&]tags=([a-zA-Z0-9$_.+!*'(),%-]+))(\\&|(#.*)?$)"); - + Matcher m = gidPattern.matcher(url.toExternalForm()); if(m.matches()) return m.group(4); - throw new MalformedURLException("Expected xbooru.com URL format: xbooru.com - got "+url+" instead"); + throw new MalformedURLException("Expected xbooru.com URL format: xbooru.com/index.php?tags=searchterm - got "+url+" instead"); + } + + @Override + public String getGID(URL url) throws MalformedURLException { + try { + return Utils.filesystemSafe(new URI(getTerm(url)).getPath()); + } catch (URISyntaxException ex) { + Logger.getLogger(PahealRipper.class.getName()).log(Level.SEVERE, null, ex); + } + + throw new MalformedURLException("Expected xbooru.com URL format: xbooru.com/index.php?tags=searchterm - got "+url+" instead"); } }