From 392d4b0ff56de43c39d5b730eed26410f0a936fc Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Tue, 21 Feb 2017 18:32:45 -0500 Subject: [PATCH 1/7] Added basic chevereto ripper --- .../ripme/ripper/rippers/CheveretoRipper.java | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java new file mode 100644 index 00000000..c3b0b425 --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java @@ -0,0 +1,130 @@ +package com.rarchives.ripme.ripper.rippers; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.utils.Http; + +public class CheveretoRipper extends AbstractHTMLRipper { + + public CheveretoRipper(URL url) throws IOException { + super(url); + } + + public static List explicit_domains_1 = Arrays.asList("www.ezphotoshare.com", "hushpix.com"); + @Override + public String getHost() { + String host = url.toExternalForm(); + return host; + } + + @Override + public String getDomain() { + String host = url.toExternalForm(); + return host; + } + + @Override + public boolean canRip(URL url) { + String url_name = url.toExternalForm(); + if (explicit_domains_1.contains(url_name.split("/")[2]) == true) { + return true; + } + return false; + } + + @Override + public String getGID(URL url) throws MalformedURLException { + Pattern p = Pattern.compile("(?:https?://)?(?:www\\.)?[a-z1-9]*\\.[a-z1-9]*/album/([a-zA-Z1-9]*)/?$"); + Matcher m = 
p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); + } + else if (m.matches() == false) { + Pattern pa = Pattern.compile("(?:https?://)?(?:www\\.)?[a-z1-9]*\\.[a-z1-9]*/([a-zA-Z1-9_-]*)/albums/?$"); + Matcher ma = pa.matcher(url.toExternalForm()); + if (ma.matches()) { + return ma.group(1); + } + } + throw new MalformedURLException("Expected chevereto URL format: " + + "site.domain/album/albumName or site.domain/username/albums- got " + url + " instead"); + } + + @Override + public Document getFirstPage() throws IOException { + // "url" is an instance field of the superclass + return Http.url(url).get(); + } + + @Override + public Document getNextPage(Document doc) throws IOException { + // Find next page + String nextUrl = ""; + Element elem = doc.select("li.pagination-next > a").first(); + String nextPage = elem.attr("href"); + if (nextUrl == "") { + throw new IOException("No more pages"); + } + // Sleep for half a sec to avoid getting IP banned + sleep(500); + return Http.url(nextUrl).get(); + } + + @Override + public List getURLsFromPage(Document doc) { + List result = new ArrayList(); + Document userpage_doc; + // We check for the following string to see if this is a user page or not + if (doc.toString().contains("content=\"gallery\"")) { + for (Element elem : doc.select("a.image-container")) { + String link = elem.attr("href"); + logger.info("Grabbing album " + link); + try { + userpage_doc = Http.url(link).get(); + } catch(IOException e){ + logger.warn("Failed to log link in Jsoup"); + userpage_doc = null; + e.printStackTrace(); + } + for (Element element : userpage_doc.select("a.image-container > img")) { + String imageSource = element.attr("src"); + logger.info("Found image " + link); + // We remove the .md from images so we download the full size image + // not the medium ones + imageSource = imageSource.replace(".md", ""); + result.add(imageSource); + } + } + + } + else { + for (Element el : doc.select("a.image-container > img")) 
{ + String imageSource = el.attr("src"); + // We remove the .md from images so we download the full size image + // not the medium ones + imageSource = imageSource.replace(".md", ""); + result.add(imageSource); + } + } + return result; + } + + @Override + public void downloadURL(URL url, int index) { + addURLToDownload(url, getPrefix(index)); + } + + + } From c42831fc63b1b6986fa55bc869053e92ba7b677b Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Wed, 22 Feb 2017 09:47:25 -0500 Subject: [PATCH 2/7] added chevereto ripper --- .../ripme/ripper/rippers/CheveretoRipper.java | 57 +++---------------- 1 file changed, 7 insertions(+), 50 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java index c3b0b425..269680dd 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java @@ -25,13 +25,13 @@ public class CheveretoRipper extends AbstractHTMLRipper { public static List explicit_domains_1 = Arrays.asList("www.ezphotoshare.com", "hushpix.com"); @Override public String getHost() { - String host = url.toExternalForm(); + String host = url.toExternalForm().split("/")[2]; return host; } @Override public String getDomain() { - String host = url.toExternalForm(); + String host = url.toExternalForm().split("/")[2]; return host; } @@ -39,7 +39,11 @@ public class CheveretoRipper extends AbstractHTMLRipper { public boolean canRip(URL url) { String url_name = url.toExternalForm(); if (explicit_domains_1.contains(url_name.split("/")[2]) == true) { - return true; + Pattern pa = Pattern.compile("(?:https?://)?(?:www\\.)?[a-z1-9]*\\.[a-z1-9]*/album/([a-zA-Z1-9]*)/?$"); + Matcher ma = pa.matcher(url.toExternalForm()); + if (ma.matches()) { + return true; + } } return false; } @@ -51,13 +55,6 @@ public class CheveretoRipper extends AbstractHTMLRipper { if (m.matches()) { return 
m.group(1); } - else if (m.matches() == false) { - Pattern pa = Pattern.compile("(?:https?://)?(?:www\\.)?[a-z1-9]*\\.[a-z1-9]*/([a-zA-Z1-9_-]*)/albums/?$"); - Matcher ma = pa.matcher(url.toExternalForm()); - if (ma.matches()) { - return ma.group(1); - } - } throw new MalformedURLException("Expected chevereto URL format: " + "site.domain/album/albumName or site.domain/username/albums- got " + url + " instead"); } @@ -67,49 +64,10 @@ public class CheveretoRipper extends AbstractHTMLRipper { // "url" is an instance field of the superclass return Http.url(url).get(); } - - @Override - public Document getNextPage(Document doc) throws IOException { - // Find next page - String nextUrl = ""; - Element elem = doc.select("li.pagination-next > a").first(); - String nextPage = elem.attr("href"); - if (nextUrl == "") { - throw new IOException("No more pages"); - } - // Sleep for half a sec to avoid getting IP banned - sleep(500); - return Http.url(nextUrl).get(); - } @Override public List getURLsFromPage(Document doc) { List result = new ArrayList(); - Document userpage_doc; - // We check for the following string to see if this is a user page or not - if (doc.toString().contains("content=\"gallery\"")) { - for (Element elem : doc.select("a.image-container")) { - String link = elem.attr("href"); - logger.info("Grabbing album " + link); - try { - userpage_doc = Http.url(link).get(); - } catch(IOException e){ - logger.warn("Failed to log link in Jsoup"); - userpage_doc = null; - e.printStackTrace(); - } - for (Element element : userpage_doc.select("a.image-container > img")) { - String imageSource = element.attr("src"); - logger.info("Found image " + link); - // We remove the .md from images so we download the full size image - // not the medium ones - imageSource = imageSource.replace(".md", ""); - result.add(imageSource); - } - } - - } - else { for (Element el : doc.select("a.image-container > img")) { String imageSource = el.attr("src"); // We remove the .md from images so we 
download the full size image @@ -117,7 +75,6 @@ public class CheveretoRipper extends AbstractHTMLRipper { imageSource = imageSource.replace(".md", ""); result.add(imageSource); } - } return result; } From 5595894992b188251a7e132724863532b0e24a2e Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Mon, 6 Mar 2017 22:59:13 -0500 Subject: [PATCH 3/7] removed non-working site --- .../com/rarchives/ripme/ripper/rippers/CheveretoRipper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java index 269680dd..fe1ff1b9 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java @@ -22,7 +22,7 @@ public class CheveretoRipper extends AbstractHTMLRipper { super(url); } - public static List explicit_domains_1 = Arrays.asList("www.ezphotoshare.com", "hushpix.com"); + public static List explicit_domains_1 = Arrays.asList("hushpix.com"); @Override public String getHost() { String host = url.toExternalForm().split("/")[2]; From 2c99c6140c2d1438c79713a85b32626737c689ac Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Mon, 6 Mar 2017 23:43:44 -0500 Subject: [PATCH 4/7] Now rips full albums --- .../ripme/ripper/rippers/CheveretoRipper.java | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java index fe1ff1b9..b422f0a5 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java @@ -65,6 +65,27 @@ public class CheveretoRipper extends AbstractHTMLRipper { return Http.url(url).get(); } + @Override + public Document getNextPage(Document doc) throws IOException { + // Find next page + String 
nextUrl = ""; + // We use li.pagination-next to find the next page + Element elem = doc.select("li.pagination-next > a").first(); + if (elem == null) { + throw new IOException("No more pages"); + } + String nextPage = elem.attr("href"); + // Sometimes this returns an empty string + // This check stops that + if (nextPage == "") { + logger.info("Got empty string for nextpage") + return null; + } + else { + return Http.url(nextPage).get(); + } + } + @Override public List getURLsFromPage(Document doc) { List result = new ArrayList(); From 0a81eeac4365c2bd7dbf2b53807c8968386633c5 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Mon, 6 Mar 2017 23:47:18 -0500 Subject: [PATCH 5/7] Removed logging line --- .../java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java index b422f0a5..b021989f 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java @@ -78,7 +78,6 @@ public class CheveretoRipper extends AbstractHTMLRipper { // Sometimes this returns an empty string // This check stops that if (nextPage == "") { - logger.info("Got empty string for nextpage") return null; } else { From 97de669ac2e5a69db51c3b180d8d19571fcfdf26 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Tue, 7 Mar 2017 00:04:28 -0500 Subject: [PATCH 6/7] Tweaked regex, added tag-fox.com --- .../com/rarchives/ripme/ripper/rippers/CheveretoRipper.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java index b021989f..64cd9af7 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java +++ 
b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java @@ -22,7 +22,7 @@ public class CheveretoRipper extends AbstractHTMLRipper { super(url); } - public static List explicit_domains_1 = Arrays.asList("hushpix.com"); + public static List explicit_domains_1 = Arrays.asList("hushpix.com", "tag-fox.com"); @Override public String getHost() { String host = url.toExternalForm().split("/")[2]; @@ -39,7 +39,7 @@ public class CheveretoRipper extends AbstractHTMLRipper { public boolean canRip(URL url) { String url_name = url.toExternalForm(); if (explicit_domains_1.contains(url_name.split("/")[2]) == true) { - Pattern pa = Pattern.compile("(?:https?://)?(?:www\\.)?[a-z1-9]*\\.[a-z1-9]*/album/([a-zA-Z1-9]*)/?$"); + Pattern pa = Pattern.compile("(?:https?://)?(?:www\\.)?[a-z1-9-]*\\.[a-z1-9]*/album/([a-zA-Z1-9]*)/?$"); Matcher ma = pa.matcher(url.toExternalForm()); if (ma.matches()) { return true; @@ -50,7 +50,7 @@ public class CheveretoRipper extends AbstractHTMLRipper { @Override public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("(?:https?://)?(?:www\\.)?[a-z1-9]*\\.[a-z1-9]*/album/([a-zA-Z1-9]*)/?$"); + Pattern p = Pattern.compile("(?:https?://)?(?:www\\.)?[a-z1-9-]*\\.[a-z1-9]*/album/([a-zA-Z1-9]*)/?$"); Matcher m = p.matcher(url.toExternalForm()); if (m.matches()) { return m.group(1); From 3241ae0a84046a21fbfb69ac8604c44263e73671 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Sun, 19 Mar 2017 18:39:29 -0400 Subject: [PATCH 7/7] CheveretoRipper now saves album title --- .../ripme/ripper/rippers/CheveretoRipper.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java index 64cd9af7..e235d90d 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java @@ -48,6 
+48,22 @@ public class CheveretoRipper extends AbstractHTMLRipper { return false; } + @Override + public String getAlbumTitle(URL url) throws MalformedURLException { + try { + // Attempt to use album title as GID + Element titleElement = getFirstPage().select("meta[property=og:title]").first(); + String title = titleElement.attr("content"); + title = title.substring(title.lastIndexOf('/') + 1); + return getHost() + "_" + title.trim(); + } catch (IOException e) { + // Fall back to default album naming convention + logger.info("Unable to find title at " + url); + } + return super.getAlbumTitle(url); + } + + @Override public String getGID(URL url) throws MalformedURLException { Pattern p = Pattern.compile("(?:https?://)?(?:www\\.)?[a-z1-9-]*\\.[a-z1-9]*/album/([a-zA-Z1-9]*)/?$");