From 21a81342d9f1eaa330d04d7b596d7d20966c1509 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Mon, 14 May 2018 08:21:59 -0400 Subject: [PATCH] Rewrote the myhentaicomics ripper to use the new getAlbumsToQueue func --- .../ripper/rippers/MyhentaicomicsRipper.java | 185 +++--------------- 1 file changed, 31 insertions(+), 154 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/MyhentaicomicsRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/MyhentaicomicsRipper.java index 5b60c4f2..60c90ae1 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/MyhentaicomicsRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/MyhentaicomicsRipper.java @@ -34,21 +34,18 @@ public class MyhentaicomicsRipper extends AbstractHTMLRipper { Pattern p = Pattern.compile("^https?://myhentaicomics.com/index.php/([a-zA-Z0-9-]*)/?$"); Matcher m = p.matcher(url.toExternalForm()); if (m.matches()) { - isTag = false; return m.group(1); } Pattern pa = Pattern.compile("^https?://myhentaicomics.com/index.php/search\\?q=([a-zA-Z0-9-]*)([a-zA-Z0-9=&]*)?$"); Matcher ma = pa.matcher(url.toExternalForm()); if (ma.matches()) { - isTag = true; return ma.group(1); } Pattern pat = Pattern.compile("^https?://myhentaicomics.com/index.php/tag/([0-9]*)/?([a-zA-Z%0-9+?=:]*)?$"); Matcher mat = pat.matcher(url.toExternalForm()); if (mat.matches()) { - isTag = true; return mat.group(1); } @@ -56,6 +53,37 @@ public class MyhentaicomicsRipper extends AbstractHTMLRipper { "myhentaicomics.com/index.php/albumName - got " + url + " instead"); } + @Override + public boolean hasQueueSupport() { + return true; + } + + @Override + public boolean pageContainsAlbums(URL url) { + Pattern pa = Pattern.compile("^https?://myhentaicomics.com/index.php/search\\?q=([a-zA-Z0-9-]*)([a-zA-Z0-9=&]*)?$"); + Matcher ma = pa.matcher(url.toExternalForm()); + if (ma.matches()) { + return true; + } + + Pattern pat = Pattern.compile("^https?://myhentaicomics.com/index.php/tag/([0-9]*)/?([a-zA-Z%0-9+?=:]*)?$"); + Matcher mat = pat.matcher(url.toExternalForm()); + if (mat.matches()) { + isTag = true; + return true; + } + return false; + } + + @Override + public List getAlbumsToQueue(Document doc) { + List urlsToAddToQueue = new ArrayList<>(); + for (Element elem : doc.select(".g-album > a")) { + urlsToAddToQueue.add(getDomain() + elem.attr("href")); + } + return urlsToAddToQueue; + } + @Override public Document getFirstPage() throws IOException { // "url" is an instance field of the superclass @@ -81,161 +109,11 @@ public class MyhentaicomicsRipper extends AbstractHTMLRipper { return Http.url(nextUrl).get(); } - // This replaces getNextPage when downloading from searchs and tags - private List getNextAlbumPage(String pageUrl) { - List albumPagesList = new ArrayList<>(); - int pageNumber = 1; - albumPagesList.add("http://myhentaicomics.com/index.php/" + pageUrl.split("\\?")[0] + "?page=" + Integer.toString(pageNumber)); - while (true) { - String urlToGet = "http://myhentaicomics.com/index.php/" + pageUrl.split("\\?")[0] + "?page=" + Integer.toString(pageNumber); - Document nextAlbumPage; - try { - logger.info("Grabbing " + urlToGet); - nextAlbumPage = Http.url(urlToGet).get(); - } catch (IOException e) { - logger.warn("Failed to log link in Jsoup"); - nextAlbumPage = null; - e.printStackTrace(); - } - Element elem = nextAlbumPage.select("a.ui-icon-right").first(); - String nextPage = elem.attr("href"); - pageNumber = pageNumber + 1; - if (nextPage.equals("")) { - logger.info("Got " + pageNumber + " pages"); - break; - } - else { - logger.info(nextPage); - albumPagesList.add(nextPage); - logger.info("Adding " + nextPage); - } - } - return albumPagesList; - } - private List getAlbumsFromPage(String url) { - List pagesToRip; - List result = new ArrayList<>(); - logger.info("Running getAlbumsFromPage"); - Document doc; - try { - doc = Http.url("http://myhentaicomics.com" + url).get(); - } catch (IOException e) { - logger.warn("Failed to log link in Jsoup"); - doc = null; - e.printStackTrace(); - } - // This for goes over every album on the page - for (Element elem : doc.select("li.g-album > a")) { - String link = elem.attr("href"); - logger.info("Grabbing album " + link); - pagesToRip = getNextAlbumPage(link); - logger.info(pagesToRip); - for (String element : pagesToRip) { - Document album_doc; - try { - logger.info("grabbing " + element + " with jsoup"); - boolean startsWithHttp = element.startsWith("http://"); - if (!startsWithHttp) { - album_doc = Http.url("http://myhentaicomics.com/" + element).get(); - } - else { - album_doc = Http.url(element).get(); - } - } catch (IOException e) { - logger.warn("Failed to log link in Jsoup"); - album_doc = null; - e.printStackTrace(); - } - for (Element el :album_doc.select("img")) { - String imageSource = el.attr("src"); - // This bool is here so we don't try and download the site logo - if (!imageSource.startsWith("http://")) { - // We replace thumbs with resizes so we can the full sized images - imageSource = imageSource.replace("thumbs", "resizes"); - String url_string = "http://myhentaicomics.com/" + imageSource; - url_string = url_string.replace("%20", "_"); - url_string = url_string.replace("%27", ""); - url_string = url_string.replace("%28", "_"); - url_string = url_string.replace("%29", "_"); - url_string = url_string.replace("%2C", "_"); - if (isTag) { - logger.info("Downloading from a tag or search"); - try { - sleep(500); - result.add("http://myhentaicomics.com/" + imageSource); - addURLToDownload(new URL("http://myhentaicomics.com/" + imageSource), "", url_string.split("/")[6]); - } - catch (MalformedURLException e) { - logger.warn("Malformed URL"); - e.printStackTrace(); - } - } - } - } - } - } - return result; - } - - private List getListOfPages(Document doc) { - List pages = new ArrayList<>(); - // Get the link from the last button - String nextPageUrl = doc.select("a.ui-icon-right").last().attr("href"); - Pattern pat = Pattern.compile("/index\\.php/tag/[0-9]*/[a-zA-Z0-9_\\-:+]*\\?page=(\\d+)"); - Matcher mat = pat.matcher(nextPageUrl); - if (mat.matches()) { - logger.debug("Getting pages from a tag"); - String base_link = mat.group(0).replaceAll("\\?page=\\d+", ""); - logger.debug("base_link is " + base_link); - int numOfPages = Integer.parseInt(mat.group(1)); - for (int x = 1; x != numOfPages +1; x++) { - logger.debug("running loop"); - String link = base_link + "?page=" + Integer.toString(x); - pages.add(link); - } - } else { - Pattern pa = Pattern.compile("/index\\.php/search\\?q=[a-zA-Z0-9_\\-:]*&page=(\\d+)"); - Matcher ma = pa.matcher(nextPageUrl); - if (ma.matches()) { - logger.debug("Getting pages from a search"); - String base_link = ma.group(0).replaceAll("page=\\d+", ""); - logger.debug("base_link is " + base_link); - int numOfPages = Integer.parseInt(ma.group(1)); - for (int x = 1; x != numOfPages +1; x++) { - logger.debug("running loop"); - String link = base_link + "page=" + Integer.toString(x); - logger.debug(link); - pages.add(link); - } - } - } - return pages; - } @Override public List getURLsFromPage(Document doc) { List result = new ArrayList<>(); - // Checks if this is a comic page or a page of albums - // If true the page is a page of albums - if (doc.toString().contains("class=\"g-item g-album\"")) { - // This if checks that there is more than 1 page - if (!doc.select("a.ui-icon-right").last().attr("href").equals("")) { - // There is more than one page so we call getListOfPages - List pagesToRip = getListOfPages(doc); - logger.debug("Pages to rip = " + pagesToRip); - for (String url : pagesToRip) { - logger.debug("Getting albums from " + url); - result = getAlbumsFromPage(url); - } - } else { - logger.debug("There is only one page on this page of albums"); - // There is only 1 page so we call getAlbumsFromPage and pass it the page url - result = getAlbumsFromPage(doc.select("div.g-description > a").attr("href")); - } - return result; - } - else { for (Element el : doc.select("img")) { String imageSource = el.attr("src"); // This bool is here so we don't try and download the site logo @@ -245,7 +123,6 @@ public class MyhentaicomicsRipper extends AbstractHTMLRipper { result.add("http://myhentaicomics.com/" + imageSource); } } - } return result; }