From c5a8f0cd0885eb6f00b332d5746946a1b2832c92 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Mon, 14 May 2018 08:20:24 -0400 Subject: [PATCH 1/4] Added func to add urls to the download queue --- src/main/java/com/rarchives/ripme/ui/MainWindow.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/java/com/rarchives/ripme/ui/MainWindow.java b/src/main/java/com/rarchives/ripme/ui/MainWindow.java index 95961f44..59f75e06 100644 --- a/src/main/java/com/rarchives/ripme/ui/MainWindow.java +++ b/src/main/java/com/rarchives/ripme/ui/MainWindow.java @@ -162,6 +162,11 @@ public final class MainWindow implements Runnable, RipStatusHandler { return checkbox; } + + public static void addUrlToQueue(String url) { + queueListModel.addElement(url); + } + public MainWindow() { mainFrame = new JFrame("RipMe v" + UpdateUtils.getThisJarVersion()); mainFrame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); From e33d24364e4b4116acf1a637cb4adb7d6cb7e927 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Mon, 14 May 2018 08:21:09 -0400 Subject: [PATCH 2/4] Added a way for rippers to add urls to the download queue --- .../ripme/ripper/AbstractHTMLRipper.java | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java index e0fd3548..436eb122 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java @@ -11,6 +11,7 @@ import org.jsoup.nodes.Document; import com.rarchives.ripme.ui.RipStatusMessage.STATUS; import com.rarchives.ripme.utils.Utils; +import com.rarchives.ripme.ui.MainWindow; /** * Simplified ripper, designed for ripping from sites by parsing HTML. @@ -53,12 +54,29 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { protected boolean hasDescriptionSupport() { return false; } + protected String[] getDescription(String url, Document page) throws IOException { throw new IOException("getDescription not implemented"); // Do I do this or make an abstract function? } protected int descSleepTime() { return 100; } + + protected List getAlbumsToQueue(Document doc) { + return null; + } + + // If a page has Queue support than it has no images we want to download, just a list of urls we want to add to + // the queue + protected boolean hasQueueSupport() { + return false; + } + + // Takes a url and checks if it is for a page of albums + protected boolean pageContainsAlbums(URL url) { + return false; + } + @Override public void rip() throws IOException { int index = 0; @@ -67,6 +85,15 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm()); Document doc = getFirstPage(); + if (hasQueueSupport() && pageContainsAlbums(this.url)) { + List urls = getAlbumsToQueue(doc); + for (String url : urls) { + MainWindow.addUrlToQueue(url); + } + + doc = null; + } + while (doc != null) { if (alreadyDownloadedUrls >= Utils.getConfigInteger("history.end_rip_after_already_seen", 1000000000) && !isThisATest()) { sendUpdate(STATUS.DOWNLOAD_COMPLETE, "Already seen the last " + alreadyDownloadedUrls + " images ending rip"); From 21a81342d9f1eaa330d04d7b596d7d20966c1509 Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Mon, 14 May 2018 08:21:59 -0400 Subject: [PATCH 3/4] Rewrote the myhentaicomics ripper to use the new getAlbumsToQueue func --- .../ripper/rippers/MyhentaicomicsRipper.java | 185 +++--------------- 1 file changed, 31 insertions(+), 154 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/MyhentaicomicsRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/MyhentaicomicsRipper.java index 5b60c4f2..60c90ae1 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/MyhentaicomicsRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/MyhentaicomicsRipper.java @@ -34,21 +34,18 @@ public class MyhentaicomicsRipper extends AbstractHTMLRipper { Pattern p = Pattern.compile("^https?://myhentaicomics.com/index.php/([a-zA-Z0-9-]*)/?$"); Matcher m = p.matcher(url.toExternalForm()); if (m.matches()) { - isTag = false; return m.group(1); } Pattern pa = Pattern.compile("^https?://myhentaicomics.com/index.php/search\\?q=([a-zA-Z0-9-]*)([a-zA-Z0-9=&]*)?$"); Matcher ma = pa.matcher(url.toExternalForm()); if (ma.matches()) { - isTag = true; return ma.group(1); } Pattern pat = Pattern.compile("^https?://myhentaicomics.com/index.php/tag/([0-9]*)/?([a-zA-Z%0-9+?=:]*)?$"); Matcher mat = pat.matcher(url.toExternalForm()); if (mat.matches()) { - isTag = true; return mat.group(1); } @@ -56,6 +53,37 @@ public class MyhentaicomicsRipper extends AbstractHTMLRipper { "myhentaicomics.com/index.php/albumName - got " + url + " instead"); } + @Override + public boolean hasQueueSupport() { + return true; + } + + @Override + public boolean pageContainsAlbums(URL url) { + Pattern pa = Pattern.compile("^https?://myhentaicomics.com/index.php/search\\?q=([a-zA-Z0-9-]*)([a-zA-Z0-9=&]*)?$"); + Matcher ma = pa.matcher(url.toExternalForm()); + if (ma.matches()) { + return true; + } + + Pattern pat = Pattern.compile("^https?://myhentaicomics.com/index.php/tag/([0-9]*)/?([a-zA-Z%0-9+?=:]*)?$"); + Matcher mat = pat.matcher(url.toExternalForm()); + if (mat.matches()) { + isTag = true; + return true; + } + return false; + } + + @Override + public List getAlbumsToQueue(Document doc) { + List urlsToAddToQueue = new ArrayList<>(); + for (Element elem : doc.select(".g-album > a")) { + urlsToAddToQueue.add(getDomain() + elem.attr("href")); + } + return urlsToAddToQueue; + } + @Override public Document getFirstPage() throws IOException { // "url" is an instance field of the superclass @@ -81,161 +109,11 @@ public class MyhentaicomicsRipper extends AbstractHTMLRipper { return Http.url(nextUrl).get(); } - // This replaces getNextPage when downloading from searchs and tags - private List getNextAlbumPage(String pageUrl) { - List albumPagesList = new ArrayList<>(); - int pageNumber = 1; - albumPagesList.add("http://myhentaicomics.com/index.php/" + pageUrl.split("\\?")[0] + "?page=" + Integer.toString(pageNumber)); - while (true) { - String urlToGet = "http://myhentaicomics.com/index.php/" + pageUrl.split("\\?")[0] + "?page=" + Integer.toString(pageNumber); - Document nextAlbumPage; - try { - logger.info("Grabbing " + urlToGet); - nextAlbumPage = Http.url(urlToGet).get(); - } catch (IOException e) { - logger.warn("Failed to log link in Jsoup"); - nextAlbumPage = null; - e.printStackTrace(); - } - Element elem = nextAlbumPage.select("a.ui-icon-right").first(); - String nextPage = elem.attr("href"); - pageNumber = pageNumber + 1; - if (nextPage.equals("")) { - logger.info("Got " + pageNumber + " pages"); - break; - } - else { - logger.info(nextPage); - albumPagesList.add(nextPage); - logger.info("Adding " + nextPage); - } - } - return albumPagesList; - } - private List getAlbumsFromPage(String url) { - List pagesToRip; - List result = new ArrayList<>(); - logger.info("Running getAlbumsFromPage"); - Document doc; - try { - doc = Http.url("http://myhentaicomics.com" + url).get(); - } catch (IOException e) { - logger.warn("Failed to log link in Jsoup"); - doc = null; - e.printStackTrace(); - } - // This for goes over every album on the page - for (Element elem : doc.select("li.g-album > a")) { - String link = elem.attr("href"); - logger.info("Grabbing album " + link); - pagesToRip = getNextAlbumPage(link); - logger.info(pagesToRip); - for (String element : pagesToRip) { - Document album_doc; - try { - logger.info("grabbing " + element + " with jsoup"); - boolean startsWithHttp = element.startsWith("http://"); - if (!startsWithHttp) { - album_doc = Http.url("http://myhentaicomics.com/" + element).get(); - } - else { - album_doc = Http.url(element).get(); - } - } catch (IOException e) { - logger.warn("Failed to log link in Jsoup"); - album_doc = null; - e.printStackTrace(); - } - for (Element el :album_doc.select("img")) { - String imageSource = el.attr("src"); - // This bool is here so we don't try and download the site logo - if (!imageSource.startsWith("http://")) { - // We replace thumbs with resizes so we can the full sized images - imageSource = imageSource.replace("thumbs", "resizes"); - String url_string = "http://myhentaicomics.com/" + imageSource; - url_string = url_string.replace("%20", "_"); - url_string = url_string.replace("%27", ""); - url_string = url_string.replace("%28", "_"); - url_string = url_string.replace("%29", "_"); - url_string = url_string.replace("%2C", "_"); - if (isTag) { - logger.info("Downloading from a tag or search"); - try { - sleep(500); - result.add("http://myhentaicomics.com/" + imageSource); - addURLToDownload(new URL("http://myhentaicomics.com/" + imageSource), "", url_string.split("/")[6]); - } - catch (MalformedURLException e) { - logger.warn("Malformed URL"); - e.printStackTrace(); - } - } - } - } - } - } - return result; - } - - private List getListOfPages(Document doc) { - List pages = new ArrayList<>(); - // Get the link from the last button - String nextPageUrl = doc.select("a.ui-icon-right").last().attr("href"); - Pattern pat = Pattern.compile("/index\\.php/tag/[0-9]*/[a-zA-Z0-9_\\-:+]*\\?page=(\\d+)"); - Matcher mat = pat.matcher(nextPageUrl); - if (mat.matches()) { - logger.debug("Getting pages from a tag"); - String base_link = mat.group(0).replaceAll("\\?page=\\d+", ""); - logger.debug("base_link is " + base_link); - int numOfPages = Integer.parseInt(mat.group(1)); - for (int x = 1; x != numOfPages +1; x++) { - logger.debug("running loop"); - String link = base_link + "?page=" + Integer.toString(x); - pages.add(link); - } - } else { - Pattern pa = Pattern.compile("/index\\.php/search\\?q=[a-zA-Z0-9_\\-:]*&page=(\\d+)"); - Matcher ma = pa.matcher(nextPageUrl); - if (ma.matches()) { - logger.debug("Getting pages from a search"); - String base_link = ma.group(0).replaceAll("page=\\d+", ""); - logger.debug("base_link is " + base_link); - int numOfPages = Integer.parseInt(ma.group(1)); - for (int x = 1; x != numOfPages +1; x++) { - logger.debug("running loop"); - String link = base_link + "page=" + Integer.toString(x); - logger.debug(link); - pages.add(link); - } - } - } - return pages; - } @Override public List getURLsFromPage(Document doc) { List result = new ArrayList<>(); - // Checks if this is a comic page or a page of albums - // If true the page is a page of albums - if (doc.toString().contains("class=\"g-item g-album\"")) { - // This if checks that there is more than 1 page - if (!doc.select("a.ui-icon-right").last().attr("href").equals("")) { - // There is more than one page so we call getListOfPages - List pagesToRip = getListOfPages(doc); - logger.debug("Pages to rip = " + pagesToRip); - for (String url : pagesToRip) { - logger.debug("Getting albums from " + url); - result = getAlbumsFromPage(url); - } - } else { - logger.debug("There is only one page on this page of albums"); - // There is only 1 page so we call getAlbumsFromPage and pass it the page url - result = getAlbumsFromPage(doc.select("div.g-description > a").attr("href")); - } - return result; - } - else { for (Element el : doc.select("img")) { String imageSource = el.attr("src"); // This bool is here so we don't try and download the site logo @@ -245,7 +123,6 @@ public class MyhentaicomicsRipper extends AbstractHTMLRipper { result.add("http://myhentaicomics.com/" + imageSource); } } - } return result; } From 6a73e6c06bf7de45885175e75b211b3d9685b79d Mon Sep 17 00:00:00 2001 From: cyian-1756 Date: Wed, 16 May 2018 04:31:25 -0400 Subject: [PATCH 4/4] Fixed spelling mistake added another comment --- .../java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java index 436eb122..2eefe873 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java @@ -66,7 +66,7 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { return null; } - // If a page has Queue support than it has no images we want to download, just a list of urls we want to add to + // If a page has Queue support then it has no images we want to download, just a list of urls we want to add to // the queue protected boolean hasQueueSupport() { return false; @@ -91,6 +91,7 @@ public abstract class AbstractHTMLRipper extends AlbumRipper { MainWindow.addUrlToQueue(url); } + // We set doc to null here so the while loop below this doesn't fire doc = null; }