From 82e6343b0e5d616ef684c211d2022e0e611a241a Mon Sep 17 00:00:00 2001
From: Erwin de Haan
Date: Mon, 9 Mar 2015 21:08:06 +0100
Subject: [PATCH 1/2] Added Natalie.mu news album ripper. Rips from both album and news pages.

---
 .../ripme/ripper/rippers/NatalieMuRipper.java | 167 ++++++++++++++++++
 .../ripper/rippers/NatalieMuRipperTest.java   |  58 ++++++
 2 files changed, 225 insertions(+)
 create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/NatalieMuRipper.java
 create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/NatalieMuRipperTest.java

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/NatalieMuRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/NatalieMuRipper.java
new file mode 100644
index 00000000..51f2bdda
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/NatalieMuRipper.java
@@ -0,0 +1,167 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
+import com.rarchives.ripme.utils.Http;
+
+/**
+ * Ripper for the image galleries attached to natalie.mu news articles.
+ * Handles both article URLs (.../news/NNNNNN) and gallery URLs
+ * (.../gallery/show/news_id/NNNNNN/image_id/MMMMMM).
+ */
+public class NatalieMuRipper extends AbstractHTMLRipper {
+
+    /** Captures the news id of a gallery URL; a slash must follow the digits. */
+    private static final Pattern GALLERY_ID_PATTERN = Pattern.compile("/news_id/([0-9]+)/");
+
+    /** Captures the news id of an article URL. */
+    private static final Pattern NEWS_ID_PATTERN = Pattern.compile("/news/([0-9]+)/?");
+
+    /** Captures the thumbnail URL inside a gallery span's inline background-image style. */
+    private static final Pattern THUMBNAIL_PATTERN = Pattern.compile(
+            "background-image: url\\((.*list_thumb_inbox.*)\\);", Pattern.CASE_INSENSITIVE);
+
+    // Not read or written anywhere in this class.
+    public int news_id = 0;
+
+    public NatalieMuRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    /**
+     * Builds the host label from the second-level domain and the first path
+     * segment, e.g. "natalie_music" for http://cdn2.natalie.mu/music/news/140411.
+     */
+    @Override
+    public String getHost() {
+        String host = this.url.getHost();
+        // Drop the top-level domain: "cdn2.natalie.mu" -> "cdn2.natalie".
+        host = host.substring(0, host.lastIndexOf('.'));
+        if (host.contains(".")) {
+            // Drop the subdomain: "cdn2.natalie" -> "natalie".
+            host = host.substring(host.lastIndexOf('.') + 1);
+        }
+        // Splitting "http://host/section/..." on "/" leaves the section at index 3.
+        String section = this.url.toExternalForm().split("/")[3];
+        return host + "_" + section;
+    }
+
+    /**
+     * Accepts URLs like:
+     *   http://cdn2.natalie.mu/music/gallery/show/news_id/xxxxxx/image_id/xxxxxx
+     *   http://cdn2.natalie.mu/music/news/140411
+     * The host must be natalie.mu or one of its subdomains; "natalie.mu"
+     * appearing only in the path or query of another site is not accepted.
+     */
+    @Override
+    public boolean canRip(URL url) {
+        String host = url.getHost();
+        boolean onNatalie = host != null
+                && (host.equals("natalie.mu") || host.endsWith(".natalie.mu"));
+        if (!onNatalie) {
+            return false;
+        }
+        String externalForm = url.toExternalForm();
+        return externalForm.contains("/news_id/") || externalForm.contains("/news/");
+    }
+
+    /**
+     * Returns the numeric news id found in the URL.
+     *
+     * @throws MalformedURLException if the URL has neither a /news_id/NNNNNN/
+     *         nor a /news/NNNNNN segment
+     */
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        String u = url.toExternalForm();
+        // A URL containing "/news_id/" is only matched against the gallery
+        // pattern, even if it also contains "/news/".
+        Pattern idPattern = null;
+        if (u.contains("/news_id/")) {
+            idPattern = GALLERY_ID_PATTERN;
+        } else if (u.contains("/news/")) {
+            idPattern = NEWS_ID_PATTERN;
+        }
+        if (idPattern != null) {
+            Matcher m = idPattern.matcher(u);
+            if (m.find()) {
+                return m.group(1);
+            }
+        }
+
+        throw new MalformedURLException(
+                "Expected natalie.mu URL formats: "
+                        + "http://natalie.mu/music/news/xxxxxx or http://natalie.mu/music/gallery/show/news_id/xxxxxx/image_id/yyyyyy"
+                        + " Got: " + u);
+    }
+
+    @Override
+    public String getDomain() {
+        return this.url.getHost();
+    }
+
+    @Override
+    public Document getFirstPage() throws IOException {
+        return Http.url(this.url).get();
+    }
+
+    /**
+     * Collects the full-size image URL for each gallery thumbnail on the page.
+     * Thumbnails are the spans under .NA_articleGallery whose inline style
+     * sets the thumbnail as a CSS background-image.
+     */
+    @Override
+    public List<String> getURLsFromPage(Document page) {
+        List<String> imageURLs = new ArrayList<String>();
+        for (Element span : page.select(".NA_articleGallery span")) {
+            if (!span.hasAttr("style")) {
+                continue;
+            }
+            Matcher m = THUMBNAIL_PATTERN.matcher(span.attr("style").trim());
+            if (m.find()) {
+                String imgUrl = m.group(1);
+                if (imgUrl.startsWith("//")) {
+                    // Protocol-relative URL
+                    imgUrl = "http:" + imgUrl;
+                }
+                if (imgUrl.startsWith("/")) {
+                    // Host-relative URL
+                    imgUrl = "http://" + this.url.getHost() + imgUrl;
+                }
+                // Convert the thumbnail URL into the full-size URL
+                imgUrl = imgUrl.replace("list_thumb_inbox", "xlarge");
+                // Don't download the same URL twice
+                if (imageURLs.contains(imgUrl)) {
+                    logger.debug("Already attempted: " + imgUrl);
+                    continue;
+                }
+                imageURLs.add(imgUrl);
+                if (isThisATest()) {
+                    break;
+                }
+            }
+
+            if (isStopped()) {
+                break;
+            }
+        }
+        return imageURLs;
+    }
+
+    @Override
+    public void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index), "", this.url.toString(), null);
+    }
+}
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NatalieMuRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NatalieMuRipperTest.java
new file mode 100644
index 00000000..2d6c6b44
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NatalieMuRipperTest.java
@@ -0,0 +1,58 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.NatalieMuRipper;
+
+public class NatalieMuRipperTest extends RippersTest {
+
+    public void testNatalieMuURLFailures() throws IOException {
+        // No failing URLs are listed yet, so the loop below currently checks nothing.
+        List<URL> failURLs = new ArrayList<URL>();
+        // URLs that should not work
+        for (URL url : failURLs) {
+            try {
+                new NatalieMuRipper(url);
+                fail("Instantiated ripper for URL that should not work: " + url);
+            } catch (Exception e) {
+                // Expected
+                continue;
+            }
+        }
+    }
+
+    public void testNatalieMuURLPasses() throws IOException {
+        List<URL> passURLs = new ArrayList<URL>();
+        // URLs that should work
+        passURLs.add(new URL("http://natalie.mu/music/news/140367"));
+        passURLs.add(new URL("http://cdn2.natalie.mu/music/news/140411"));
+        passURLs.add(new URL("http://cdn2.natalie.mu/music/gallery/show/news_id/140411/image_id/369655"));
+        passURLs.add(new URL("http://natalie.mu/music/gallery/show/news_id/139146/image_id/365218"));
+        for (URL url : passURLs) {
+            NatalieMuRipper ripper = new NatalieMuRipper(url);
+            ripper.setup();
+            // assertTrue, not the assert keyword: assert is skipped unless the JVM runs with -ea.
+            assertTrue("canRip rejected " + url, ripper.canRip(url));
+            assertNotNull("Ripper for " + url + " did not have a valid working directory.",
+                    ripper.getWorkingDir());
+            deleteDir(ripper.getWorkingDir());
+        }
+    }
+
+    public void testNatalieMuRipper() throws IOException {
+        List<URL> contentURLs = new ArrayList<URL>();
+        // URLs that should return more than 1 image
+        contentURLs.add(new URL("http://natalie.mu/music/news/140367"));
+        contentURLs.add(new URL("http://cdn2.natalie.mu/music/news/140411"));
+        contentURLs.add(new URL("http://cdn2.natalie.mu/music/gallery/show/news_id/140411/image_id/369655"));
+        contentURLs.add(new URL("http://natalie.mu/music/gallery/show/news_id/139146/image_id/365218"));
+        for (URL url : contentURLs) {
+            NatalieMuRipper ripper = new NatalieMuRipper(url);
+            testRipper(ripper);
+        }
+    }
+
+}

From 0ba8786e0fec5ad2c9eac57c40897ac4b3462295 Mon Sep 17 00:00:00 2001
From: Erwin de Haan
Date: Mon, 9 Mar 2015 21:08:06 +0100
Subject: [PATCH 2/2] Added Natalie.mu news album ripper. Rips from both album and news pages.

---
 .../ripme/ripper/rippers/NatalieMuRipper.java | 167 ++++++++++++++++++
 .../ripper/rippers/NatalieMuRipperTest.java   |  58 ++++++
 2 files changed, 225 insertions(+)
 create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/NatalieMuRipper.java
 create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/NatalieMuRipperTest.java

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/NatalieMuRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/NatalieMuRipper.java
new file mode 100644
index 00000000..51f2bdda
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/NatalieMuRipper.java
@@ -0,0 +1,167 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
+import com.rarchives.ripme.utils.Http;
+
+/**
+ * Ripper for the image galleries attached to natalie.mu news articles.
+ * Handles both article URLs (.../news/NNNNNN) and gallery URLs
+ * (.../gallery/show/news_id/NNNNNN/image_id/MMMMMM).
+ */
+public class NatalieMuRipper extends AbstractHTMLRipper {
+
+    /** Captures the news id of a gallery URL; a slash must follow the digits. */
+    private static final Pattern GALLERY_ID_PATTERN = Pattern.compile("/news_id/([0-9]+)/");
+
+    /** Captures the news id of an article URL. */
+    private static final Pattern NEWS_ID_PATTERN = Pattern.compile("/news/([0-9]+)/?");
+
+    /** Captures the thumbnail URL inside a gallery span's inline background-image style. */
+    private static final Pattern THUMBNAIL_PATTERN = Pattern.compile(
+            "background-image: url\\((.*list_thumb_inbox.*)\\);", Pattern.CASE_INSENSITIVE);
+
+    // Not read or written anywhere in this class.
+    public int news_id = 0;
+
+    public NatalieMuRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    /**
+     * Builds the host label from the second-level domain and the first path
+     * segment, e.g. "natalie_music" for http://cdn2.natalie.mu/music/news/140411.
+     */
+    @Override
+    public String getHost() {
+        String host = this.url.getHost();
+        // Drop the top-level domain: "cdn2.natalie.mu" -> "cdn2.natalie".
+        host = host.substring(0, host.lastIndexOf('.'));
+        if (host.contains(".")) {
+            // Drop the subdomain: "cdn2.natalie" -> "natalie".
+            host = host.substring(host.lastIndexOf('.') + 1);
+        }
+        // Splitting "http://host/section/..." on "/" leaves the section at index 3.
+        String section = this.url.toExternalForm().split("/")[3];
+        return host + "_" + section;
+    }
+
+    /**
+     * Accepts URLs like:
+     *   http://cdn2.natalie.mu/music/gallery/show/news_id/xxxxxx/image_id/xxxxxx
+     *   http://cdn2.natalie.mu/music/news/140411
+     * The host must be natalie.mu or one of its subdomains; "natalie.mu"
+     * appearing only in the path or query of another site is not accepted.
+     */
+    @Override
+    public boolean canRip(URL url) {
+        String host = url.getHost();
+        boolean onNatalie = host != null
+                && (host.equals("natalie.mu") || host.endsWith(".natalie.mu"));
+        if (!onNatalie) {
+            return false;
+        }
+        String externalForm = url.toExternalForm();
+        return externalForm.contains("/news_id/") || externalForm.contains("/news/");
+    }
+
+    /**
+     * Returns the numeric news id found in the URL.
+     *
+     * @throws MalformedURLException if the URL has neither a /news_id/NNNNNN/
+     *         nor a /news/NNNNNN segment
+     */
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        String u = url.toExternalForm();
+        // A URL containing "/news_id/" is only matched against the gallery
+        // pattern, even if it also contains "/news/".
+        Pattern idPattern = null;
+        if (u.contains("/news_id/")) {
+            idPattern = GALLERY_ID_PATTERN;
+        } else if (u.contains("/news/")) {
+            idPattern = NEWS_ID_PATTERN;
+        }
+        if (idPattern != null) {
+            Matcher m = idPattern.matcher(u);
+            if (m.find()) {
+                return m.group(1);
+            }
+        }
+
+        throw new MalformedURLException(
+                "Expected natalie.mu URL formats: "
+                        + "http://natalie.mu/music/news/xxxxxx or http://natalie.mu/music/gallery/show/news_id/xxxxxx/image_id/yyyyyy"
+                        + " Got: " + u);
+    }
+
+    @Override
+    public String getDomain() {
+        return this.url.getHost();
+    }
+
+    @Override
+    public Document getFirstPage() throws IOException {
+        return Http.url(this.url).get();
+    }
+
+    /**
+     * Collects the full-size image URL for each gallery thumbnail on the page.
+     * Thumbnails are the spans under .NA_articleGallery whose inline style
+     * sets the thumbnail as a CSS background-image.
+     */
+    @Override
+    public List<String> getURLsFromPage(Document page) {
+        List<String> imageURLs = new ArrayList<String>();
+        for (Element span : page.select(".NA_articleGallery span")) {
+            if (!span.hasAttr("style")) {
+                continue;
+            }
+            Matcher m = THUMBNAIL_PATTERN.matcher(span.attr("style").trim());
+            if (m.find()) {
+                String imgUrl = m.group(1);
+                if (imgUrl.startsWith("//")) {
+                    // Protocol-relative URL
+                    imgUrl = "http:" + imgUrl;
+                }
+                if (imgUrl.startsWith("/")) {
+                    // Host-relative URL
+                    imgUrl = "http://" + this.url.getHost() + imgUrl;
+                }
+                // Convert the thumbnail URL into the full-size URL
+                imgUrl = imgUrl.replace("list_thumb_inbox", "xlarge");
+                // Don't download the same URL twice
+                if (imageURLs.contains(imgUrl)) {
+                    logger.debug("Already attempted: " + imgUrl);
+                    continue;
+                }
+                imageURLs.add(imgUrl);
+                if (isThisATest()) {
+                    break;
+                }
+            }
+
+            if (isStopped()) {
+                break;
+            }
+        }
+        return imageURLs;
+    }
+
+    @Override
+    public void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index), "", this.url.toString(), null);
+    }
+}
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NatalieMuRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NatalieMuRipperTest.java
new file mode 100644
index 00000000..2d6c6b44
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NatalieMuRipperTest.java
@@ -0,0 +1,58 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.NatalieMuRipper;
+
+public class NatalieMuRipperTest extends RippersTest {
+
+    public void testNatalieMuURLFailures() throws IOException {
+        // No failing URLs are listed yet, so the loop below currently checks nothing.
+        List<URL> failURLs = new ArrayList<URL>();
+        // URLs that should not work
+        for (URL url : failURLs) {
+            try {
+                new NatalieMuRipper(url);
+                fail("Instantiated ripper for URL that should not work: " + url);
+            } catch (Exception e) {
+                // Expected
+                continue;
+            }
+        }
+    }
+
+    public void testNatalieMuURLPasses() throws IOException {
+        List<URL> passURLs = new ArrayList<URL>();
+        // URLs that should work
+        passURLs.add(new URL("http://natalie.mu/music/news/140367"));
+        passURLs.add(new URL("http://cdn2.natalie.mu/music/news/140411"));
+        passURLs.add(new URL("http://cdn2.natalie.mu/music/gallery/show/news_id/140411/image_id/369655"));
+        passURLs.add(new URL("http://natalie.mu/music/gallery/show/news_id/139146/image_id/365218"));
+        for (URL url : passURLs) {
+            NatalieMuRipper ripper = new NatalieMuRipper(url);
+            ripper.setup();
+            // assertTrue, not the assert keyword: assert is skipped unless the JVM runs with -ea.
+            assertTrue("canRip rejected " + url, ripper.canRip(url));
+            assertNotNull("Ripper for " + url + " did not have a valid working directory.",
+                    ripper.getWorkingDir());
+            deleteDir(ripper.getWorkingDir());
+        }
+    }
+
+    public void testNatalieMuRipper() throws IOException {
+        List<URL> contentURLs = new ArrayList<URL>();
+        // URLs that should return more than 1 image
+        contentURLs.add(new URL("http://natalie.mu/music/news/140367"));
+        contentURLs.add(new URL("http://cdn2.natalie.mu/music/news/140411"));
+        contentURLs.add(new URL("http://cdn2.natalie.mu/music/gallery/show/news_id/140411/image_id/369655"));
+        contentURLs.add(new URL("http://natalie.mu/music/gallery/show/news_id/139146/image_id/365218"));
+        for (URL url : contentURLs) {
+            NatalieMuRipper ripper = new NatalieMuRipper(url);
+            testRipper(ripper);
+        }
+    }
+
+}