Added Natalie.mu news album ripper.

Rips from both album and news pages.
2015-03-09 21:08:06 +01:00 · 2015-03-09 21:08:06 +01:00 · 82e6343b0e
commit 82e6343b0e
parent 6d040aa0d1
2 changed files with 199 additions and 0 deletions
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/NatalieMuRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/NatalieMuRipper.java
@ -0,0 +1,134 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
+import com.rarchives.ripme.utils.Http;
+
+/**
+ * Ripper for the photo galleries attached to natalie.mu news articles.
+ *
+ * <p>Two URL shapes are accepted, and both resolve to the same album id (the numeric news id):
+ * <ul>
+ *   <li>article pages: {@code http://natalie.mu/music/news/140411}</li>
+ *   <li>gallery pages:
+ *       {@code http://natalie.mu/music/gallery/show/news_id/140411/image_id/369655}</li>
+ * </ul>
+ */
+public class NatalieMuRipper extends AbstractHTMLRipper {
+
+    /** Matches natalie.mu itself and any of its subdomains (e.g. cdn2.natalie.mu). */
+    private static final Pattern HOST_PATTERN =
+            Pattern.compile("^(?:[a-z0-9-]+\\.)*natalie\\.mu$", Pattern.CASE_INSENSITIVE);
+
+    /** Captures the news id from a gallery URL; a trailing slash is not required. */
+    private static final Pattern GALLERY_ID_PATTERN = Pattern.compile("/news_id/([0-9]+)");
+
+    /** Captures the news id from an article URL. */
+    private static final Pattern NEWS_ID_PATTERN = Pattern.compile("/news/([0-9]+)");
+
+    /**
+     * Captures the thumbnail URL from an inline {@code background-image} declaration.
+     * The URL may be bare or quoted, and the capture stops at the closing parenthesis so
+     * that later declarations in the same style attribute are never swallowed.
+     */
+    private static final Pattern THUMBNAIL_PATTERN = Pattern.compile(
+            "background-image:\\s*url\\(\\s*['\"]?([^'\")]*list_thumb_inbox[^'\")]*)['\"]?\\s*\\)",
+            Pattern.CASE_INSENSITIVE);
+
+    /** Not read or written anywhere in this class; kept because it is part of the public API. */
+    public int news_id = 0;
+
+    public NatalieMuRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    /**
+     * Builds the host label from the site name and the first path segment of the URL,
+     * e.g. {@code http://cdn2.natalie.mu/music/news/140411} gives {@code natalie_music}.
+     */
+    @Override
+    public String getHost() {
+        String host = this.url.getHost();
+        // Drop the top-level domain (".mu").
+        host = host.substring(0, host.lastIndexOf('.'));
+        if (host.contains(".")) {
+            // Host has a subdomain (www, cdn2, ...): keep only the site name.
+            host = host.substring(host.lastIndexOf('.') + 1);
+        }
+        // Element 3 of "scheme://host/section/..." split on "/" is the first path segment.
+        String section = this.url.toExternalForm().split("/")[3];
+        return host + "_" + section;
+    }
+
+    /**
+     * Returns whether this ripper handles the given URL.
+     *
+     * <p>The host must be natalie.mu or one of its subdomains, and the URL must look like an
+     * article ({@code /news/}) or a gallery ({@code /news_id/}) page. Examples:
+     * <pre>
+     * http://cdn2.natalie.mu/music/gallery/show/news_id/xxxxxx/image_id/xxxxxx
+     * http://cdn2.natalie.mu/music/news/140411
+     * </pre>
+     *
+     * @param url the candidate URL
+     * @return {@code true} if the URL points at a natalie.mu article or gallery page
+     */
+    @Override
+    public boolean canRip(URL url) {
+        // Check the host rather than the whole URL so that unrelated sites which merely
+        // mention "natalie.mu" in their path or query string are not claimed.
+        String host = url.getHost();
+        if (host == null || !HOST_PATTERN.matcher(host).matches()) {
+            return false;
+        }
+        String u = url.toExternalForm();
+        return u.contains("/news_id/") || u.contains("/news/");
+    }
+
+    /**
+     * Extracts the numeric news id, which serves as the album id.
+     *
+     * @param url an article or gallery URL
+     * @return the digits following {@code /news_id/} (checked first) or {@code /news/}
+     * @throws MalformedURLException if neither form of id is present in the URL
+     */
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        String u = url.toExternalForm();
+
+        Matcher m = GALLERY_ID_PATTERN.matcher(u);
+        if (m.find()) {
+            return m.group(1);
+        }
+        m = NEWS_ID_PATTERN.matcher(u);
+        if (m.find()) {
+            return m.group(1);
+        }
+
+        throw new MalformedURLException(
+                "Expected natalie.mu URL formats: "
+                        + "http://natalie.mu/music/news/xxxxxx or http://natalie.mu/music/gallery/show/news_id/xxxxxx/image_id/yyyyyy"
+                        + " Got: " + u);
+    }
+
+    @Override
+    public String getDomain() {
+        return this.url.getHost();
+    }
+
+    @Override
+    public Document getFirstPage() throws IOException {
+        return Http.url(this.url).get();
+    }
+
+    /**
+     * Collects the full-size image URLs for every gallery thumbnail on the page.
+     *
+     * <p>Thumbnails are {@code span} elements inside {@code .NA_articleGallery} whose inline
+     * style carries a {@code background-image} pointing at a {@code list_thumb_inbox}
+     * rendition; the full-size image lives at the same path under {@code xlarge}.
+     *
+     * @param page the article or gallery page
+     * @return the distinct full-size image URLs, in page order
+     */
+    @Override
+    public List<String> getURLsFromPage(Document page) {
+        List<String> imageURLs = new ArrayList<String>();
+        for (Element span : page.select(".NA_articleGallery span")) {
+            if (!span.hasAttr("style")) {
+                continue;
+            }
+            Matcher m = THUMBNAIL_PATTERN.matcher(span.attr("style"));
+            if (m.find()) {
+                String imgUrl = toFullSizeUrl(m.group(1));
+                // Don't download the same URL twice
+                if (imageURLs.contains(imgUrl)) {
+                    logger.debug("Already attempted: " + imgUrl);
+                    continue;
+                }
+                imageURLs.add(imgUrl);
+                if (isThisATest()) {
+                    // One image is enough when running under the test harness.
+                    break;
+                }
+            }
+
+            if (isStopped()) {
+                break;
+            }
+        }
+        return imageURLs;
+    }
+
+    /** Makes a thumbnail URL absolute and swaps the thumbnail rendition for the full-size one. */
+    private String toFullSizeUrl(String thumbUrl) {
+        String imgUrl = thumbUrl.trim();
+        if (imgUrl.startsWith("//")) {
+            // Protocol-relative URL.
+            imgUrl = "http:" + imgUrl;
+        } else if (imgUrl.startsWith("/")) {
+            // Host-relative URL.
+            imgUrl = "http://" + this.url.getHost() + imgUrl;
+        }
+        return imgUrl.replace("list_thumb_inbox", "xlarge");
+    }
+
+    /** Queues the image for download, passing the page URL as the referrer argument. */
+    @Override
+    public void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index), "", this.url.toString(), null);
+    }
+}
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NatalieMuRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NatalieMuRipperTest.java
@ -0,0 +1,65 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.NatalieMuRipper;
+
+/**
+ * Tests for {@link NatalieMuRipper}: URL acceptance, working-directory setup and a live rip.
+ */
+public class NatalieMuRipperTest extends RippersTest {
+
+    /** Article and gallery URLs, on both the bare domain and a subdomain, that must be accepted. */
+    private static List<URL> supportedURLs() throws IOException {
+        List<URL> urls = new ArrayList<URL>();
+        urls.add(new URL("http://natalie.mu/music/news/140367"));
+        urls.add(new URL("http://cdn2.natalie.mu/music/news/140411"));
+        urls.add(new URL("http://cdn2.natalie.mu/music/gallery/show/news_id/140411/image_id/369655"));
+        urls.add(new URL("http://natalie.mu/music/gallery/show/news_id/139146/image_id/365218"));
+        return urls;
+    }
+
+    /** Constructing the ripper with an unsupported URL must fail. */
+    public void testNatalieMuURLFailures() throws IOException {
+        // NOTE(review): no unsupported URLs are listed yet, so this loop is currently a no-op.
+        List<URL> failURLs = new ArrayList<URL>();
+        for (URL url : failURLs) {
+            try {
+                new NatalieMuRipper(url);
+                fail("Instantiated ripper for URL that should not work: " + url);
+            } catch (Exception e) {
+                // Expected
+                continue;
+            }
+        }
+    }
+
+    /** Every supported URL must be accepted and yield a working directory after setup. */
+    public void testNatalieMuURLPasses() throws IOException {
+        for (URL url : supportedURLs()) {
+            NatalieMuRipper ripper = new NatalieMuRipper(url);
+            ripper.setup();
+            // assertTrue rather than the assert keyword, which is skipped unless the JVM runs with -ea.
+            assertTrue("Ripper refused supported URL " + url, ripper.canRip(url));
+            assertNotNull("Ripper for " + url + " did not have a valid working directory.",
+                          ripper.getWorkingDir());
+            deleteDir(ripper.getWorkingDir());
+        }
+    }
+
+    /** Runs the shared rip check against each supported URL. */
+    public void testNatalieMuRipper() throws IOException {
+        for (URL url : supportedURLs()) {
+            NatalieMuRipper ripper = new NatalieMuRipper(url);
+            testRipper(ripper);
+        }
+    }
+
+}