From 79ffee23714155cfa58fa373eaa28172459a5dd7 Mon Sep 17 00:00:00 2001
From: SDT
Date: Wed, 15 Oct 2014 16:27:51 +1100
Subject: [PATCH] Add scraper for idol.sankakucomplex.com tag albums.

---
 .../ripper/rippers/SankakuComplexRipper.java | 170 +++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/SankakuComplexRipper.java

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/SankakuComplexRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/SankakuComplexRipper.java
new file mode 100644
index 00000000..4a425775
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/SankakuComplexRipper.java
@@ -0,0 +1,170 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.Connection.Response;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.utils.Http;
+
+/**
+ * Ripper for tag-search result pages ("tag albums") on idol.sankakucomplex.com.
+ *
+ * Full-size image URLs are derived from the thumbnails on each result page, and
+ * the cookies returned with the first page are passed along on later requests.
+ */
+public class SankakuComplexRipper extends AbstractHTMLRipper {
+    /** Matches an album URL; group 1 is the raw (still URL-encoded) "tags" value. */
+    private static final Pattern TAGS_URL_PATTERN =
+            Pattern.compile("^https?://idol\\.sankakucomplex\\.com/.*tags=([^&]+).*$");
+
+    /** First page of the album, fetched lazily by {@link #getFirstPage()}. */
+    private Document albumDoc = null;
+
+    /** Cookies captured from the first response and passed along on later requests. */
+    private final Map<String, String> cookies = new HashMap<String, String>();
+
+    /**
+     * Creates a ripper for the given album URL.
+     *
+     * @param url the tag-search URL to rip
+     * @throws IOException propagated from the superclass constructor
+     */
+    public SankakuComplexRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    /** @return the short host name, {@code "sankakucomplex"} */
+    @Override
+    public String getHost() {
+        return "sankakucomplex";
+    }
+
+    /** @return the domain handled by this ripper, {@code "idol.sankakucomplex.com"} */
+    @Override
+    public String getDomain() {
+        return "idol.sankakucomplex.com";
+    }
+
+    /**
+     * Extracts the album ID: the URL-decoded value of the "tags" query parameter.
+     *
+     * @param url the album URL
+     * @return the decoded tag string
+     * @throws MalformedURLException if the URL does not match the expected
+     *         idol.sankakucomplex.com "tags=" format, or the tag value cannot be decoded
+     */
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        Matcher m = TAGS_URL_PATTERN.matcher(url.toExternalForm());
+        if (!m.matches()) {
+            throw new MalformedURLException("Expected sankakucomplex.com URL format: "
+                    + "idol.sankakucomplex.com?...&tags=something... - got "
+                    + url + " instead");
+        }
+        String rawTags = m.group(1);
+        try {
+            return URLDecoder.decode(rawTags, "UTF-8");
+        } catch (UnsupportedEncodingException e) {
+            throw cannotDecode(rawTags, e);
+        } catch (IllegalArgumentException e) {
+            // URLDecoder reports malformed percent-escapes (e.g. "%zz") this way.
+            throw cannotDecode(rawTags, e);
+        }
+    }
+
+    /** Builds the "cannot decode" exception with the underlying cause attached. */
+    private static MalformedURLException cannotDecode(String rawTags, Exception cause) {
+        MalformedURLException ex =
+                new MalformedURLException("Cannot decode tag name '" + rawTags + "'");
+        ex.initCause(cause);
+        return ex;
+    }
+
+    /**
+     * Fetches the first album page on first use and caches it; the cookies of that
+     * response are kept for later requests.
+     *
+     * @return the parsed first page
+     * @throws IOException propagated from fetching or parsing the page
+     */
+    @Override
+    public Document getFirstPage() throws IOException {
+        if (albumDoc == null) {
+            Response resp = Http.url(url).response();
+            cookies.putAll(resp.cookies());
+            albumDoc = resp.parse();
+        }
+        return albumDoc;
+    }
+
+    /**
+     * Builds the full-size image URL for every thumbnail on a result page.
+     *
+     * @param doc a result page
+     * @return one URL per usable thumbnail, each ending in "?" plus the post ID
+     */
+    @Override
+    public List<String> getURLsFromPage(Document doc) {
+        List<String> imageURLs = new ArrayList<String>();
+        // Image URLs are basically thumbnail URLs with a different domain, a simple
+        // path replacement, and a ?xxxxxx post ID at the end (taken from the span's id).
+        for (Element thumbSpan : doc.select("div.content > div > span.thumb")) {
+            Element thumb = thumbSpan.getElementsByTag("img").first();
+            String thumbSrc = (thumb == null) ? "" : thumb.attr("abs:src");
+            if (thumbSrc.isEmpty()) {
+                // No usable thumbnail in this span, so there is nothing to derive from.
+                continue;
+            }
+            // The post ID is the span id with every "p" removed; a literal replace suffices.
+            String postId = thumbSpan.attr("id").replace("p", "");
+            String image = thumbSrc.replace("i.sankakucomplex.com/data/preview",
+                    "is.sankakucomplex.com/data") + "?" + postId;
+            imageURLs.add(image);
+        }
+        return imageURLs;
+    }
+
+    /**
+     * Passes one image to {@code addURLToDownload} with a mocked-up post-page referer.
+     *
+     * @param url image URL ending in "?" plus the post ID
+     * @param index unused by this implementation
+     */
+    @Override
+    public void downloadURL(URL url, int index) {
+        // Mock up the URL of the post page based on the post ID at the end of the URL.
+        String postId = url.toExternalForm().replaceAll(".*\\?", "");
+        String refererURL = "https://idol.sankakucomplex.com/post/show/" + postId;
+        addURLToDownload(url, postId + "_", "", refererURL, cookies);
+    }
+
+    /**
+     * Loads the page named by the pagination element's "next-page-url" attribute.
+     *
+     * @param doc the current result page
+     * @return the next page, or {@code null} when the page has no pagination element
+     *         or that element carries no "next-page-url" attribute
+     * @throws IOException propagated from fetching the next page
+     */
+    @Override
+    public Document getNextPage(Document doc) throws IOException {
+        Element pagination = doc.select("div.pagination").first();
+        if (pagination == null || !pagination.hasAttr("next-page-url")) {
+            return null;
+        }
+        return Http.url(pagination.attr("abs:next-page-url")).cookies(cookies).get();
+    }
+}