package com.rarchives.ripme.ripper.rippers;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Http;

public class SankakuComplexRipper extends AbstractHTMLRipper {
    private Document albumDoc = null;
    private Map<String, String> cookies = new HashMap<>();

    public SankakuComplexRipper(URL url) throws IOException {
        super(url);
    }

    @Override
    public String getHost() {
        return "sankakucomplex";
    }

    @Override
    public String getDomain() {
        return "sankakucomplex.com";
    }

    @Override
    public String getGID(URL url) throws MalformedURLException {
        Pattern p = Pattern.compile("^https?://([a-zA-Z0-9]+\\.)?sankakucomplex\\.com/.*tags=([^&]+).*$");
        Matcher m = p.matcher(url.toExternalForm());
        if (m.matches()) {
            try {
                return URLDecoder.decode(m.group(2), "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new MalformedURLException("Cannot decode tag name '" + m.group(2) + "'");
            }
        }
        throw new MalformedURLException("Expected sankakucomplex.com URL format: "
                + "idol.sankakucomplex.com?...&tags=something... - got " + url + " instead");
    }

    @Override
    public Document getFirstPage() throws IOException {
        if (albumDoc == null) {
            // Keep the session cookies from the first request so later page fetches stay authenticated
            Response resp = Http.url(url).response();
            cookies.putAll(resp.cookies());
            albumDoc = resp.parse();
        }
        return albumDoc;
    }

    @Override
    public List<String> getURLsFromPage(Document doc) {
        List<String> imageURLs = new ArrayList<>();
        // Image URLs are basically thumbnail URLs with a different domain, a simple
        // path replacement, and a ?xxxxxx post ID at the end (obtainable from the href)
        for (Element thumbSpan : doc.select("div.content > div > span.thumb > a")) {
            String postLink = thumbSpan.attr("href");
            try {
                // Load the post page that hosts the full-sized image
                Document subPage = Http.url("https://chan.sankakucomplex.com" + postLink).get();
                logger.info("Checking page " + "https://chan.sankakucomplex.com" + postLink);
                imageURLs.add("https:" + subPage.select("div[id=post-content] > a > img").attr("src"));
            } catch (IOException e) {
                logger.warn("Error while loading page " + postLink, e);
            }
        }
        return imageURLs;
    }

    @Override
    public void downloadURL(URL url, int index) {
        // Wait between downloads to avoid hammering the server
        sleep(8000);
        addURLToDownload(url, getPrefix(index));
    }

    @Override
    public Document getNextPage(Document doc) throws IOException {
        Element pagination = doc.select("div.pagination").first();
        if (pagination != null && pagination.hasAttr("next-page-url")) {
            String nextPage = pagination.attr("abs:next-page-url");
            // Only logged-in users can see past page 25;
            // trying to rip page 26 would throw a "no images found" error
            if (!nextPage.contains("page=26")) {
                logger.info("Getting next page: " + nextPage);
                return Http.url(nextPage).cookies(cookies).get();
            }
        }
        throw new IOException("No more pages");
    }
}