ImgScroll/src/main/java/com/rarchives/ripme/ripper/rippers/SankakuComplexRipper.java

package com.rarchives.ripme.ripper.rippers;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Http;

/**
 * Ripper for tag listings on sankakucomplex.com (e.g. the {@code idol.} and
 * {@code chan.} subdomains).
 *
 * The listing URL must carry a {@code tags=} query parameter; its decoded value
 * is used as the album ID. Each thumbnail on a listing page links to a post
 * page, which is fetched to find the image URL to download.
 */
public class SankakuComplexRipper extends AbstractHTMLRipper {
    /**
     * Matches sankakucomplex.com URLs that contain a {@code tags=} parameter.
     * Group 1 is the optional subdomain including its trailing dot (may be
     * {@code null}); group 2 is the raw, still URL-encoded tag value.
     */
    private static final Pattern TAG_URL_PATTERN = Pattern.compile(
            "^https?://([a-zA-Z0-9]+\\.)?sankakucomplex\\.com/.*tags=([^&]+).*$");

    /** First listing page, cached so it is only requested once. */
    private Document albumDoc = null;
    /** Cookies returned with the first page; re-sent when requesting further listing pages. */
    private final Map<String,String> cookies = new HashMap<String,String>();

    public SankakuComplexRipper(URL url) throws IOException {
        super(url);
    }

    @Override
    public String getHost() {
        return "sankakucomplex";
    }

    @Override
    public String getDomain() {
        return "sankakucomplex.com";
    }

    /**
     * Extracts the album ID, which is the URL-decoded value of the {@code tags=} parameter.
     *
     * @param url the listing URL to inspect
     * @return the decoded tag value
     * @throws MalformedURLException if the URL is not a sankakucomplex.com URL with a
     *         {@code tags=} parameter, or the tag value cannot be decoded
     */
    @Override
    public String getGID(URL url) throws MalformedURLException {
        Matcher m = TAG_URL_PATTERN.matcher(url.toExternalForm());
        if (m.matches()) {
            // Group 2 is the tag value. Group 1 is the optional subdomain and is
            // null when the URL has none, so it must not be used as the ID.
            String rawTags = m.group(2);
            try {
                return URLDecoder.decode(rawTags, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new MalformedURLException("Cannot decode tag name '" + rawTags + "'");
            }
        }
        throw new MalformedURLException("Expected sankakucomplex.com URL format: " +
                        "idol.sankakucomplex.com?...&tags=something... - got " +
                        url + " instead");
    }

    /**
     * Fetches the first listing page, remembering the response cookies. The
     * parsed document is cached, so repeated calls do not hit the network again.
     *
     * @return the parsed first listing page
     * @throws IOException if the request or parsing fails
     */
    @Override
    public Document getFirstPage() throws IOException {
        if (albumDoc == null) {
            Response resp = Http.url(url).response();
            cookies.putAll(resp.cookies());
            albumDoc = resp.parse();
        }
        return albumDoc;
    }

    /**
     * Collects image URLs for every thumbnail on a listing page. Each thumbnail
     * links to a post page; that page is fetched and the {@code src} of its
     * sample image is taken as the download URL.
     *
     * Posts whose page cannot be loaded, or whose page has no matching image,
     * are logged and skipped.
     *
     * @param doc a listing page
     * @return the image URLs found, in page order (possibly empty)
     */
    @Override
    public List<String> getURLsFromPage(Document doc) {
        List<String> imageURLs = new ArrayList<String>();
        for (Element thumbLink : doc.select("div.content > div > span.thumb > a")) {
            String postLink = thumbLink.attr("href");
            try {
                // Get the page the full sized image is on
                Document subPage = Http.url("https://chan.sankakucomplex.com" + postLink).get();
                String src = subPage.select("div[id=post-content] > a.sample > img").attr("src");
                if (src.isEmpty()) {
                    // attr() yields "" when nothing matched; don't queue a bare "https:".
                    logger.warn("No image found on page " + postLink);
                    continue;
                }
                // The src is presumably protocol-relative ("//host/path"), hence the
                // scheme prefix; leave it alone if it already carries a scheme.
                imageURLs.add(src.startsWith("http") ? src : "https:" + src);
            } catch (IOException e) {
                logger.warn("Error while loading page " + postLink, e);
            }
        }
        return imageURLs;
    }

    /**
     * Queues a single image for download after a fixed pause.
     *
     * @param url   the image URL
     * @param index position of the image in the album, used for the filename prefix
     */
    @Override
    public void downloadURL(URL url, int index) {
        // Pause before every download; presumably to avoid being rate limited by the site.
        sleep(10000);
        addURLToDownload(url, getPrefix(index));
    }

    /**
     * Loads the listing page that follows {@code doc}, using the cookies from
     * the first page.
     *
     * @param doc the current listing page
     * @return the next listing page, or {@code null} when the page has no
     *         pagination element or that element has no {@code next-page-url} attribute
     * @throws IOException if the next page cannot be fetched
     */
    @Override
    public Document getNextPage(Document doc) throws IOException {
        Element pagination = doc.select("div.pagination").first();
        // first() returns null when the selector matched nothing.
        if (pagination == null || !pagination.hasAttr("next-page-url")) {
            return null;
        }
        return Http.url(pagination.attr("abs:next-page-url")).cookies(cookies).get();
    }
}
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`package com.rarchives.ripme.ripper.rippers;`

			`import java.io.IOException;`
			`import java.io.UnsupportedEncodingException;`
			`import java.net.MalformedURLException;`
			`import java.net.URL;`
			`import java.net.URLDecoder;`
			`import java.util.ArrayList;`
			`import java.util.HashMap;`
			`import java.util.List;`
			`import java.util.Map;`
			`import java.util.regex.Matcher;`
			`import java.util.regex.Pattern;`

			`import org.jsoup.Connection.Response;`
			`import org.jsoup.nodes.Document;`
			`import org.jsoup.nodes.Element;`

			`import com.rarchives.ripme.ripper.AbstractHTMLRipper;`
			`import com.rarchives.ripme.utils.Http;`

			`public class SankakuComplexRipper extends AbstractHTMLRipper {`
			`private Document albumDoc = null;`
			`private Map<String,String> cookies = new HashMap<String,String>();`

			`public SankakuComplexRipper(URL url) throws IOException {`
			`super(url);`
			`}`

			`@Override`
			`public String getHost() {`
			`return "sankakucomplex";`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public String getDomain() {`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`return "sankakucomplex.com";`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`

			`@Override`
			`public String getGID(URL url) throws MalformedURLException {`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`Pattern p = Pattern.compile("^https?://([a-zA-Z0-9]+\\.)?sankakucomplex\\.com/.*tags=([^&]+).*$");`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`Matcher m = p.matcher(url.toExternalForm());`
			`if (m.matches()) {`
			`try {`
			`return URLDecoder.decode(m.group(1), "UTF-8");`
			`} catch (UnsupportedEncodingException e) {`
			`throw new MalformedURLException("Cannot decode tag name '" + m.group(1) + "'");`
			`}`
			`}`
			`throw new MalformedURLException("Expected sankakucomplex.com URL format: " +`
			`"idol.sankakucomplex.com?...&tags=something... - got " +`
			`url + "instead");`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public Document getFirstPage() throws IOException {`
			`if (albumDoc == null) {`
			`Response resp = Http.url(url).response();`
			`cookies.putAll(resp.cookies());`
			`albumDoc = resp.parse();`
			`}`
			`return albumDoc;`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public List<String> getURLsFromPage(Document doc) {`
			`List<String> imageURLs = new ArrayList<String>();`
			`// Image URLs are basically thumbnail URLs with a different domain, a simple`
			`// path replacement, and a ?xxxxxx post ID at the end (obtainable from the href)`
Now downloads full sized images 2017-10-14 03:32:34 +02:00			`for (Element thumbSpan : doc.select("div.content > div > span.thumb > a")) {`

			`String postLink = thumbSpan.attr("href");`
			`try {`
			`// Get the page the full sized image is on`
			`Document subPage = Http.url("https://chan.sankakucomplex.com" + postLink).get();`
			`imageURLs.add("https:" + subPage.select("div[id=post-content] > a.sample > img").attr("src"));`
			`} catch (IOException e) {`
			`logger.warn("Error while loading page " + postLink, e);`
			`continue;`
			`}`

Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`
			`return imageURLs;`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public void downloadURL(URL url, int index) {`
			`// Mock up the URL of the post page based on the post ID at the end of the URL.`
Now downloads full sized images 2017-10-14 03:32:34 +02:00			`sleep(10000);`
			`addURLToDownload(url, getPrefix(index));`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`

			`@Override`
			`public Document getNextPage(Document doc) throws IOException {`
			`Element pagination = doc.select("div.pagination").first();`
			`if (pagination.hasAttr("next-page-url")) {`
			`return Http.url(pagination.attr("abs:next-page-url")).cookies(cookies).get();`
Fix style 2017-06-19 19:32:57 +02:00			`} else {`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`return null;`
			`}`
			`}`
			`}`