ImgScroll/src/main/java/com/rarchives/ripme/ripper/rippers/SankakuComplexRipper.java

package com.rarchives.ripme.ripper.rippers;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Http;

/**
 * Ripper for tag-search result pages on sankakucomplex.com subdomains.
 *
 * <p>The album identifier is the (URL-decoded) value of the {@code tags}
 * query parameter. For every thumbnail on a result page the corresponding
 * post page is fetched to locate the full sized image.
 */
public class SankakuComplexRipper extends AbstractHTMLRipper {
    /**
     * Matches tag-search URLs. Group 1 is the optional subdomain including
     * its trailing dot (e.g. {@code "idol."}); group 2 is the raw, still
     * URL-encoded value of the {@code tags} parameter.
     */
    private static final Pattern TAGS_URL_PATTERN = Pattern.compile(
            "^https?://([a-zA-Z0-9]+\\.)?sankakucomplex\\.com/.*tags=([^&]+).*$");

    /** Base URL that the relative post links found on a result page are appended to. */
    private static final String POST_BASE_URL = "https://chan.sankakucomplex.com";

    /** Marker of the first page that cannot be ripped without being logged in. */
    private static final String FIRST_RESTRICTED_PAGE = "page=26";

    /** Pause, in milliseconds, applied before each download is queued. */
    private static final int DOWNLOAD_DELAY_MS = 8000;

    /** Cached first page so that repeated calls do not re-fetch it. */
    private Document albumDoc = null;

    /** Cookies received with the first page; re-sent when fetching further result pages. */
    private final Map<String,String> cookies = new HashMap<String,String>();

    public SankakuComplexRipper(URL url) throws IOException {
        super(url);
    }

    @Override
    public String getHost() {
        return "sankakucomplex";
    }

    @Override
    public String getDomain() {
        return "sankakucomplex.com";
    }

    /**
     * Extracts the album identifier (the decoded {@code tags} value) from the URL.
     *
     * @param url the URL to inspect
     * @return the URL-decoded value of the {@code tags} query parameter
     * @throws MalformedURLException if the URL is not a sankakucomplex.com
     *         tag-search URL or the tag value cannot be decoded
     */
    @Override
    public String getGID(URL url) throws MalformedURLException {
        Matcher m = TAGS_URL_PATTERN.matcher(url.toExternalForm());
        if (m.matches()) {
            // Group 2 holds the tags; group 1 is only the optional subdomain
            // and is null when the URL has none.
            String rawTags = m.group(2);
            try {
                return URLDecoder.decode(rawTags, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new MalformedURLException("Cannot decode tag name '" + rawTags + "'");
            }
        }
        throw new MalformedURLException("Expected sankakucomplex.com URL format: " +
                        "idol.sankakucomplex.com?...&tags=something... - got " +
                        url + " instead");
    }

    /**
     * Fetches the first result page (once) and remembers the cookies that
     * came with the response.
     *
     * @return the parsed first page
     * @throws IOException if the page cannot be retrieved or parsed
     */
    @Override
    public Document getFirstPage() throws IOException {
        if (albumDoc == null) {
            Response resp = Http.url(url).response();
            cookies.putAll(resp.cookies());
            albumDoc = resp.parse();
        }
        return albumDoc;
    }

    /**
     * Collects the full sized image URLs for every thumbnail on a result page.
     *
     * <p>Each thumbnail links to a post page; that page is loaded and the
     * {@code src} of the image inside {@code div#post-content} is used.
     * Posts whose page cannot be loaded, or that contain no such image, are
     * logged and skipped.
     *
     * @param doc a tag-search result page
     * @return the image URLs found, in page order (possibly empty)
     */
    @Override
    public List<String> getURLsFromPage(Document doc) {
        List<String> imageURLs = new ArrayList<String>();
        for (Element thumbLink : doc.select("div.content > div > span.thumb > a")) {
            String postLink = thumbLink.attr("href");
            String postURL = POST_BASE_URL + postLink;
            try {
                // Get the page the full sized image is on
                Document subPage = Http.url(postURL).get();
                logger.info("Checking page " + postURL);
                // attr() yields "" when nothing matched; the src is
                // protocol-relative, hence the "https:" prefix below.
                String src = subPage.select("div[id=post-content] > a > img").attr("src");
                if (src.isEmpty()) {
                    logger.warn("No image found on page " + postURL);
                } else {
                    imageURLs.add("https:" + src);
                }
            } catch (IOException e) {
                logger.warn("Error while loading page " + postLink, e);
            }
        }
        return imageURLs;
    }

    /**
     * Queues a single image for download after a fixed pause.
     *
     * @param url   the image URL
     * @param index position of the image within the album, used for the file prefix
     */
    @Override
    public void downloadURL(URL url, int index) {
        // NOTE(review): the pause presumably throttles requests to the site - confirm.
        sleep(DOWNLOAD_DELAY_MS);
        addURLToDownload(url, getPrefix(index));
    }

    /**
     * Loads the result page following {@code doc}.
     *
     * @param doc the current result page
     * @return the next result page
     * @throws IOException if there is no further page that can be ripped
     *         (no pagination info, or the login-only page limit was reached),
     *         or if fetching the page fails
     */
    @Override
    public Document getNextPage(Document doc) throws IOException {
        // first() returns null when the page has no pagination element.
        Element pagination = doc.select("div.pagination").first();
        if (pagination != null && pagination.hasAttr("next-page-url")) {
            String nextPage = pagination.attr("abs:next-page-url");
            // Only logged in users can see past page 25
            // Trying to rip page 26 will throw a no images found error
            if (!nextPage.contains(FIRST_RESTRICTED_PAGE)) {
                logger.info("Getting next page: " + nextPage);
                return Http.url(nextPage).cookies(cookies).get();
            }
        }
        throw new IOException("No more pages");
    }
}
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`package com.rarchives.ripme.ripper.rippers;`

			`import java.io.IOException;`
			`import java.io.UnsupportedEncodingException;`
			`import java.net.MalformedURLException;`
			`import java.net.URL;`
			`import java.net.URLDecoder;`
			`import java.util.ArrayList;`
			`import java.util.HashMap;`
			`import java.util.List;`
			`import java.util.Map;`
			`import java.util.regex.Matcher;`
			`import java.util.regex.Pattern;`

			`import org.jsoup.Connection.Response;`
			`import org.jsoup.nodes.Document;`
			`import org.jsoup.nodes.Element;`

			`import com.rarchives.ripme.ripper.AbstractHTMLRipper;`
			`import com.rarchives.ripme.utils.Http;`

			`public class SankakuComplexRipper extends AbstractHTMLRipper {`
			`private Document albumDoc = null;`
			`private Map<String,String> cookies = new HashMap<String,String>();`

			`public SankakuComplexRipper(URL url) throws IOException {`
			`super(url);`
			`}`

			`@Override`
			`public String getHost() {`
			`return "sankakucomplex";`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public String getDomain() {`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`return "sankakucomplex.com";`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`

			`@Override`
			`public String getGID(URL url) throws MalformedURLException {`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`Pattern p = Pattern.compile("^https?://([a-zA-Z0-9]+\\.)?sankakucomplex\\.com/.tags=([^&]+).$");`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`Matcher m = p.matcher(url.toExternalForm());`
			`if (m.matches()) {`
			`try {`
			`return URLDecoder.decode(m.group(1), "UTF-8");`
			`} catch (UnsupportedEncodingException e) {`
			`throw new MalformedURLException("Cannot decode tag name '" + m.group(1) + "'");`
			`}`
			`}`
			`throw new MalformedURLException("Expected sankakucomplex.com URL format: " +`
			`"idol.sankakucomplex.com?...&tags=something... - got " +`
			`url + "instead");`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public Document getFirstPage() throws IOException {`
			`if (albumDoc == null) {`
			`Response resp = Http.url(url).response();`
			`cookies.putAll(resp.cookies());`
			`albumDoc = resp.parse();`
			`}`
			`return albumDoc;`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public List<String> getURLsFromPage(Document doc) {`
			`List<String> imageURLs = new ArrayList<String>();`
			`// Image URLs are basically thumbnail URLs with a different domain, a simple`
			`// path replacement, and a ?xxxxxx post ID at the end (obtainable from the href)`
Now downloads full sized images 2017-10-14 03:32:34 +02:00			`for (Element thumbSpan : doc.select("div.content > div > span.thumb > a")) {`
			`String postLink = thumbSpan.attr("href");`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`try {`
			`// Get the page the full sized image is on`
			`Document subPage = Http.url("https://chan.sankakucomplex.com" + postLink).get();`
			`logger.info("Checking page " + "https://chan.sankakucomplex.com" + postLink);`
			`imageURLs.add("https:" + subPage.select("div[id=post-content] > a > img").attr("src"));`
			`} catch (IOException e) {`
			`logger.warn("Error while loading page " + postLink, e);`
			`continue;`
			`}`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`
			`return imageURLs;`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public void downloadURL(URL url, int index) {`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`sleep(8000);`
Now downloads full sized images 2017-10-14 03:32:34 +02:00			`addURLToDownload(url, getPrefix(index));`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`

			`@Override`
			`public Document getNextPage(Document doc) throws IOException {`
			`Element pagination = doc.select("div.pagination").first();`
			`if (pagination.hasAttr("next-page-url")) {`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`String nextPage = pagination.attr("abs:next-page-url");`
			`// Only logged in users can see past page 25`
			`// Trying to rip page 26 will throw a no images found error`
			`if (nextPage.contains("page=26") == false) {`
			`logger.info("Getting next page: " + pagination.attr("abs:next-page-url"));`
			`return Http.url(pagination.attr("abs:next-page-url")).cookies(cookies).get();`
			`}`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`throw new IOException("No more pages");`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`
			`}`