ImgScroll/src/main/java/com/rarchives/ripme/ripper/rippers/SankakuComplexRipper.java

package com.rarchives.ripme.ripper.rippers;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Http;

/**
 * Ripper for tag-search result pages on sankakucomplex.com and its subdomains
 * (for example {@code idol.sankakucomplex.com/?tags=something}).
 *
 * <p>For every thumbnail on a result page the individual post page is fetched
 * and the link to the full-size image is extracted from it.
 */
public class SankakuComplexRipper extends AbstractHTMLRipper {
    /**
     * Matches tag-search URLs. Group 1 is the optional subdomain including its
     * trailing dot (e.g. {@code "idol."}); it is {@code null} when the URL has
     * no subdomain. Group 2 is the raw (still URL-encoded) value of the
     * {@code tags} query parameter.
     */
    private static final Pattern TAG_URL_PATTERN =
            Pattern.compile("^https?://([a-zA-Z0-9]+\\.)?sankakucomplex\\.com/.*tags=([^&]+).*$");

    /** Pause, in milliseconds, taken before each download is queued. */
    private static final int DOWNLOAD_DELAY_MS = 8000;

    /** Cached first page so the album is only requested once. */
    private Document albumDoc = null;
    /** Cookies captured from the first response and re-sent when paging. */
    private Map<String,String> cookies = new HashMap<>();

    public SankakuComplexRipper(URL url) throws IOException {
        super(url);
    }

    @Override
    public String getHost() {
        return "sankakucomplex";
    }

    @Override
    public String getDomain() {
        return "sankakucomplex.com";
    }

    /**
     * Builds the album identifier from the subdomain and the searched tags.
     *
     * @param url the tag-search URL
     * @return {@code "<subdomain>_<tags>"} URL-decoded (the subdomain keeps its
     *         trailing dot, e.g. {@code "idol._some_tag"}), or just the decoded
     *         tags when the URL has no subdomain
     * @throws MalformedURLException if the URL is not a tag-search URL or the
     *         tags cannot be URL-decoded
     */
    @Override
    public String getGID(URL url) throws MalformedURLException {
        Matcher m = TAG_URL_PATTERN.matcher(url.toExternalForm());
        if (m.matches()) {
            String subDomain = m.group(1);
            String tags = m.group(2);
            // The subdomain group is optional; without this check the ID would
            // contain the literal text "null".
            String rawId = (subDomain == null) ? tags : subDomain + "_" + tags;
            try {
                return URLDecoder.decode(rawId, "UTF-8");
            } catch (UnsupportedEncodingException | IllegalArgumentException e) {
                // IllegalArgumentException is raised for malformed %-escapes.
                throw new MalformedURLException("Cannot decode tag name '" + tags + "'");
            }
        }
        throw new MalformedURLException("Expected sankakucomplex.com URL format: " +
                        "idol.sankakucomplex.com?...&tags=something... - got " +
                        url + " instead");
    }

    /**
     * Extracts the subdomain prefix of a tag-search URL.
     *
     * @param url the tag-search URL
     * @return the subdomain including its trailing dot (e.g. {@code "idol."}),
     *         an empty string when the URL has no subdomain, or {@code null}
     *         when the URL is not a tag-search URL at all
     */
    public String getSubDomain(URL url){
        Matcher m = TAG_URL_PATTERN.matcher(url.toExternalForm());
        if (!m.matches()) {
            return null;
        }
        // The group only ever contains [a-zA-Z0-9]+ followed by a dot, so
        // URL-decoding it would be a no-op; returning it directly also avoids
        // passing null to URLDecoder when the subdomain is absent.
        String subDomain = m.group(1);
        return (subDomain == null) ? "" : subDomain;
    }

    /**
     * Fetches the album page on first use and remembers its cookies;
     * later calls return the cached document.
     */
    @Override
    public Document getFirstPage() throws IOException {
        if (albumDoc == null) {
            Response resp = Http.url(url).response();
            cookies.putAll(resp.cookies());
            albumDoc = resp.parse();
        }
        return albumDoc;
    }

    /**
     * Collects the full-size image URLs for every thumbnail on a result page.
     *
     * <p>Each thumbnail links to a post page; that page is fetched and the
     * {@code a#highres} link inside {@code div#stats} is taken as the image
     * URL. Posts whose page cannot be loaded, or that expose no such link, are
     * logged and skipped.
     *
     * @param doc a tag-search result page
     * @return the image URLs found, possibly empty
     */
    @Override
    public List<String> getURLsFromPage(Document doc) {
        List<String> imageURLs = new ArrayList<>();
        // The site root is the same for every post, so build it once.
        String subDomain = getSubDomain(url);
        String siteURL = "https://" + (subDomain == null ? "" : subDomain) + "sankakucomplex.com";
        for (Element thumbLink : doc.select("div.content > div > span.thumb > a")) {
            String postLink = thumbLink.attr("href");
            try {
                // Get the page the full sized image is on
                Document subPage = Http.url(siteURL + postLink).get();
                logger.info("Checking page " + siteURL + postLink);
                String highresHref = subPage.select("div[id=stats] > ul > li > a[id=highres]").attr("href");
                if (highresHref.isEmpty()) {
                    // attr() yields "" when nothing matched; adding a bare
                    // "https:" would only produce a broken download.
                    logger.warn("No full sized image link found on " + siteURL + postLink);
                    continue;
                }
                // The link is prefixed with the scheme unless it already has one.
                // NOTE(review): presumably the site emits protocol-relative
                // ("//host/path") links here - confirm.
                boolean hasScheme = highresHref.startsWith("http://") || highresHref.startsWith("https://");
                imageURLs.add(hasScheme ? highresHref : "https:" + highresHref);
            } catch (IOException e) {
                logger.warn("Error while loading page " + postLink, e);
            }
        }
        return imageURLs;
    }

    /**
     * Queues one image for download after a fixed pause.
     * NOTE(review): the pause is presumably there to stay under the site's
     * rate limit - confirm.
     */
    @Override
    public void downloadURL(URL url, int index) {
        sleep(DOWNLOAD_DELAY_MS);
        addURLToDownload(url, getPrefix(index));
    }

    /**
     * Loads the next result page, if there is one.
     *
     * @param doc the current result page
     * @return the next result page
     * @throws IOException if the page has no pagination element, no next-page
     *         link, the next page would be page 26, or the request fails
     */
    @Override
    public Document getNextPage(Document doc) throws IOException {
        // first() returns null when the page has no pagination element at all.
        Element pagination = doc.select("div.pagination").first();
        if (pagination != null && pagination.hasAttr("next-page-url")) {
            String nextPage = pagination.attr("abs:next-page-url");
            // Only logged in users can see past page 25
            // Trying to rip page 26 will throw a no images found error
            if (!nextPage.contains("page=26")) {
                logger.info("Getting next page: " + nextPage);
                return Http.url(nextPage).cookies(cookies).get();
            }
        }
        throw new IOException("No more pages");
    }
}
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`package com.rarchives.ripme.ripper.rippers;`

			`import java.io.IOException;`
			`import java.io.UnsupportedEncodingException;`
			`import java.net.MalformedURLException;`
			`import java.net.URL;`
			`import java.net.URLDecoder;`
			`import java.util.ArrayList;`
			`import java.util.HashMap;`
			`import java.util.List;`
			`import java.util.Map;`
			`import java.util.regex.Matcher;`
			`import java.util.regex.Pattern;`

			`import org.jsoup.Connection.Response;`
			`import org.jsoup.nodes.Document;`
			`import org.jsoup.nodes.Element;`

			`import com.rarchives.ripme.ripper.AbstractHTMLRipper;`
			`import com.rarchives.ripme.utils.Http;`

			`public class SankakuComplexRipper extends AbstractHTMLRipper {`
			`private Document albumDoc = null;`
Update to Java 8 * Changed the Maven target to 1.8 * Performed a preliminary cleanup using IntelliJ's Code Analysis (Only Java 7/8 updates and a few other entries in the Error and Warnings categories) * Updated the readme to change the required Java version 2017-10-24 16:33:28 +02:00			`private Map<String,String> cookies = new HashMap<>();`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00
			`public SankakuComplexRipper(URL url) throws IOException {`
			`super(url);`
			`}`

			`@Override`
			`public String getHost() {`
			`return "sankakucomplex";`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public String getDomain() {`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`return "sankakucomplex.com";`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`

			`@Override`
			`public String getGID(URL url) throws MalformedURLException {`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`Pattern p = Pattern.compile("^https?://([a-zA-Z0-9]+\\.)?sankakucomplex\\.com/.*tags=([^&]+).*$");`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`Matcher m = p.matcher(url.toExternalForm());`
			`if (m.matches()) {`
			`try {`
SankakuComplexRipper can now download from different subdomains 2018-05-19 16:42:50 +02:00			`return URLDecoder.decode(m.group(1) + "_" + m.group(2), "UTF-8");`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`} catch (UnsupportedEncodingException e) {`
			`throw new MalformedURLException("Cannot decode tag name '" + m.group(1) + "'");`
			`}`
			`}`
			`throw new MalformedURLException("Expected sankakucomplex.com URL format: " +`
			`"idol.sankakucomplex.com?...&tags=something... - got " +`
			`url + "instead");`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
SankakuComplexRipper can now download from different subdomains 2018-05-19 16:42:50 +02:00			`public String getSubDomain(URL url){`
			`Pattern p = Pattern.compile("^https?://([a-zA-Z0-9]+\\.)?sankakucomplex\\.com/.*tags=([^&]+).*$");`
			`Matcher m = p.matcher(url.toExternalForm());`
			`if (m.matches()) {`
			`try {`
			`return URLDecoder.decode(m.group(1), "UTF-8");`
			`} catch (UnsupportedEncodingException e) {`
			`return null;`
			`}`
			`}`
			`return null;`

			`}`

Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public Document getFirstPage() throws IOException {`
			`if (albumDoc == null) {`
			`Response resp = Http.url(url).response();`
			`cookies.putAll(resp.cookies());`
			`albumDoc = resp.parse();`
			`}`
			`return albumDoc;`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public List<String> getURLsFromPage(Document doc) {`
Update to Java 8 * Changed the Maven target to 1.8 * Performed a preliminary cleanup using IntelliJ's Code Analysis (Only Java 7/8 updates and a few other entries in the Error and Warnings categories) * Updated the readme to change the required Java version 2017-10-24 16:33:28 +02:00			`List<String> imageURLs = new ArrayList<>();`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`// Image URLs are basically thumbnail URLs with a different domain, a simple`
			`// path replacement, and a ?xxxxxx post ID at the end (obtainable from the href)`
Now downloads full sized images 2017-10-14 03:32:34 +02:00			`for (Element thumbSpan : doc.select("div.content > div > span.thumb > a")) {`
			`String postLink = thumbSpan.attr("href");`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`try {`
SankakuComplexRipper can now download from different subdomains 2018-05-19 16:42:50 +02:00			`String subDomain = getSubDomain(url);`
			`String siteURL = "https://" + subDomain + "sankakucomplex.com";`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`// Get the page the full sized image is on`
SankakuComplexRipper can now download from different subdomains 2018-05-19 16:42:50 +02:00			`Document subPage = Http.url(siteURL + postLink).get();`
			`logger.info("Checking page " + siteURL + postLink);`
SankakuComplexRipper now downloads full sized images (#328) 2017-12-11 03:59:08 +01:00			`imageURLs.add("https:" + subPage.select("div[id=stats] > ul > li > a[id=highres]").attr("href"));`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`} catch (IOException e) {`
			`logger.warn("Error while loading page " + postLink, e);`
			`}`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`
			`return imageURLs;`
			`}`
Fix style 2017-06-19 19:32:57 +02:00
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`@Override`
			`public void downloadURL(URL url, int index) {`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`sleep(8000);`
Now downloads full sized images 2017-10-14 03:32:34 +02:00			`addURLToDownload(url, getPrefix(index));`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`

			`@Override`
			`public Document getNextPage(Document doc) throws IOException {`
			`Element pagination = doc.select("div.pagination").first();`
			`if (pagination.hasAttr("next-page-url")) {`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`String nextPage = pagination.attr("abs:next-page-url");`
			`// Only logged in users can see past page 25`
			`// Trying to rip page 26 will throw a no images found error`
Update to Java 8 * Changed the Maven target to 1.8 * Performed a preliminary cleanup using IntelliJ's Code Analysis (Only Java 7/8 updates and a few other entries in the Error and Warnings categories) * Updated the readme to change the required Java version 2017-10-24 16:33:28 +02:00			`if (!nextPage.contains("page=26")) {`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`logger.info("Getting next page: " + pagination.attr("abs:next-page-url"));`
			`return Http.url(pagination.attr("abs:next-page-url")).cookies(cookies).get();`
			`}`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`
Fixed SankakuComplex Ripper 2017-10-15 01:21:31 +02:00			`throw new IOException("No more pages");`
Add scraper for idol.sankakucomplex.com tag albums. 2014-10-15 07:27:51 +02:00			`}`
			`}`