ImgScroll/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java

package com.rarchives.ripme.ripper;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

import org.jsoup.nodes.Document;

import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
import com.rarchives.ripme.utils.Utils;

/**
 * Simplified ripper, designed for ripping from sites by parsing HTML.
 *
 * Subclasses supply the page-fetching and URL-extraction hooks; {@link #rip()}
 * drives the loop: fetch a page, queue every image found on it, optionally
 * save text descriptions, then move on to the next page until there is none.
 */
public abstract class AbstractHTMLRipper extends AlbumRipper {

    /** Characters at which a description file name derived from a URL is cut off. */
    private static final char[] FILENAME_TERMINATORS = { '?', '#', '&', ':' };

    public AbstractHTMLRipper(URL url) throws IOException {
        super(url);
    }

    /** @return the domain suffix this ripper handles; matched against the URL host in {@link #canRip(URL)}. */
    public abstract String getDomain();

    /** @return the host name for this ripper (not used in this class; presumably consumed by the superclass -- confirm). */
    public abstract String getHost();

    /**
     * Fetches the first page of the album.
     *
     * @return the parsed first page; {@code null} makes {@link #rip()} skip the page loop entirely
     * @throws IOException if the page cannot be retrieved
     */
    public abstract Document getFirstPage() throws IOException;

    /**
     * Fetches the page following {@code doc}. The default implementation
     * returns {@code null}, i.e. the album consists of a single page.
     *
     * @param doc the page that was just processed
     * @return the next page, or {@code null} when there are no more pages
     * @throws IOException if the next page cannot be retrieved; {@link #rip()} treats this as "no more pages"
     */
    public Document getNextPage(Document doc) throws IOException {
        return null;
    }

    /**
     * Extracts the image URLs from a page.
     *
     * @param page the page to scan
     * @return the image URLs found; an empty list makes {@link #rip()} fail with an {@link IOException}.
     *         The list must be mutable, because {@link #rip()} trims it when running as a test.
     */
    public abstract List<String> getURLsFromPage(Document page);

    /**
     * Extracts the URLs of the pages that hold text descriptions. Only called
     * when {@link #hasDescriptionSupport()} returns {@code true}; rippers that
     * enable description support must override this method.
     *
     * @param doc the page to scan
     * @return URLs to pass to {@link #getDescription(String)}
     * @throws IOException always, in this default implementation
     */
    public List<String> getDescriptionsFromPage(Document doc) throws IOException {
        throw new IOException("getDescriptionsFromPage not implemented");
    }

    /**
     * Queues or performs the download of a single image.
     *
     * @param url   the image URL
     * @param index 1-based position of the image across all pages of this rip
     */
    public abstract void downloadURL(URL url, int index);

    /**
     * @return an additional thread pool that {@link #rip()} should wait for
     *         before returning, or {@code null} (the default) when the ripper has none
     */
    public DownloadThreadPool getThreadPool() {
        return null;
    }

    /** @return {@code true} (the default) if saved files may be prefixed with their index; see {@link #getPrefix(int)}. */
    public boolean keepSortOrder() {
        return true;
    }

    @Override
    public boolean canRip(URL url) {
        return url.getHost().endsWith(getDomain());
    }

    @Override
    public URL sanitizeURL(URL url) throws MalformedURLException {
        return url;
    }

    /** @return {@code true} if {@link #rip()} should also save text descriptions; {@code false} by default. */
    public boolean hasDescriptionSupport() {
        return false;
    }

    /**
     * Retrieves the description text for one description URL. Rippers that
     * enable description support must override this method.
     *
     * @param page a URL string returned by {@link #getDescriptionsFromPage(Document)}
     * @return the description text, or {@code null} to skip saving it
     * @throws IOException always, in this default implementation
     */
    public String getDescription(String page) throws IOException {
        throw new IOException("getDescription not implemented");
    }

    /**
     * @return the value passed to {@code sleep(...)} before each description
     *         fetch (presumably milliseconds -- confirm against the superclass); 0 by default
     */
    public int descSleepTime() {
        return 0;
    }

    /**
     * Rips the album: walks every page, hands each image URL to
     * {@link #downloadURL(URL, int)}, optionally saves descriptions, and
     * finally waits for outstanding download threads.
     *
     * @throws IOException if the first page cannot be fetched, a page yields
     *                     no image URLs, or a description hook fails
     */
    @Override
    public void rip() throws IOException {
        int index = 0;
        int textindex = 0;
        logger.info("Retrieving " + this.url);
        sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm());
        Document doc = getFirstPage();

        while (doc != null) {
            List<String> imageURLs = getURLsFromPage(doc);
            // Remove all but 1 image
            if (isThisATest()) {
                while (imageURLs.size() > 1) {
                    imageURLs.remove(1);
                }
            }

            if (imageURLs.isEmpty()) {
                throw new IOException("No images found at " + doc.location());
            }

            for (String imageURL : imageURLs) {
                index += 1;
                logger.debug("Found image url #" + index + ": " + imageURL);
                downloadURL(new URL(imageURL), index);
                if (isStopped()) {
                    break;
                }
            }

            if (hasDescriptionSupport()) {
                logger.debug("Fetching description(s) from " + doc.location());
                List<String> textURLs = getDescriptionsFromPage(doc);
                if (textURLs.size() > 0) {
                    logger.debug("Found description link(s) from " + doc.location());
                    for (String textURL : textURLs) {
                        if (isStopped()) {
                            break;
                        }
                        textindex += 1;
                        logger.debug("Getting description from " + textURL);
                        sleep(descSleepTime());
                        String tempDesc = getDescription(textURL);
                        if (tempDesc != null) {
                            logger.debug("Got description: " + tempDesc);
                            saveText(new URL(textURL), "", tempDesc, textindex);
                        }
                    }
                }
            }

            // A test run only ever processes the first page.
            if (isStopped() || isThisATest()) {
                break;
            }

            try {
                sendUpdate(STATUS.LOADING_RESOURCE, "next page");
                doc = getNextPage(doc);
            } catch (IOException e) {
                // Failing to load a further page ends paging but keeps what was queued so far.
                logger.info("Can't get next page: " + e.getMessage());
                break;
            }
        }

        // If they're using a thread pool, wait for it.
        DownloadThreadPool pool = getThreadPool();
        if (pool != null) {
            logger.debug("Waiting for threadpool " + pool.getClass().getName());
            pool.waitForThreads();
        }
        waitForThreads();
    }

    /**
     * Saves a description as a {@code .txt} file inside the working directory.
     * The file name is the last path segment of {@code url}, cut at the first
     * {@code ?}, {@code #}, {@code &} or {@code :}, and prefixed with
     * {@link #getPrefix(int)}.
     *
     * Not the best for some cases, like FurAffinity. Overridden there.
     *
     * @param url          URL the description came from; used to derive the file name
     * @param subdirectory directory below the working directory, or {@code ""}/{@code null} for none
     * @param text         the description text; written as UTF-8
     * @param index        position of the description, used for the sort-order prefix
     * @return {@code true} if the file was written; {@code false} if the rip
     *         was stopped or the file could not be created or written
     */
    public boolean saveText(URL url, String subdirectory, String text, int index) {
        try {
            stopCheck();
        } catch (IOException e) {
            return false;
        }

        String saveAs = url.toExternalForm();
        saveAs = truncateAtTerminator(saveAs.substring(saveAs.lastIndexOf('/') + 1));

        // Treat a null subdirectory like "no subdirectory" instead of failing.
        String subdirPath = "";
        if (subdirectory != null && !subdirectory.equals("")) {
            subdirPath = File.separator + subdirectory;
        }

        File saveFileAs;
        try {
            saveFileAs = new File(
                    workingDir.getCanonicalPath()
                    + subdirPath
                    + File.separator
                    + getPrefix(index)
                    + saveAs
                    + ".txt");

            // The directory has to exist before the stream is opened;
            // FileOutputStream does not create missing parent directories.
            File parentDir = saveFileAs.getParentFile();
            if (!parentDir.exists()) {
                logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent()));
                parentDir.mkdirs();
            }

            logger.debug("Downloading " + url + "'s description to " + saveFileAs);

            // Write the file; close the stream even if the write fails.
            FileOutputStream out = new FileOutputStream(saveFileAs);
            try {
                // Fixed charset so the output does not depend on the platform default.
                out.write(text.getBytes("UTF-8"));
            } finally {
                out.close();
            }
        } catch (IOException e) {
            logger.error("[!] Error creating save file path for description '" + url + "':", e);
            return false;
        }
        return true;
    }

    /**
     * Builds the file-name prefix that preserves download order.
     *
     * @param index position of the item within the rip
     * @return {@code index} zero-padded to three digits plus {@code _} when
     *         {@link #keepSortOrder()} and the {@code download.save_order}
     *         setting are both true; otherwise the empty string
     */
    public String getPrefix(int index) {
        String prefix = "";
        if (keepSortOrder() && Utils.getConfigBoolean("download.save_order", true)) {
            prefix = String.format("%03d_", index);
        }
        return prefix;
    }

    /** Cuts {@code name} at the first occurrence of each terminator character in turn. */
    private static String truncateAtTerminator(String name) {
        String result = name;
        for (char terminator : FILENAME_TERMINATORS) {
            int cut = result.indexOf(terminator);
            if (cut >= 0) {
                result = result.substring(0, cut);
            }
        }
        return result;
    }
}
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`package com.rarchives.ripme.ripper;`

Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`import java.io.File;`
			`import java.io.FileOutputStream;`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`import java.io.IOException;`
			`import java.net.MalformedURLException;`
			`import java.net.URL;`
			`import java.util.List;`

			`import org.jsoup.nodes.Document;`

			`import com.rarchives.ripme.ui.RipStatusMessage.STATUS;`
			`import com.rarchives.ripme.utils.Utils;`

Removed SinglePage ripper 2014-06-23 04:17:40 +02:00			`/**`
			`* Simplified ripper, designed for ripping from sites by parsing HTML.`
			`*/`
Moving from MultiPage ripper to HTML ripper, added JSON ripper 2014-06-23 04:12:29 +02:00			`public abstract class AbstractHTMLRipper extends AlbumRipper {`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00
Moving from MultiPage ripper to HTML ripper, added JSON ripper 2014-06-23 04:12:29 +02:00			`public AbstractHTMLRipper(URL url) throws IOException {`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`super(url);`
			`}`

			`public abstract String getDomain();`
			`public abstract String getHost();`

			`public abstract Document getFirstPage() throws IOException;`
Removed SinglePage ripper 2014-06-23 04:17:40 +02:00			`public Document getNextPage(Document doc) throws IOException {`
Attempt to remove transient failure of tests 2015-02-06 12:01:02 +01:00			`return null;`
Removed SinglePage ripper 2014-06-23 04:17:40 +02:00			`}`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`public abstract List<String> getURLsFromPage(Document page);`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`public List<String> getDescriptionsFromPage(Document doc) throws IOException {`
			`throw new IOException("getDescriptionsFromPage not implemented"); // Do I do this or make an abstract function?`
			`}`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`public abstract void downloadURL(URL url, int index);`
Moving from MultiPage ripper to HTML ripper, added JSON ripper 2014-06-23 04:12:29 +02:00			`public DownloadThreadPool getThreadPool() {`
			`return null;`
			`}`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00
			`public boolean keepSortOrder() {`
			`return true;`
			`}`

			`@Override`
			`public boolean canRip(URL url) {`
			`return url.getHost().endsWith(getDomain());`
			`}`

			`@Override`
			`public URL sanitizeURL(URL url) throws MalformedURLException {`
			`return url;`
			`}`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`public boolean hasDescriptionSupport() {`
			`return false;`
			`}`
			`public String getDescription(String page) throws IOException {`
			`throw new IOException("getDescription not implemented"); // Do I do this or make an abstract function?`
			`}`
Added FurAffinity Description Ripping Also added description sleep time value to AbstractHTMLRipper, to avoid read timed out. 2015-05-29 20:26:48 +02:00			`public int descSleepTime() {`
			`return 0;`
			`}`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`@Override`
			`public void rip() throws IOException {`
			`int index = 0;`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`int textindex = 0;`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`logger.info("Retrieving " + this.url);`
			`sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm());`
			`Document doc = getFirstPage();`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`while (doc != null) {`
			`List<String> imageURLs = getURLsFromPage(doc);`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`// Remove all but 1 image`
			`if (isThisATest()) {`
			`while (imageURLs.size() > 1) {`
			`imageURLs.remove(1);`
			`}`
			`}`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00
			`if (imageURLs.size() == 0) {`
1.0.78 - Added BCFakes ripper #8 2014-07-20 10:31:45 +02:00			`throw new IOException("No images found at " + doc.location());`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`}`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`for (String imageURL : imageURLs) {`
Better integration tests, testing out TravisCI 2015-02-06 08:58:17 +01:00			`index += 1;`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`logger.debug("Found image url #" + index + ": " + imageURL);`
Better integration tests, testing out TravisCI 2015-02-06 08:58:17 +01:00			`downloadURL(new URL(imageURL), index);`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`if (isStopped()) {`
			`break;`
			`}`
			`}`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`if (hasDescriptionSupport()) {`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`logger.debug("Fetching description(s) from " + doc.location());`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`List<String> textURLs = getDescriptionsFromPage(doc);`
			`if (textURLs.size() > 0) {`
Added FurAffinity Description Ripping Also added description sleep time value to AbstractHTMLRipper, to avoid read timed out. 2015-05-29 20:26:48 +02:00			`logger.debug("Found description link(s) from " + doc.location());`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`for (String textURL : textURLs) {`
			`if (isStopped()) {`
			`break;`
			`}`
			`textindex += 1;`
Added FurAffinity Description Ripping Also added description sleep time value to AbstractHTMLRipper, to avoid read timed out. 2015-05-29 20:26:48 +02:00			`logger.debug("Getting description from " + textURL);`
			`sleep(descSleepTime());`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`String tempDesc = getDescription(textURL);`
			`if (tempDesc != null) {`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`logger.debug("Got description: " + tempDesc);`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`saveText(new URL(textURL), "", tempDesc, textindex);`
			`}`
			`}`
			`}`
			`}`
Moving from MultiPage ripper to HTML ripper, added JSON ripper 2014-06-23 04:12:29 +02:00
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`if (isStopped() || isThisATest()) {`
Moving from MultiPage ripper to HTML ripper, added JSON ripper 2014-06-23 04:12:29 +02:00			`break;`
			`}`

Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`try {`
Moving from MultiPage ripper to HTML ripper, added JSON ripper 2014-06-23 04:12:29 +02:00			`sendUpdate(STATUS.LOADING_RESOURCE, "next page");`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`doc = getNextPage(doc);`
			`} catch (IOException e) {`
			`logger.info("Can't get next page: " + e.getMessage());`
			`break;`
			`}`
			`}`
Moving from MultiPage ripper to HTML ripper, added JSON ripper 2014-06-23 04:12:29 +02:00
			`// If they're using a thread pool, wait for it.`
			`if (getThreadPool() != null) {`
Various fixes to tests: Ability to set log level, lots of debugging messages Turn on debug logging during tests, simplified test cases for HTML ripper Fix fusktator ripper, added test Fixed gifyo, added test Added tests for all rippers Adding a few album-guessing URLs 2015-02-10 08:29:29 +01:00			`logger.debug("Waiting for threadpool " + getThreadPool().getClass().getName());`
Moving from MultiPage ripper to HTML ripper, added JSON ripper 2014-06-23 04:12:29 +02:00			`getThreadPool().waitForThreads();`
			`}`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`waitForThreads();`
			`}`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`public boolean saveText(URL url, String subdirectory, String text, int index) {`
Added FurAffinity Description Ripping Also added description sleep time value to AbstractHTMLRipper, to avoid read timed out. 2015-05-29 20:26:48 +02:00			`// Not the best for some cases, like FurAffinity. Overridden there.`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`try {`
			`stopCheck();`
			`} catch (IOException e) {`
			`return false;`
			`}`
			`String saveAs = url.toExternalForm();`
			`saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1);`
			`if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); }`
			`if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); }`
			`if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); }`
			`if (saveAs.indexOf(':') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf(':')); }`
			`File saveFileAs;`
			`try {`
			`if (!subdirectory.equals("")) { // Not sure about this part`
			`subdirectory = File.separator + subdirectory;`
			`}`
Added FurAffinity Description Ripping Also added description sleep time value to AbstractHTMLRipper, to avoid read timed out. 2015-05-29 20:26:48 +02:00			`// TODO Get prefix working again, probably requires reworking a lot of stuff! (Might be fixed now)`
Improved description ripping for deviantART Also, added some functions to AbstractHTMLRipper that should allow description ripping to be added to other rippers as well. hasDescriptionSupport() is a function that will be overridden by a ripper that supports descriptions, and will trigger the description ripper. getDescription will grab the description from a page, and must be overridden if you want to grab a description with a ripper. 2014-11-29 05:59:39 +01:00			`saveFileAs = new File(`
			`workingDir.getCanonicalPath()`
			`+ subdirectory`
			`+ File.separator`
			`+ getPrefix(index)`
			`+ saveAs`
			`+ ".txt");`
			`// Write the file`
			`FileOutputStream out = (new FileOutputStream(saveFileAs));`
			`out.write(text.getBytes());`
			`out.close();`
			`} catch (IOException e) {`
			`logger.error("[!] Error creating save file path for description '" + url + "':", e);`
			`return false;`
			`}`
			`logger.debug("Downloading " + url + "'s description to " + saveFileAs);`
			`if (!saveFileAs.getParentFile().exists()) {`
			`logger.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent()));`
			`saveFileAs.getParentFile().mkdirs();`
			`}`
			`return true;`
			`}`
Using new wrapper for HTTP requests, started abstract classes to simplify rippers 2014-06-22 02:08:42 +02:00			`public String getPrefix(int index) {`
			`String prefix = "";`
			`if (keepSortOrder() && Utils.getConfigBoolean("download.save_order", true)) {`
			`prefix = String.format("%03d_", index);`
			`}`
			`return prefix;`
			`}`
Update AbstractHTMLRipper.java 2015-10-15 21:27:34 +02:00			`}`