2014-03-13 03:29:31 +01:00
package com.rarchives.ripme.ripper.rippers ;
import java.io.IOException ;
import java.net.MalformedURLException ;
import java.net.URL ;
2014-06-23 04:12:29 +02:00
import java.util.ArrayList ;
import java.util.Collections ;
import java.util.HashMap ;
import java.util.List ;
import java.util.Map ;
2014-03-13 03:29:31 +01:00
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
2014-06-23 04:12:29 +02:00
import org.jsoup.Connection.Response ;
2014-03-13 03:29:31 +01:00
import org.jsoup.nodes.Document ;
import org.jsoup.nodes.Element ;
2014-06-23 04:12:29 +02:00
import org.jsoup.select.Elements ;
2014-03-13 03:29:31 +01:00
2014-06-23 04:17:40 +02:00
import com.rarchives.ripme.ripper.AbstractHTMLRipper ;
2014-06-05 05:23:21 +02:00
import com.rarchives.ripme.ui.RipStatusMessage.STATUS ;
2014-06-22 02:08:42 +02:00
import com.rarchives.ripme.utils.Http ;
2014-03-13 03:29:31 +01:00
2014-06-23 04:17:40 +02:00
public class EightmusesRipper extends AbstractHTMLRipper {
2014-03-13 03:29:31 +01:00
2014-06-05 05:23:21 +02:00
private Document albumDoc = null ;
2014-06-23 04:12:29 +02:00
private Map < String , String > cookies = new HashMap < String , String > ( ) ;
2014-06-05 05:23:21 +02:00
2014-03-13 03:29:31 +01:00
public EightmusesRipper ( URL url ) throws IOException {
super ( url ) ;
}
@Override
2014-06-23 04:12:29 +02:00
public String getHost ( ) {
return " 8muses " ;
}
@Override
public String getDomain ( ) {
return " 8muses.com " ;
2014-03-13 03:29:31 +01:00
}
@Override
2014-06-23 04:12:29 +02:00
public String getGID ( URL url ) throws MalformedURLException {
Pattern p = Pattern . compile ( " ^https?://(www \\ .)?8muses \\ .com/index/category/([a-zA-Z0-9 \\ -_]+).*$ " ) ;
Matcher m = p . matcher ( url . toExternalForm ( ) ) ;
if ( ! m . matches ( ) ) {
throw new MalformedURLException ( " Expected URL format: http://www.8muses.com/index/category/albumname, got: " + url ) ;
}
return m . group ( m . groupCount ( ) ) ;
2014-03-13 03:29:31 +01:00
}
2014-06-23 04:12:29 +02:00
2014-06-05 05:23:21 +02:00
@Override
public String getAlbumTitle ( URL url ) throws MalformedURLException {
try {
// Attempt to use album title as GID
2014-06-23 04:12:29 +02:00
Element titleElement = getFirstPage ( ) . select ( " meta[name=description] " ) . first ( ) ;
2014-06-05 05:23:21 +02:00
String title = titleElement . attr ( " content " ) ;
2017-02-21 21:39:52 +01:00
title = title . replace ( " A huge collection of free porn comics for adults. Read " , " " ) ;
title = title . replace ( " online for free at 8muses.com " , " " ) ;
2014-06-23 04:12:29 +02:00
return getHost ( ) + " _ " + title . trim ( ) ;
2014-06-05 05:23:21 +02:00
} catch ( IOException e ) {
// Fall back to default album naming convention
logger . info ( " Unable to find title at " + url ) ;
}
return super . getAlbumTitle ( url ) ;
}
2014-03-13 03:29:31 +01:00
@Override
2014-06-23 04:12:29 +02:00
public Document getFirstPage ( ) throws IOException {
2014-06-05 05:23:21 +02:00
if ( albumDoc = = null ) {
2014-06-23 04:12:29 +02:00
Response resp = Http . url ( url ) . response ( ) ;
cookies . putAll ( resp . cookies ( ) ) ;
albumDoc = resp . parse ( ) ;
2014-06-05 05:23:21 +02:00
}
2014-06-23 04:12:29 +02:00
return albumDoc ;
}
2014-06-05 05:23:21 +02:00
2014-06-23 04:12:29 +02:00
@Override
public List < String > getURLsFromPage ( Document page ) {
List < String > imageURLs = new ArrayList < String > ( ) ;
if ( page . select ( " .preview > span " ) . size ( ) > 0 ) {
2014-06-05 05:23:21 +02:00
// Page contains subalbums (not images)
2014-06-23 04:12:29 +02:00
Elements albumElements = page . select ( " a.preview " ) ;
List < Element > albumsList = albumElements . subList ( 0 , albumElements . size ( ) ) ;
Collections . reverse ( albumsList ) ;
// Iterate over elements in reverse order
for ( Element subalbum : albumsList ) {
String subUrl = subalbum . attr ( " href " ) ;
subUrl = subUrl . replaceAll ( " \\ . \\ ./ " , " " ) ;
if ( subUrl . startsWith ( " // " ) ) {
subUrl = " http: " ;
}
else if ( ! subUrl . startsWith ( " http:// " ) ) {
subUrl = " http://www.8muses.com/ " + subUrl ;
}
try {
logger . info ( " Retrieving " + subUrl ) ;
sendUpdate ( STATUS . LOADING_RESOURCE , subUrl ) ;
Document subPage = Http . url ( subUrl ) . get ( ) ;
// Get all images in subalbum, add to list.
List < String > subalbumImages = getURLsFromPage ( subPage ) ;
logger . info ( " Found " + subalbumImages . size ( ) + " images in subalbum " ) ;
imageURLs . addAll ( subalbumImages ) ;
} catch ( IOException e ) {
logger . warn ( " Error while loading subalbum " + subUrl , e ) ;
continue ;
}
2014-03-13 03:29:31 +01:00
}
2014-06-05 05:23:21 +02:00
}
else {
// Page contains images
2016-12-16 06:03:16 +01:00
for ( Element thumb : page . select ( " .image " ) ) {
2015-09-17 11:33:06 +02:00
if ( super . isStopped ( ) ) break ;
2014-06-23 04:12:29 +02:00
// Find thumbnail image source
String image = null ;
if ( thumb . hasAttr ( " data-cfsrc " ) ) {
image = thumb . attr ( " data-cfsrc " ) ;
}
else {
2015-09-17 11:33:06 +02:00
String parentHref = thumb . parent ( ) . attr ( " href " ) ;
if ( parentHref . equals ( " " ) ) continue ;
if ( parentHref . startsWith ( " / " ) ) {
parentHref = " https://www.8muses.com " + parentHref ;
}
try {
logger . info ( " Retrieving full-size image location from " + parentHref ) ;
image = getFullSizeImage ( parentHref ) ;
} catch ( IOException e ) {
logger . error ( " Failed to get full-size image from " + parentHref ) ;
continue ;
}
2014-06-23 04:12:29 +02:00
}
2015-02-06 20:50:24 +01:00
if ( ! image . contains ( " 8muses.com " ) ) {
2016-12-16 06:03:16 +01:00
// Not hosted on 8muses.
2015-02-06 20:50:24 +01:00
continue ;
}
2014-06-23 04:12:29 +02:00
imageURLs . add ( image ) ;
2015-09-17 11:33:06 +02:00
if ( isThisATest ( ) ) break ;
2014-05-26 09:31:58 +02:00
}
2014-03-13 03:29:31 +01:00
}
2014-06-23 04:12:29 +02:00
return imageURLs ;
2014-03-13 03:29:31 +01:00
}
2015-09-17 11:33:06 +02:00
private String getFullSizeImage ( String imageUrl ) throws IOException {
2015-09-17 12:11:50 +02:00
sendUpdate ( STATUS . LOADING_RESOURCE , imageUrl ) ;
2016-12-16 06:03:16 +01:00
Document doc = new Http ( imageUrl ) . get ( ) ; // Retrieve the webpage of the image URL
Element fullSizeImage = doc . select ( " .photo " ) . first ( ) ; // Select the "photo" element from the page (there should only be 1)
String path = " https://www.8muses.com/data/fu/ " + fullSizeImage . children ( ) . select ( " #imageName " ) . attr ( " value " ) ; // Append the path to the fullsize image file to the standard prefix
return path ;
2015-09-17 11:33:06 +02:00
}
2014-03-13 03:29:31 +01:00
@Override
2014-06-23 04:12:29 +02:00
public void downloadURL ( URL url , int index ) {
addURLToDownload ( url , getPrefix ( index ) , " " , this . url . toExternalForm ( ) , cookies ) ;
2014-03-13 03:29:31 +01:00
}
@Override
2014-06-23 04:12:29 +02:00
public String getPrefix ( int index ) {
return String . format ( " %03d_ " , index ) ;
2014-03-13 03:29:31 +01:00
}
}