2014-03-13 10:26:55 +01:00
package com.rarchives.ripme.ripper.rippers ;
2018-06-30 03:09:06 +02:00
import com.rarchives.ripme.ripper.AbstractJSONRipper ;
2018-04-14 23:45:56 +02:00
import com.rarchives.ripme.utils.Base64 ;
import com.rarchives.ripme.utils.Http ;
2018-06-10 18:12:51 +02:00
import com.rarchives.ripme.utils.RipUtils ;
2018-04-14 23:45:56 +02:00
import com.rarchives.ripme.utils.Utils ;
2014-03-13 10:26:55 +01:00
import java.io.IOException ;
2017-04-29 21:35:39 +02:00
import java.net.HttpURLConnection ;
2014-03-13 10:26:55 +01:00
import java.net.MalformedURLException ;
import java.net.URL ;
import java.util.ArrayList ;
import java.util.Arrays ;
import java.util.HashMap ;
2014-06-22 02:08:42 +02:00
import java.util.HashSet ;
2014-03-13 10:26:55 +01:00
import java.util.List ;
import java.util.Map ;
2014-06-22 02:08:42 +02:00
import java.util.Set ;
2014-03-13 10:26:55 +01:00
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
2018-06-30 03:09:06 +02:00
import org.json.JSONArray ;
import org.json.JSONObject ;
2014-03-13 10:26:55 +01:00
import org.jsoup.Connection.Response ;
2014-11-30 06:14:57 +01:00
import org.jsoup.Jsoup ;
2014-03-13 10:26:55 +01:00
import org.jsoup.nodes.Document ;
import org.jsoup.nodes.Element ;
2014-05-17 18:50:07 +02:00
import org.jsoup.select.Elements ;
2014-03-13 10:26:55 +01:00
2018-06-30 03:09:06 +02:00
public class DeviantartRipper extends AbstractJSONRipper {
/** Value of {@code dapx.requestid} from the first gallery page; sent as {@code iid} and {@code id} on API requests. */
String requestID;
/** Gallery id read from the first gallery page; appended to {@link #baseApiUrl}. */
String galleryID;
/** Gallery owner name parsed from the first gallery page's {@code og:title} meta tag. */
String username;
/** Base endpoint of the gallery paging API; the gallery id is appended directly to it. */
String baseApiUrl = "https://www.deviantart.com/dapi/v1/gallery/";
/** Value of the {@code csrf} key in the first page's embedded JSON; sent as {@code _csrf}. */
String csrf;
/** Cookies returned with the first gallery page response; reused for every API request. */
Map<String, String> pageCookies = new HashMap<>();
2014-03-13 10:26:55 +01:00
2015-01-26 06:12:30 +01:00
private static final int PAGE_SLEEP_TIME = 3000 ,
2015-02-08 08:49:21 +01:00
IMAGE_SLEEP_TIME = 2000 ;
2014-03-13 10:26:55 +01:00
2017-10-24 16:33:28 +02:00
// Cookies assigned from getDACookies() in getFirstPage(), sent with the first page request,
// passed along with every queued download, and merged with response cookies in smallToFull().
private Map < String , String > cookies = new HashMap < > ( ) ;
// Not referenced anywhere in the visible part of this class.
private Set < String > triedURLs = new HashSet < > ( ) ;
2014-06-01 11:02:36 +02:00
2014-03-13 10:26:55 +01:00
// Creates a ripper for the given URL by delegating to the superclass constructor.
// Declares IOException because the superclass constructor call is the only statement here;
// presumably that is where it can be thrown - confirm against AbstractJSONRipper.
public DeviantartRipper ( URL url ) throws IOException {
super ( url ) ;
}
2018-06-10 18:12:51 +02:00
// Built-in cookie string (auth, auth_secure, userinfo). getDACookies() passes it as the second
// argument to Utils.getConfigString("deviantart.cookies", ...) - presumably the fallback used when
// that setting is absent; confirm against Utils.
// NOTE(review): these are session credentials committed to source; consider moving them out of the code.
String loginCookies = " auth=__0f9158aaec09f417b235%3B%221ff79836392a515d154216d919eae573%22; " +
" auth_secure=__41d14dd0da101f411bb0%3B%2281cf2cf9477776162a1172543aae85ce%22; " +
" userinfo=__bf84ac233bfa8ae642e8%3B%7B%22username%22%3A%22grabpy%22%2C%22uniqueid%22%3A%22a0a876aa37dbd4b30e1c80406ee9c280%22%2C%22vd%22%3A%22BbHUXZ%2CBbHUXZ%2CA%2CU%2CA%2C%2CB%2CA%2CB%2CBbHUXZ%2CBbHUdj%2CL%2CL%2CA%2CBbHUdj%2C13%2CA%2CB%2CA%2C%2CA%2CA%2CB%2CA%2CA%2C%2CA%22%2C%22attr%22%3A56%7D " ;
2014-03-13 10:26:55 +01:00
@Override
2014-06-22 02:08:42 +02:00
public String getHost ( ) {
return " deviantart " ;
}
@Override
public String getDomain ( ) {
return " deviantart.com " ;
2014-03-13 10:26:55 +01:00
}
2018-06-30 03:09:06 +02:00
// @Override
// public boolean hasDescriptionSupport() {
// return true;
// }
2014-03-13 10:26:55 +01:00
@Override
public URL sanitizeURL ( URL url ) throws MalformedURLException {
String u = url . toExternalForm ( ) ;
2015-01-26 06:12:30 +01:00
if ( u . replace ( " / " , " " ) . endsWith ( " .deviantart.com " ) ) {
// Root user page, get all albums
if ( ! u . endsWith ( " / " ) ) {
u + = " / " ;
}
u + = " gallery/? " ;
}
2018-06-29 03:04:05 +02:00
Pattern p = Pattern . compile ( " ^https?://www \\ .deviantart \\ .com/([a-zA-Z0-9 \\ -]+)/favou?rites/([0-9]+)/*?$ " ) ;
2016-04-17 06:26:27 +02:00
Matcher m = p . matcher ( url . toExternalForm ( ) ) ;
if ( ! m . matches ( ) ) {
String subdir = " / " ;
if ( u . contains ( " catpath=scraps " ) ) {
subdir = " scraps " ;
}
u = u . replaceAll ( " \\ ?.* " , " ?catpath= " + subdir ) ;
2014-06-22 02:08:42 +02:00
}
2014-03-13 10:26:55 +01:00
return new URL ( u ) ;
}
@Override
2014-06-22 02:08:42 +02:00
public String getGID ( URL url ) throws MalformedURLException {
2018-06-29 03:04:05 +02:00
Pattern p = Pattern . compile ( " ^https?://www \\ .deviantart \\ .com/([a-zA-Z0-9 \\ -]+)(/gallery)?/?( \\ ?.*)?$ " ) ;
2014-06-22 02:08:42 +02:00
Matcher m = p . matcher ( url . toExternalForm ( ) ) ;
if ( m . matches ( ) ) {
// Root gallery
if ( url . toExternalForm ( ) . contains ( " catpath=scraps " ) ) {
return m . group ( 1 ) + " _scraps " ;
}
else {
return m . group ( 1 ) ;
}
}
2018-06-29 03:04:05 +02:00
p = Pattern . compile ( " ^https?://www \\ .deviantart \\ .com/([a-zA-Z0-9 \\ -]+)/gallery/([0-9]+).*$ " ) ;
2014-06-22 02:08:42 +02:00
m = p . matcher ( url . toExternalForm ( ) ) ;
if ( m . matches ( ) ) {
// Subgallery
return m . group ( 1 ) + " _ " + m . group ( 2 ) ;
}
2018-06-29 03:04:05 +02:00
p = Pattern . compile ( " ^https?://www \\ .deviantart \\ .com/([a-zA-Z0-9 \\ -]+)/favou?rites/([0-9]+)/.*?$ " ) ;
2016-04-17 06:26:27 +02:00
m = p . matcher ( url . toExternalForm ( ) ) ;
if ( m . matches ( ) ) {
return m . group ( 1 ) + " _faves_ " + m . group ( 2 ) ;
}
2018-06-29 03:04:05 +02:00
p = Pattern . compile ( " ^https?://www \\ .deviantart \\ .com/([a-zA-Z0-9 \\ -]+)/favou?rites/?$ " ) ;
2014-06-26 09:10:44 +02:00
m = p . matcher ( url . toExternalForm ( ) ) ;
if ( m . matches ( ) ) {
// Subgallery
return m . group ( 1 ) + " _faves " ;
}
2018-06-29 03:04:05 +02:00
throw new MalformedURLException ( " Expected URL format: http://www.deviantart.com/username[/gallery/#####], got: " + url ) ;
2014-06-22 02:08:42 +02:00
}
2014-05-16 07:44:57 +02:00
2018-04-14 23:45:56 +02:00
/ * *
* Gets first page .
* Will determine if login is supplied ,
* if there is a login , then login and add that login cookies .
* Otherwise , just bypass the age gate with an anonymous flag .
* @return
* @throws IOException
* /
2014-06-22 02:08:42 +02:00
@Override
2018-06-30 03:09:06 +02:00
public JSONObject getFirstPage ( ) throws IOException {
2018-04-14 23:45:56 +02:00
2018-06-10 18:12:51 +02:00
// Base64 da login
// username: Z3JhYnB5
// password: ZmFrZXJz
cookies = getDACookies ( ) ;
if ( cookies . isEmpty ( ) ) {
LOGGER . warn ( " Failed to get login cookies " ) ;
2018-04-14 23:45:56 +02:00
cookies . put ( " agegate_state " , " 1 " ) ; // Bypasses the age gate
}
2018-06-15 21:13:13 +02:00
2018-06-30 03:09:06 +02:00
Response res = Http . url ( this . url )
2014-06-22 02:08:42 +02:00
. cookies ( cookies )
2018-06-30 03:09:06 +02:00
. response ( ) ;
Document page = res . parse ( ) ;
JSONObject firstPageJSON = getFirstPageJSON ( page ) ;
requestID = firstPageJSON . getJSONObject ( " dapx " ) . getString ( " requestid " ) ;
2018-07-01 20:55:48 +02:00
galleryID = getGalleryID ( page ) ;
username = getUsername ( page ) ;
2018-06-30 03:09:06 +02:00
csrf = firstPageJSON . getString ( " csrf " ) ;
pageCookies = res . cookies ( ) ;
return requestPage ( 0 , galleryID , username , requestID , csrf , pageCookies ) ;
2014-06-22 02:08:42 +02:00
}
2018-06-30 03:09:06 +02:00
/**
 * Requests one page of gallery results from the paging API.
 * <p>
 * POSTs to {@code baseApiUrl + galleryID + "?iid=" + requestID} with the form fields
 * {@code username}, {@code offset}, {@code limit} (fixed at 24), {@code _csrf} and {@code id},
 * and parses the text of the response body as JSON.
 *
 * @param offset         offset of the first item to request
 * @param galleryID      gallery id appended to the API base URL
 * @param username       value of the {@code username} form field
 * @param requestID      value sent both as the {@code iid} query parameter and the {@code id} form field
 * @param csrfToken      value of the {@code _csrf} form field
 * @param requestCookies cookies to send with the request
 * @return the parsed JSON response
 * @throws IOException if the request fails; the original cause is preserved
 */
private JSONObject requestPage(int offset, String galleryID, String username, String requestID,
        String csrfToken, Map<String, String> requestCookies) throws IOException {
    LOGGER.debug(" offset: " + Integer.toString(offset));
    LOGGER.debug(" galleryID: " + galleryID);
    LOGGER.debug(" username: " + username);
    LOGGER.debug(" requestID: " + requestID);

    String url = baseApiUrl + galleryID + "?iid=" + requestID;
    try {
        Document doc = Http.url(url)
                .cookies(requestCookies)
                .data("username", username)
                .data("offset", Integer.toString(offset))
                .data("limit", "24")
                .data("_csrf", csrfToken)
                .data("id", requestID)
                .ignoreContentType()
                .post();
        return new JSONObject(doc.body().text());
    } catch (IOException e) {
        // Propagate instead of returning null so callers do not fail later with an NPE.
        throw new IOException("Got error trying to get page at offset " + offset + ": " + e.getMessage(), e);
    }
}
private JSONObject getFirstPageJSON ( Document doc ) {
for ( Element js : doc . select ( " script " ) ) {
LOGGER . info ( js . html ( ) ) ;
if ( js . html ( ) . contains ( " requestid " ) ) {
String json = js . html ( ) . replaceAll ( " window.__initial_body_data= " , " " ) . replaceAll ( " \\ ); " , " " )
. replaceAll ( " ;__wake \\ (.+ " , " " ) ;
LOGGER . info ( " json: " + json ) ;
JSONObject j = new JSONObject ( json ) ;
return j ;
2017-04-27 06:13:11 +02:00
}
}
return null ;
}
2018-07-01 20:55:48 +02:00
private String getGalleryID ( Document doc ) {
for ( Element el : doc . select ( " input[name=set] " ) ) {
try {
String galleryID = el . attr ( " value " ) ;
if ( galleryID . length ( ) = = 8 ) {
return galleryID ;
}
} catch ( NullPointerException e ) {
continue ;
}
}
LOGGER . error ( " Could not find gallery ID " ) ;
return null ;
}
/**
 * Reads the gallery owner's name from the page's {@code og:title} meta tag.
 *
 * @param doc the gallery page
 * @return the tag's {@code content} with every {@code "'s DeviantArt gallery"} occurrence removed;
 *         empty when the tag or attribute is missing
 */
private String getUsername(Document doc) {
    String title = doc.select("meta[property=og:title]").attr("content");
    // Literal replacement: the suffix contains no regex syntax, so replace() is the direct form.
    return title.replace("'s DeviantArt gallery", "");
}
2018-06-30 03:09:06 +02:00
2014-06-22 02:08:42 +02:00
@Override
2018-06-30 03:09:06 +02:00
public List < String > getURLsFromJSON ( JSONObject json ) {
2017-10-24 16:33:28 +02:00
List < String > imageURLs = new ArrayList < > ( ) ;
2018-06-30 03:09:06 +02:00
LOGGER . info ( json ) ;
JSONArray results = json . getJSONObject ( " content " ) . getJSONArray ( " results " ) ;
for ( int i = 0 ; i < results . length ( ) ; i + + ) {
Document doc = Jsoup . parseBodyFragment ( results . getJSONObject ( i ) . getString ( " html " ) ) ;
try {
String imageURL = doc . select ( " span " ) . first ( ) . attr ( " data-super-full-img " ) ;
if ( ! imageURL . isEmpty ( ) ) {
imageURLs . add ( imageURL ) ;
2017-04-29 21:35:39 +02:00
}
2018-06-30 03:09:06 +02:00
} catch ( NullPointerException e ) {
LOGGER . info ( i + " does not contain any images " ) ;
2014-03-13 10:26:55 +01:00
}
2015-02-06 08:58:17 +01:00
2014-03-13 10:26:55 +01:00
}
2014-06-22 02:08:42 +02:00
return imageURLs ;
2014-03-13 10:26:55 +01:00
}
2018-06-30 03:09:06 +02:00
// @Override
// public List<String> getDescriptionsFromPage(Document page) {
// List<String> textURLs = new ArrayList<>();
// // Iterate over all thumbnails
// for (Element thumb : page.select("div.zones-container span.thumb")) {
// LOGGER.info(thumb.attr("href"));
// if (isStopped()) {
// break;
// }
// Element img = thumb.select("img").get(0);
// if (img.attr("transparent").equals("false")) {
// continue; // a.thumbs to other albums are invisible
// }
// textURLs.add(thumb.attr("href"));
//
// }
// return textURLs;
// }
2017-04-26 04:18:03 +02:00
2014-06-22 02:08:42 +02:00
@Override
2018-06-30 03:09:06 +02:00
public JSONObject getNextPage ( JSONObject page ) throws IOException {
boolean hasMore = page . getJSONObject ( " content " ) . getBoolean ( " has_more " ) ;
if ( hasMore ) {
return requestPage ( page . getJSONObject ( " content " ) . getInt ( " next_offset " ) , galleryID , username , requestID , csrf , pageCookies ) ;
2015-02-06 08:58:17 +01:00
}
2018-06-29 03:04:05 +02:00
2018-06-30 03:09:06 +02:00
throw new IOException ( " No more pages " ) ;
2014-05-17 07:20:06 +02:00
}
2018-07-01 20:55:48 +02:00
// @Override
// public boolean keepSortOrder() {
// // Don't keep sort order (do not add prefixes).
// // Causes file duplication, as outlined in https://github.com/4pr0n/ripme/issues/113
// return false;
// }
2015-02-08 08:49:21 +01:00
2014-06-22 02:08:42 +02:00
@Override
public void downloadURL ( URL url , int index ) {
addURLToDownload ( url , getPrefix ( index ) , " " , this . url . toExternalForm ( ) , cookies ) ;
2015-01-26 06:12:30 +01:00
sleep ( IMAGE_SLEEP_TIME ) ;
2014-05-17 07:20:06 +02:00
}
2014-05-26 09:31:58 +02:00
/ * *
* Tries to get full size image from thumbnail URL
* @param thumb Thumbnail URL
* @param throwException Whether or not to throw exception when full size image isn ' t found
* @return Full - size image URL
* @throws Exception If it can ' t find the full - size URL
* /
2017-10-24 16:33:28 +02:00
private static String thumbToFull ( String thumb , boolean throwException ) throws Exception {
2014-03-13 10:26:55 +01:00
thumb = thumb . replace ( " http://th " , " http://fc " ) ;
2017-10-24 16:33:28 +02:00
List < String > fields = new ArrayList < > ( Arrays . asList ( thumb . split ( " / " ) ) ) ;
2014-03-13 10:26:55 +01:00
fields . remove ( 4 ) ;
2014-05-17 18:50:07 +02:00
if ( ! fields . get ( 4 ) . equals ( " f " ) & & throwException ) {
// Not a full-size image
throw new Exception ( " Can't get full size image from " + thumb ) ;
}
2014-03-13 10:26:55 +01:00
StringBuilder result = new StringBuilder ( ) ;
for ( int i = 0 ; i < fields . size ( ) ; i + + ) {
if ( i > 0 ) {
result . append ( " / " ) ;
}
result . append ( fields . get ( i ) ) ;
}
return result . toString ( ) ;
}
2017-05-15 19:24:36 +02:00
2014-11-29 05:59:39 +01:00
/**
 * Documentation for getDescription, which is currently commented out below.
 * Attempts to download description for image.
 * Comes in handy when people put entire stories in their description.
 * If no description was found, returns null.
 * @param url The URL the description will be retrieved from
 * @param page The gallery page the URL was found on
 * @return A String[] with first object being the description, and the second object being image file name if found.
 */
2018-06-30 03:09:06 +02:00
// @Override
// public String[] getDescription(String url,Document page) {
// if (isThisATest()) {
// return null;
// }
// try {
// // Fetch the image page
// Response resp = Http.url(url)
// .referrer(this.url)
// .cookies(cookies)
// .response();
// cookies.putAll(resp.cookies());
//
// // Try to find the description
// Document documentz = resp.parse();
// Element ele = documentz.select("div.dev-description").first();
// if (ele == null) {
// throw new IOException("No description found");
// }
// documentz.outputSettings(new Document.OutputSettings().prettyPrint(false));
// ele.select("br").append("\\n");
// ele.select("p").prepend("\\n\\n");
// String fullSize = null;
// Element thumb = page.select("div.zones-container span.thumb[href=\"" + url + "\"]").get(0);
// if (!thumb.attr("data-super-full-img").isEmpty()) {
// fullSize = thumb.attr("data-super-full-img");
// String[] split = fullSize.split("/");
// fullSize = split[split.length - 1];
// } else {
// String spanUrl = thumb.attr("href");
// fullSize = jsonToImage(page,spanUrl.substring(spanUrl.lastIndexOf('-') + 1));
// if (fullSize != null) {
// String[] split = fullSize.split("/");
// fullSize = split[split.length - 1];
// }
// }
// if (fullSize == null) {
// return new String[] {Jsoup.clean(ele.html().replaceAll("\\\\n", System.getProperty("line.separator")), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false))};
// }
// fullSize = fullSize.substring(0, fullSize.lastIndexOf("."));
// return new String[] {Jsoup.clean(ele.html().replaceAll("\\\\n", System.getProperty("line.separator")), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)),fullSize};
// // TODO Make this not make a newline if someone just types \n into the description.
// } catch (IOException ioe) {
// LOGGER.info("Failed to get description at " + url + ": '" + ioe.getMessage() + "'");
// return null;
// }
// }
2017-05-15 19:24:36 +02:00
2014-05-26 09:31:58 +02:00
/ * *
* If largest resolution for image at ' thumb ' is found , starts downloading
* and returns null .
* If it finds a larger resolution on another page , returns the image URL .
* @param thumb Thumbnail URL
* @param page Page the thumbnail is retrieved from
* @return Highest - resolution version of the image based on thumbnail URL and the page .
* /
2017-10-24 16:33:28 +02:00
private String smallToFull ( String thumb , String page ) {
2014-05-17 18:50:07 +02:00
try {
2014-06-01 11:02:36 +02:00
// Fetch the image page
2014-06-22 02:08:42 +02:00
Response resp = Http . url ( page )
. referrer ( this . url )
. cookies ( cookies )
. response ( ) ;
cookies . putAll ( resp . cookies ( ) ) ;
2015-03-30 10:32:27 +02:00
Document doc = resp . parse ( ) ;
2017-04-29 21:35:39 +02:00
Elements els = doc . select ( " img.dev-content-full " ) ;
String fsimage = null ;
2015-03-30 10:32:27 +02:00
// Get the largest resolution image on the page
2018-05-30 04:48:44 +02:00
if ( ! els . isEmpty ( ) ) {
2015-03-30 10:32:27 +02:00
// Large image
2017-04-29 21:35:39 +02:00
fsimage = els . get ( 0 ) . attr ( " src " ) ;
2018-06-03 03:14:41 +02:00
LOGGER . info ( " Found large-scale: " + fsimage ) ;
2017-04-29 21:35:39 +02:00
if ( fsimage . contains ( " //orig " ) ) {
return fsimage ;
}
}
// Try to find the download button
els = doc . select ( " a.dev-page-download " ) ;
2018-05-30 04:48:44 +02:00
if ( ! els . isEmpty ( ) ) {
2017-04-29 21:35:39 +02:00
// Full-size image
String downloadLink = els . get ( 0 ) . attr ( " href " ) ;
2018-06-03 03:14:41 +02:00
LOGGER . info ( " Found download button link: " + downloadLink ) ;
2017-04-29 21:35:39 +02:00
HttpURLConnection con = ( HttpURLConnection ) new URL ( downloadLink ) . openConnection ( ) ;
con . setRequestProperty ( " Referer " , this . url . toString ( ) ) ;
String cookieString = " " ;
for ( Map . Entry < String , String > entry : cookies . entrySet ( ) ) {
cookieString = cookieString + entry . getKey ( ) + " = " + entry . getValue ( ) + " ; " ;
}
cookieString = cookieString . substring ( 0 , cookieString . length ( ) - 1 ) ;
con . setRequestProperty ( " Cookie " , cookieString ) ;
2017-10-24 16:33:28 +02:00
con . setRequestProperty ( " User-Agent " , USER_AGENT ) ;
2017-04-29 21:35:39 +02:00
con . setInstanceFollowRedirects ( true ) ;
con . connect ( ) ;
int code = con . getResponseCode ( ) ;
2017-04-30 04:07:49 +02:00
String location = con . getURL ( ) . toString ( ) ;
2017-04-29 21:35:39 +02:00
con . disconnect ( ) ;
if ( location . contains ( " //orig " ) ) {
fsimage = location ;
2018-06-03 03:14:41 +02:00
LOGGER . info ( " Found image download: " + location ) ;
2017-04-29 21:35:39 +02:00
}
}
if ( fsimage ! = null ) {
2015-03-30 10:32:27 +02:00
return fsimage ;
2014-05-17 18:50:07 +02:00
}
2015-03-30 10:32:27 +02:00
throw new IOException ( " No download page found " ) ;
2014-05-17 18:50:07 +02:00
} catch ( IOException ioe ) {
try {
2018-06-03 03:14:41 +02:00
LOGGER . info ( " Failed to get full size download image at " + page + " : ' " + ioe . getMessage ( ) + " ' " ) ;
2014-05-17 18:50:07 +02:00
String lessThanFull = thumbToFull ( thumb , false ) ;
2018-06-03 03:14:41 +02:00
LOGGER . info ( " Falling back to less-than-full-size image " + lessThanFull ) ;
2014-05-17 18:50:07 +02:00
return lessThanFull ;
} catch ( Exception e ) {
return null ;
}
}
}
2014-03-13 10:26:55 +01:00
/ * *
2018-06-10 18:12:51 +02:00
* Returns DA cookies .
2014-03-13 10:26:55 +01:00
* @return Map of cookies containing session data .
* /
2018-06-10 18:12:51 +02:00
private Map < String , String > getDACookies ( ) {
return RipUtils . getCookiesFromString ( Utils . getConfigString ( " deviantart.cookies " , loginCookies ) ) ;
2014-03-13 10:26:55 +01:00
}
2018-05-19 17:35:57 +02:00
}