2014-03-11 09:29:59 +01:00
package com.rarchives.ripme.ripper.rippers ;
2016-04-17 06:26:27 +02:00
import java.io.File ;
2014-03-11 09:29:59 +01:00
import java.io.IOException ;
import java.net.MalformedURLException ;
import java.net.URL ;
import java.util.List ;
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
import org.json.JSONArray ;
import org.json.JSONObject ;
import org.json.JSONTokener ;
2014-04-20 07:41:11 +02:00
import com.rarchives.ripme.ripper.AlbumRipper ;
2015-03-30 00:01:57 +02:00
import com.rarchives.ripme.ui.UpdateUtils ;
2014-06-22 02:08:42 +02:00
import com.rarchives.ripme.utils.Http ;
2014-03-11 09:29:59 +01:00
import com.rarchives.ripme.utils.RipUtils ;
2014-03-13 20:13:01 +01:00
import com.rarchives.ripme.utils.Utils ;
2014-03-11 09:29:59 +01:00
2014-04-20 07:41:11 +02:00
public class RedditRipper extends AlbumRipper {
2014-03-11 09:29:59 +01:00
/**
 * Creates a ripper for the given URL. All construction work is delegated to
 * the superclass constructor.
 *
 * @param url the URL to rip
 * @throws IOException if the superclass constructor throws it
 */
public RedditRipper(final URL url) throws IOException {
    super(url);
}
private static final String HOST = " reddit " ;
private static final String DOMAIN = " reddit.com " ;
2017-05-10 00:22:55 +02:00
2017-12-07 08:05:57 +01:00
private static final String REDDIT_USER_AGENT = " RipMe:github.com/RipMeApp/ripme: " + UpdateUtils . getThisJarVersion ( ) + " (by /u/metaprime and /u/ineedmorealts) " ;
2014-03-11 09:29:59 +01:00
private static final int SLEEP_TIME = 2000 ;
//private static final String USER_AGENT = "ripme by /u/4_pr0n github.com/4pr0n/ripme";
2017-05-10 00:22:55 +02:00
2014-03-11 09:29:59 +01:00
private long lastRequestTime = 0 ;
/**
 * Reports whether this ripper handles the given URL.
 *
 * @param url candidate URL
 * @return {@code true} when the URL's host ends with {@code DOMAIN}
 */
@Override
public boolean canRip(URL url) {
    final String host = url.getHost();
    return host.endsWith(DOMAIN);
}
/**
 * Normalizes a user URL by replacing the short "/u/" path form with the
 * long "/user/" form; any other URL is returned unchanged (re-parsed).
 *
 * @param url the URL as entered
 * @return the normalized URL
 * @throws MalformedURLException if the rewritten text is not a valid URL
 */
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
    final String rewritten = url.toExternalForm()
            .replaceAll(" reddit \\ .com/u/ ", " reddit.com/user/ ");
    return new URL(rewritten);
}
private URL getJsonURL ( URL url ) throws MalformedURLException {
// Append ".json" to URL in appropriate location.
String result = url . getProtocol ( ) + " :// " + url . getHost ( ) + url . getPath ( ) + " .json " ;
if ( url . getQuery ( ) ! = null ) {
result + = " ? " + url . getQuery ( ) ;
}
return new URL ( result ) ;
}
@Override
public void rip ( ) throws IOException {
URL jsonURL = getJsonURL ( this . url ) ;
while ( true ) {
jsonURL = getAndParseAndReturnNext ( jsonURL ) ;
2015-02-06 12:37:24 +01:00
if ( jsonURL = = null | | isThisATest ( ) | | isStopped ( ) ) {
2014-03-11 09:29:59 +01:00
break ;
}
}
waitForThreads ( ) ;
}
2017-05-10 00:22:55 +02:00
2014-03-11 09:29:59 +01:00
private URL getAndParseAndReturnNext ( URL url ) throws IOException {
JSONArray jsonArray = getJsonArrayFromURL ( url ) , children ;
JSONObject json , data ;
URL nextURL = null ;
for ( int i = 0 ; i < jsonArray . length ( ) ; i + + ) {
json = jsonArray . getJSONObject ( i ) ;
if ( ! json . has ( " data " ) ) {
continue ;
}
data = json . getJSONObject ( " data " ) ;
if ( ! data . has ( " children " ) ) {
continue ;
}
children = data . getJSONArray ( " children " ) ;
for ( int j = 0 ; j < children . length ( ) ; j + + ) {
parseJsonChild ( children . getJSONObject ( j ) ) ;
}
if ( data . has ( " after " ) & & ! data . isNull ( " after " ) ) {
2014-03-13 20:13:01 +01:00
String nextURLString = Utils . stripURLParameter ( url . toExternalForm ( ) , " after " ) ;
2014-03-11 09:29:59 +01:00
if ( nextURLString . contains ( " ? " ) ) {
nextURLString = nextURLString . concat ( " &after= " + data . getString ( " after " ) ) ;
}
else {
nextURLString = nextURLString . concat ( " ?after= " + data . getString ( " after " ) ) ;
}
nextURL = new URL ( nextURLString ) ;
}
}
2015-01-11 09:07:45 +01:00
// Wait to avoid rate-limiting against reddit's API
try {
Thread . sleep ( 2000 ) ;
} catch ( InterruptedException e ) {
2018-06-03 03:14:41 +02:00
LOGGER . warn ( " Interrupted while sleeping " , e ) ;
2015-01-11 09:07:45 +01:00
}
2014-03-11 09:29:59 +01:00
return nextURL ;
}
2017-05-10 00:22:55 +02:00
2014-03-11 09:29:59 +01:00
private JSONArray getJsonArrayFromURL ( URL url ) throws IOException {
// Wait 2 seconds before the next request
long timeDiff = System . currentTimeMillis ( ) - lastRequestTime ;
if ( timeDiff < SLEEP_TIME ) {
try {
Thread . sleep ( timeDiff ) ;
} catch ( InterruptedException e ) {
2018-06-03 03:14:41 +02:00
LOGGER . warn ( " [!] Interrupted while waiting to load next page " , e ) ;
2014-03-11 09:29:59 +01:00
return new JSONArray ( ) ;
}
}
lastRequestTime = System . currentTimeMillis ( ) ;
2015-02-06 12:37:24 +01:00
String jsonString = Http . url ( url )
. ignoreContentType ( )
2015-03-30 00:01:57 +02:00
. userAgent ( REDDIT_USER_AGENT )
2015-02-06 12:37:24 +01:00
. response ( )
. body ( ) ;
2015-01-13 09:27:04 +01:00
Object jsonObj = new JSONTokener ( jsonString ) . nextValue ( ) ;
2014-03-11 09:29:59 +01:00
JSONArray jsonArray = new JSONArray ( ) ;
if ( jsonObj instanceof JSONObject ) {
2017-10-24 16:33:28 +02:00
jsonArray . put ( jsonObj ) ;
2017-05-10 02:50:32 +02:00
} else if ( jsonObj instanceof JSONArray ) {
2014-03-11 09:29:59 +01:00
jsonArray = ( JSONArray ) jsonObj ;
} else {
2018-06-03 03:14:41 +02:00
LOGGER . warn ( " [!] Unable to parse JSON: " + jsonString ) ;
2014-03-11 09:29:59 +01:00
}
return jsonArray ;
}
private void parseJsonChild ( JSONObject child ) {
String kind = child . getString ( " kind " ) ;
JSONObject data = child . getJSONObject ( " data " ) ;
if ( kind . equals ( " t1 " ) ) {
// Comment
handleBody ( data . getString ( " body " ) , data . getString ( " id " ) ) ;
}
else if ( kind . equals ( " t3 " ) ) {
// post
if ( data . getBoolean ( " is_self " ) ) {
// TODO Parse self text
handleBody ( data . getString ( " selftext " ) , data . getString ( " id " ) ) ;
} else {
// Get link
handleURL ( data . getString ( " url " ) , data . getString ( " id " ) ) ;
}
if ( data . has ( " replies " ) & & data . get ( " replies " ) instanceof JSONObject ) {
JSONArray replies = data . getJSONObject ( " replies " )
. getJSONObject ( " data " )
. getJSONArray ( " children " ) ;
for ( int i = 0 ; i < replies . length ( ) ; i + + ) {
parseJsonChild ( replies . getJSONObject ( i ) ) ;
}
}
}
}
2017-10-24 16:33:28 +02:00
private void handleBody ( String body , String id ) {
2014-03-11 09:29:59 +01:00
Pattern p = RipUtils . getURLRegex ( ) ;
Matcher m = p . matcher ( body ) ;
while ( m . find ( ) ) {
2017-05-09 23:57:39 +02:00
String url = m . group ( 1 ) ;
while ( url . endsWith ( " ) " ) ) {
url = url . substring ( 0 , url . length ( ) - 1 ) ;
2014-07-20 09:45:40 +02:00
}
2017-05-09 23:57:39 +02:00
handleURL ( url , id ) ;
2014-03-11 09:29:59 +01:00
}
}
2017-10-24 16:33:28 +02:00
private void handleURL ( String theUrl , String id ) {
2014-03-11 09:29:59 +01:00
URL originalURL ;
try {
originalURL = new URL ( theUrl ) ;
} catch ( MalformedURLException e ) {
return ;
}
List < URL > urls = RipUtils . getFilesFromURL ( originalURL ) ;
if ( urls . size ( ) = = 1 ) {
2016-04-17 06:26:27 +02:00
String url = urls . get ( 0 ) . toExternalForm ( ) ;
Pattern p = Pattern . compile ( " https?://i.reddituploads.com/([a-zA-Z0-9]+) \\ ?.* " ) ;
Matcher m = p . matcher ( url ) ;
if ( m . matches ( ) ) {
// It's from reddituploads. Assume .jpg extension.
String savePath = this . workingDir + File . separator ;
savePath + = id + " - " + m . group ( 1 ) + " .jpg " ;
addURLToDownload ( urls . get ( 0 ) , new File ( savePath ) ) ;
}
else {
addURLToDownload ( urls . get ( 0 ) , id + " - " , " " , theUrl , null ) ;
}
2014-03-11 09:29:59 +01:00
} else if ( urls . size ( ) > 1 ) {
for ( int i = 0 ; i < urls . size ( ) ; i + + ) {
2014-05-26 09:31:58 +02:00
String prefix = id + " - " ;
if ( Utils . getConfigBoolean ( " download.save_order " , true ) ) {
prefix + = String . format ( " %03d- " , i + 1 ) ;
}
2014-06-07 10:25:39 +02:00
addURLToDownload ( urls . get ( i ) , prefix , " " , theUrl , null ) ;
2014-03-11 09:29:59 +01:00
}
}
}
/**
 * Returns the short host name of this ripper.
 *
 * @return the {@code HOST} constant
 */
@Override
public String getHost() {
    return HOST;
}
@Override
public String getGID ( URL url ) throws MalformedURLException {
// User
2017-10-24 16:33:28 +02:00
Pattern p = Pattern . compile ( " ^https?://[a-zA-Z0-9.]{0,4}reddit \\ .com/(user|u)/([a-zA-Z0-9_ \\ -]{3,}).*$ " ) ;
2014-03-11 09:29:59 +01:00
Matcher m = p . matcher ( url . toExternalForm ( ) ) ;
if ( m . matches ( ) ) {
return " user_ " + m . group ( m . groupCount ( ) ) ;
}
// Post
2017-10-24 16:33:28 +02:00
p = Pattern . compile ( " ^https?://[a-zA-Z0-9.]{0,4}reddit \\ .com/.*comments/([a-zA-Z0-9]{1,8}).*$ " ) ;
2014-03-11 09:29:59 +01:00
m = p . matcher ( url . toExternalForm ( ) ) ;
if ( m . matches ( ) ) {
return " post_ " + m . group ( m . groupCount ( ) ) ;
}
// Subreddit
2017-10-24 16:33:28 +02:00
p = Pattern . compile ( " ^https?://[a-zA-Z0-9.]{0,4}reddit \\ .com/r/([a-zA-Z0-9_]+).*$ " ) ;
2014-03-11 09:29:59 +01:00
m = p . matcher ( url . toExternalForm ( ) ) ;
if ( m . matches ( ) ) {
return " sub_ " + m . group ( m . groupCount ( ) ) ;
}
throw new MalformedURLException ( " Only accepts user pages, subreddits, or post, can't understand " + url ) ;
}
}