package com.rarchives.ripme.ripper.rippers;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
import com.rarchives.ripme.utils.Http;
public class ChanRipper extends AbstractHTMLRipper {
    //ArrayList<String> explicit_domains = new ArrayList<String>();
    public static List<ChanSite> explicit_domains = Arrays.asList(
        //Tested (main boards)
        //Untested (main boards)
        new ChanSite(Arrays.asList("anon-ib.com")),
        new ChanSite(Arrays.asList("boards.4chan.org"), Arrays.asList("4cdn.org")),
        //Tested (archives)
        new ChanSite(Arrays.asList("archive.moe"), Arrays.asList("data.archive.moe")),      //4chan archive (successor of foolz archive) Archives: [ a / biz / c / co / diy / gd / i / int / jp / m / mlp / out / po / q / s4s / sci / sp / tg / tv / v / vg / vp / vr / wsg ]
        //Untested (archives)
        new ChanSite(Arrays.asList("4archive.org"), Arrays.asList("imgur.com")),            //4chan archive (on demand)
        new ChanSite(Arrays.asList("archive.4plebs.org"), Arrays.asList("img.4plebs.org")), //4chan archive Archives: [ adv / f / hr / o / pol / s4s / tg / trv / tv / x ] Boards: [ plebs ]
        new ChanSite(Arrays.asList("fgts.jp"), Arrays.asList("dat.fgts.jp"))                //4chan archive Archives: [ asp / cm / h / hc / hm / n / p / r / s / soc / y ]
    );
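    // Each ChanSite above is built from a list of board domains matched against the
    // requested URL's host; the second list, when present, holds the CDN/image host
    // fragments that getURLsFromPage() treats as self-hosted content for that site.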
    public static List<String> url_piece_blacklist = Arrays.asList(
        "=http",
        "http://imgops.com/",
        "iqdb.org",
        "saucenao.com"
    );
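    // These fragments mark links that point away from the chan itself: imgops.com is an
    // image-tools proxy, iqdb.org and saucenao.com are reverse image search services
    // commonly linked under posts, and "=http" catches links that embed another URL as
    // a query parameter, so none of them should be ripped.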
    public ChanSite chanSite;
    public Boolean generalChanSite = true;
    public ChanRipper(URL url) throws IOException {
        super(url);
        for (ChanSite _chanSite : explicit_domains) {
            for (String host : _chanSite.domains) {
                if (url.getHost().equals(host)) {
                    chanSite = _chanSite;
                    generalChanSite = false;
                }
            }
        }
        if (chanSite == null) {
            // No explicit match: fall back to a generic ChanSite built from the URL's host
            chanSite = new ChanSite(Arrays.asList(url.getHost()));
        }
    }
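    // For example, with the hypothetical URL http://boards.4chan.org/r9k/thread/12345678,
    // getHost() below strips the TLD to get "boards.4chan", keeps only "4chan", reads the
    // board ("r9k") from the path, and returns "4chan_r9k".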
    @Override
    public String getHost() {
        String host = this.url.getHost();
        host = host.substring(0, host.lastIndexOf('.'));
        if (host.contains(".")) {
            // Host has subdomain (www)
            host = host.substring(host.lastIndexOf('.') + 1);
        }
        String board = this.url.toExternalForm().split("/")[3];
        return host + "_" + board;
    }
    @Override
    public boolean canRip(URL url) {
        // Check the explicitly supported domains first
        for (ChanSite _chanSite : explicit_domains) {
            for (String host : _chanSite.domains) {
                if (url.getHost().equals(host)) {
                    return true;
                }
            }
        }
        // Otherwise guess from the URL shape; unsupported sites will fail further down the road.
        return url.toExternalForm().contains("/res/")     // Most chans
            || url.toExternalForm().contains("/thread/"); // 4chan, archive.moe
    }
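    // Illustrative (hypothetical) thread URLs this ripper is meant to handle:
    //   http://boards.4chan.org/<board>/thread/<id>
    //   http://<some-chan>/<board>/res/<id>.html
    // The numeric id, capture group 2 of the pattern in getGID(), becomes the GID.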
    /**
     * For example, the archives are all known (check 4chan-x).
     * Should be based on the software the specific chan uses.
     * FoolFuuka uses the same (URL) layout as 4chan.
     */
    @Override
    public String getGID(URL url) throws MalformedURLException {
        Pattern p; Matcher m;
        String u = url.toExternalForm();
        if (u.contains("/thread/") || u.contains("/res/")) {
            p = Pattern.compile("^.*\\.[a-z]{1,3}/[a-zA-Z0-9]+/(thread|res)/([0-9]+)(\\.html|\\.php)?.*$");
            m = p.matcher(u);
            if (m.matches()) {
                return m.group(2);
            }
        }
        throw new MalformedURLException(
                "Expected *chan URL formats: "
                + ".*/@/(res|thread)/####.html"
                + " Got: " + u);
    }
    @Override
    public String getDomain() {
        return this.url.getHost();
    }
    @Override
    public Document getFirstPage() throws IOException {
        return Http.url(this.url).get();
    }
    @Override
    public List<String> getURLsFromPage(Document page) {
        List<String> imageURLs = new ArrayList<String>();
        Pattern p; Matcher m;
        elementloop:
        for (Element link : page.select("a")) {
            if (!link.hasAttr("href")) {
                continue;
            }
            String href = link.attr("href");
            // Check all blacklist items
            for (String blacklist_item : url_piece_blacklist) {
                if (href.contains(blacklist_item)) {
                    logger.debug("Skipping link that contains '" + blacklist_item + "': " + href);
                    continue elementloop;
                }
            }
            Boolean self_hosted = false;
            if (!generalChanSite) {
                for (String cdnDomain : chanSite.cdnDomains) {
                    if (href.contains(cdnDomain)) {
                        self_hosted = true;
                    }
                }
            }
            if (self_hosted || generalChanSite) {
                p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE);
                m = p.matcher(href);
                if (m.matches()) {
                    if (href.startsWith("//")) {
                        href = "http:" + href;
                    }
                    if (href.startsWith("/")) {
                        href = "http://" + this.url.getHost() + href;
                    }
                    // Don't download the same URL twice
                    if (imageURLs.contains(href)) {
                        logger.debug("Already attempted: " + href);
                        continue;
                    }
                    imageURLs.add(href);
                }
            } else {
                //TODO also grab imgur/flickr albums (and all other supported rippers). Maybe add a setting?
            }
        }
        return imageURLs;
    }
    @Override
    public void downloadURL(URL url, int index) {
        addURLToDownload(url, getPrefix(index));
    }
}