package com.rarchives.ripme.ripper.rippers;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
import com.rarchives.ripme.utils.Http;

/**
 * Ripper for *chan-style image board threads.
 *
 * <p>Hosts listed in {@link #explicit_domains} are always accepted and only
 * links that mention one of that site's CDN domains are collected. Any other
 * host is treated as a "general" chan site: it is accepted when its URL
 * contains {@code /res/} or {@code /thread/}, and every link on the page that
 * ends in a known image/video extension is collected.
 */
public class ChanRipper extends AbstractHTMLRipper {

    /** Sites with known page domains and the CDN domains their media is served from. */
    public static List<ChanSite> explicit_domains = Arrays.asList(
            new ChanSite(Arrays.asList("boards.4chan.org"), Arrays.asList("4cdn.org")),
            new ChanSite(Arrays.asList("archive.moe"), Arrays.asList("data.archive.moe")),
            new ChanSite(Arrays.asList("4archive.org"), Arrays.asList("imgur.com")),
            new ChanSite(Arrays.asList("archive.4plebs.org"), Arrays.asList("img.4plebs.org")),
            new ChanSite(Arrays.asList("fgts.jp"), Arrays.asList("dat.fgts.jp"))
    );

    /** A link whose href contains any of these substrings is never downloaded. */
    public static List<String> url_piece_blacklist = Arrays.asList(
            "=http",
            "http://imgops.com/",
            "iqdb.org",
            "saucenao.com"
    );

    /** Matches thread URLs and captures the numeric thread id in group 2. */
    private static final Pattern THREAD_URL_PATTERN = Pattern.compile(
            "^.*\\.[a-z]{1,3}/[a-zA-Z0-9]+/(thread|res)/([0-9]+)(\\.html|\\.php)?.*$");

    /** Matches hrefs that end in a supported media file extension (case-insensitive). */
    private static final Pattern IMAGE_URL_PATTERN = Pattern.compile(
            "^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE);

    /** The site definition in use: an entry of {@link #explicit_domains} or an ad-hoc one. */
    public ChanSite chanSite;

    /** {@code true} when the host is not one of the {@link #explicit_domains}. */
    public Boolean generalChanSite = true;

    /**
     * Creates a ripper for the given thread URL and selects the matching site
     * definition, falling back to an ad-hoc definition built from the URL's host.
     *
     * @param url the thread URL to rip
     * @throws IOException propagated from the superclass constructor
     */
    public ChanRipper(URL url) throws IOException {
        super(url);
        ChanSite known = findExplicitSite(url.getHost());
        if (known != null) {
            chanSite = known;
            generalChanSite = false;
        } else {
            chanSite = new ChanSite(Arrays.asList(url.getHost()));
        }
    }

    /**
     * Looks up a host among the explicitly supported sites.
     *
     * @param host the host name to look up
     * @return the first entry of {@link #explicit_domains} whose domains contain
     *         {@code host}, or {@code null} if there is none
     */
    private static ChanSite findExplicitSite(String host) {
        for (ChanSite candidate : explicit_domains) {
            if (candidate.domains.contains(host)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Builds a short identifier of the form {@code <site>_<board>}, e.g.
     * {@code 4chan_b} for {@code http://boards.4chan.org/b/thread/123}.
     *
     * <p>The site part is the host with its last dot-separated label (the TLD)
     * removed and, if anything is left of a remaining dot (a subdomain), only
     * the part after that dot. The board part is the first path segment of the
     * URL.
     *
     * @return the identifier; only the site part when the URL has no path segment
     */
    @Override
    public String getHost() {
        String host = this.url.getHost();
        // Strip the TLD. Hosts without any dot (e.g. "localhost") are kept as-is
        // instead of failing with StringIndexOutOfBoundsException.
        int tldDot = host.lastIndexOf('.');
        if (tldDot >= 0) {
            host = host.substring(0, tldDot);
        }
        if (host.contains(".")) {
            // Host has subdomain (www)
            host = host.substring(host.lastIndexOf('.') + 1);
        }
        // "http://host/board/..." splits into ["http:", "", "host", "board", ...]
        String[] pieces = this.url.toExternalForm().split("/");
        if (pieces.length <= 3) {
            // No board segment in the URL; avoid ArrayIndexOutOfBoundsException.
            return host;
        }
        return host + "_" + pieces[3];
    }

    /**
     * Decides whether this ripper handles the given URL.
     *
     * @param url the URL to check
     * @return {@code true} if the host is one of the {@link #explicit_domains},
     *         or the URL contains {@code /res/} or {@code /thread/}
     */
    @Override
    public boolean canRip(URL url) {
        if (findExplicitSite(url.getHost()) != null) {
            return true;
        }
        String external = url.toExternalForm();
        return external.contains("/res/")      // Most chans
            || external.contains("/thread/");  // 4chan, archive.moe
    }

    /**
     * Extracts the numeric thread id from a thread URL.
     *
     * <p>For example the archives are all known. (Check 4chan-x)
     * Should be based on the software the specific chan uses.
     * FoolFuuka uses the same (url) layout as 4chan.
     *
     * @param url the thread URL
     * @return the thread id (the digits following {@code /thread/} or {@code /res/})
     * @throws MalformedURLException if the URL does not match the expected layout
     */
    @Override
    public String getGID(URL url) throws MalformedURLException {
        String u = url.toExternalForm();
        // The pattern itself requires "/thread/" or "/res/", so no pre-check is needed.
        Matcher m = THREAD_URL_PATTERN.matcher(u);
        if (m.matches()) {
            return m.group(2);
        }
        throw new MalformedURLException(
                "Expected *chan URL formats: "
                        + ".*/@/(res|thread)/####.html"
                        + " Got: " + u);
    }

    /**
     * @return the host of the URL being ripped
     */
    @Override
    public String getDomain() {
        return this.url.getHost();
    }

    /**
     * Fetches the thread page.
     *
     * @return the parsed document for this ripper's URL
     * @throws IOException if the page cannot be retrieved
     */
    @Override
    public Document getFirstPage() throws IOException {
        return Http.url(this.url).get();
    }

    /**
     * Collects the media URLs linked from a thread page.
     *
     * <p>Every {@code <a href>} is considered. Links containing a blacklisted
     * piece are skipped; on explicitly supported sites, links that do not
     * mention one of the site's CDN domains are skipped as well. Of the
     * remaining links, those ending in a supported media extension are made
     * absolute and returned once each, in page order.
     *
     * @param page the thread page
     * @return the distinct media URLs found on the page
     */
    @Override
    public List<String> getURLsFromPage(Document page) {
        List<String> imageURLs = new ArrayList<String>();
        for (Element link : page.select("a")) {
            if (!link.hasAttr("href")) {
                continue;
            }
            String href = link.attr("href").trim();

            // Check all blacklist items
            String blacklistItem = findBlacklistedPiece(href);
            if (blacklistItem != null) {
                logger.debug("Skipping link that contains '" + blacklistItem + "': " + href);
                continue;
            }

            if (!generalChanSite && !isSelfHosted(href)) {
                //TODO also grab imgur/flickr albums (And all other supported rippers) Maybe add a setting?
                continue;
            }

            if (!IMAGE_URL_PATTERN.matcher(href).matches()) {
                continue;
            }

            // Make protocol-relative and host-relative links absolute.
            if (href.startsWith("//")) {
                href = "http:" + href;
            }
            if (href.startsWith("/")) {
                href = "http://" + this.url.getHost() + href;
            }

            // Don't download the same URL twice
            if (imageURLs.contains(href)) {
                logger.debug("Already attempted: " + href);
                continue;
            }
            imageURLs.add(href);
        }
        return imageURLs;
    }

    /**
     * Finds the blacklist entry, if any, that disqualifies a link.
     *
     * @param href the link target
     * @return the first entry of {@link #url_piece_blacklist} contained in
     *         {@code href}, or {@code null} if there is none
     */
    private static String findBlacklistedPiece(String href) {
        for (String piece : url_piece_blacklist) {
            if (href.contains(piece)) {
                return piece;
            }
        }
        return null;
    }

    /**
     * Tells whether a link points at one of the current site's CDN domains.
     *
     * @param href the link target
     * @return {@code true} if {@code href} contains any of {@code chanSite.cdnDomains}
     */
    private boolean isSelfHosted(String href) {
        for (String cdnDomain : chanSite.cdnDomains) {
            if (href.contains(cdnDomain)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Queues a single media URL for download, using the index-based filename prefix.
     *
     * @param url   the media URL
     * @param index the position of the URL in the list being downloaded
     */
    @Override
    public void downloadURL(URL url, int index) {
        addURLToDownload(url, getPrefix(index));
    }
}