VSCO Ripper Start

Just started the VSCO ripper by emulating another ripper and the instructions
from the wiki. Not yet functional, this commit is just to back stuff up.
This commit is contained in:
Kevin Jiang 2018-01-11 01:18:47 -05:00
parent 3d7cbd18ca
commit 9dbd566340

View File

@ -0,0 +1,116 @@
package com.rarchives.ripme.ripper.rippers;
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.utils.Http;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
* For ripping VSCO members' pages.
*/
public class VscoRipper extends AbstractHTMLRipper{
private static final String DOMAIN = "vsco.co",
HOST = "vsco";
public VscoRipper(URL url) throws IOException{
super(url);
}
@Override
public boolean canRip(URL url) {
if (!url.getHost().endsWith(DOMAIN)) {
return false;
}
// Ignores personalized things (e.g. login, feed) and store page
// Allows links to user profiles and links to images.
//TODO: Add support for journals and collections.
String u = url.toExternalForm();
return !u.contains("/store") ||
!u.contains("/feed") ||
!u.contains("/login") ||
!u.contains("/journal") ||
!u.contains("/collection")||
u.contains("images") ||
u.contains("media");
}
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
//no sanitization needed.
return url;
}
@Override
public void rip() throws IOException {
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
}
@Override
public String getHost() {
return HOST;
}
@Override
public String getGID(URL url) throws MalformedURLException {
Pattern p = Pattern.compile("^https?://vsco\\.co/([a-zA-Z0-9]+)/media/([a-zA-Z0-9]+)");
Matcher m = p.matcher(url.toExternalForm());
if (!m.matches()){
throw new MalformedURLException("Expected " + DOMAIN + " URL format: " +
"vsco.co/username/media/postNumber - got " + url + " instead");
}
// Return the text contained between () in the regex
String user = m.group(1);
String imageNum = m.group(2);
return user + "/" + imageNum;
}
@Override
public String getDomain() {
return DOMAIN;
}
@Override
public Document getFirstPage() throws IOException {
return Http.url(url).get();
}
@Override
public Document getNextPage(Document doc) throws IOException {
return super.getNextPage(doc);
}
@Override
public List<String> getURLsFromPage(Document page) {
List<String> result = new ArrayList<>();
//get them from page
for(Element el : page.select("meta.og:image")){
//MUST replace im.vsco instead of just "im" because the URL to image could contain string "im"
result.add(
el.attr("content").replaceFirst("im.vsco", "images.vsco")); //sanitize
}
return result;
}
@Override
public void downloadURL(URL url, int index) {
addURLToDownload(url, getPrefix(index));
}
}