From 64a6f9257b08a659426c065268d36cf0a6b836fe Mon Sep 17 00:00:00 2001 From: Kevin Jiang Date: Sun, 14 Jan 2018 15:21:20 -0500 Subject: [PATCH] VSCO Ripper Single Image Currently can only support ripping a single Image. More study will be needed to rip member profiles and collections. (Maybe JSON?) --- .../ripme/ripper/rippers/VscoRipper.java | 140 +++++++++++++----- 1 file changed, 105 insertions(+), 35 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java index 47d14547..5665ee84 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java @@ -1,17 +1,23 @@ package com.rarchives.ripme.ripper.rippers; import com.rarchives.ripme.ripper.AbstractHTMLRipper; -import com.rarchives.ripme.ripper.AlbumRipper; import com.rarchives.ripme.utils.Http; + import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.jsoup.Jsoup; + import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; /** * For ripping VSCO members' pages. @@ -32,15 +38,15 @@ public class VscoRipper extends AbstractHTMLRipper{ } // Ignores personalized things (e.g. login, feed) and store page // Allows links to user profiles and links to images. - //TODO: Add support for journals and collections. + //@TODO: Add support for journals and collections. 
String u = url.toExternalForm(); - return !u.contains("/store") || - !u.contains("/feed") || - !u.contains("/login") || - !u.contains("/journal") || + return !u.contains("/store/") || + !u.contains("/feed/") || + !u.contains("/login/") || + !u.contains("/journal/") || !u.contains("/collection")|| - u.contains("images") || - u.contains("media"); + !u.contains("/images/") || + u.contains("/media/"); } @@ -50,11 +56,81 @@ public class VscoRipper extends AbstractHTMLRipper{ return url; } + /** + * Recursion FTW + * @param page + * @return + */ @Override - public void rip() throws IOException { - throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. + public List getURLsFromPage(Document page){ + List toRip = new ArrayList<>(); + //If user wanted to rip single image + if(url.toString().contains("/media/")){ + try { + toRip.add(vscoImageToURL(url.toExternalForm())); + } catch (IOException ex) { + logger.debug("Failed to convert " + url.toString() + " to external form."); + } + + }else{//want to rip a member profile + /* + String baseURL = "https://vsco.co"; + + + //Find all the relative links, adds Base URL, then adds them to an ArrayList + List relativeLinks = new ArrayList<>(); + Elements links = page.getElementsByTag("a"); + + + for(Element link : links){ + System.out.println(link.toString()); + //if link includes "/media/", add it to the list + if(link.attr("href").contains("/media")){ + try { + String relativeURL = vscoImageToURL(link.attr("href")); + toRip.add(baseURL + relativeURL); + } catch (IOException ex) { + logger.debug("Could not add \"" + link.toString() + "\" to list for ripping."); + } + } + } + */ + logger.debug("Sorry, RipMe currently only supports ripping single images."); + + + } + + return toRip; } + private String vscoImageToURL(String url) throws IOException{ + Document page = Jsoup.connect(url).userAgent(USER_AGENT) + .get(); + //create Elements filled only with Elements 
with the "meta" tag. + Elements metaTags = page.getElementsByTag("meta"); + String result = ""; + + for(Element metaTag : metaTags){ + //find URL inside meta-tag with property of "og:image" + if(metaTag.attr("property").equals("og:image")){ + String givenURL = metaTag.attr("content"); + givenURL = givenURL.replaceAll("\\?h=[0-9]+", "");//replace the "?h=xxx" tag at the end of the URL (where each x is a number) + + result = givenURL; + logger.debug("Found image URL: " + givenURL); + break;//immediately stop after getting URL (there should only be 1 image to be downloaded) + } + } + + //Means website changed, things need to be fixed. + if(result.isEmpty()){ + logger.error("Could not find image URL at: " + url); + } + + return result; + + } + @Override public String getHost() { return HOST; @@ -62,19 +138,29 @@ public class VscoRipper extends AbstractHTMLRipper{ @Override public String getGID(URL url) throws MalformedURLException { + + //Single Image Pattern p = Pattern.compile("^https?://vsco\\.co/([a-zA-Z0-9]+)/media/([a-zA-Z0-9]+)"); Matcher m = p.matcher(url.toExternalForm()); - if (!m.matches()){ - throw new MalformedURLException("Expected " + DOMAIN + " URL format: " + - "vsco.co/username/media/postNumber - got " + url + " instead"); - + + if (m.matches()){ + // Return the text contained between () in the regex + String user = m.group(1); + String imageNum = m.group(2).substring(0, 5);//first 5 characters should be enough to make each rip unique + return user + "/" + imageNum; } - // Return the text contained between () in the regex - String user = m.group(1); - String imageNum = m.group(2); - return user + "/" + imageNum; + //Member profile (Usernames should all be different, so this should work.)
+ p = Pattern.compile("^https?://vsco.co/([a-zA-Z0-9]+)/images/[0-9]+"); + m = p.matcher(url.toExternalForm()); + if(m.matches()){ + String user = m.group(1); + return user; + } + + throw new MalformedURLException("Expected a URL to a single image or to a member profile, got " + url + " instead"); + } @Override @@ -91,23 +177,7 @@ public class VscoRipper extends AbstractHTMLRipper{ public Document getNextPage(Document doc) throws IOException { return super.getNextPage(doc); } - @Override - public List getURLsFromPage(Document page) { - List result = new ArrayList<>(); - - //get them from page - for(Element el : page.select("meta.og:image")){ - //MUST replace im.vsco instead of just "im" because the URL to image could contain string "im" - result.add( - el.attr("content").replaceFirst("im.vsco", "images.vsco")); //sanitize - - } - - - - return result; - } - + @Override public void downloadURL(URL url, int index) { addURLToDownload(url, getPrefix(index));