Merge pull request #1 from kevin51jiang/VscoRip

Vsco rip
2018-01-14 15:23:51 -05:00 · 2018-01-14 15:23:51 -05:00 · aa39c01771
commit aa39c01771
parent 822c0d366d c14f26feb1
2 changed files with 209 additions and 0 deletions
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java
@ -0,0 +1,186 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.utils.Http;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.jsoup.Jsoup;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+/**
+ * For ripping VSCO members' pages.
+ */
+public class VscoRipper extends AbstractHTMLRipper{
+
+    private static final String DOMAIN = "vsco.co",
+                        HOST   = "vsco";
+    
+    public VscoRipper(URL url) throws IOException{
+        super(url);
+    }
+    
+    @Override
+    public boolean canRip(URL url) {
+        if (!url.getHost().endsWith(DOMAIN)) {
+            return false;
+        }
+        // Ignores personalized things (e.g. login, feed) and store page
+        // Allows links to user profiles and links to images.
+        //@TODO: Add support for journals and collections.
+        String u = url.toExternalForm();
+        return !u.contains("/store/")    ||
+               !u.contains("/feed/")     ||
+               !u.contains("/login/")    ||
+               !u.contains("/journal/")   ||
+               !u.contains("/collection")||
+               !u.contains("/images/")    ||
+                u.contains("/media/");   
+        
+    }
+
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        //no sanitization needed.
+        return url;
+    }
+
+    /**
+     * Recursion FTW
+     * @param page
+     * @return 
+     */
+    @Override
+    public List<String> getURLsFromPage(Document page){
+        List<String> toRip = new ArrayList<>();
+        //If user wanted to rip single image
+        if(url.toString().contains("/media/")){
+            try {
+                toRip.add(vscoImageToURL(url.toExternalForm()));
+            } catch (IOException ex) {
+                logger.debug("Failed to convert " + url.toString() + " to external form.");
+            }
+            
+        }else{//want to rip a member profile
+            /*
+            String baseURL = "https://vsco.co";
+
+
+            //Find all the relative links, adds Base URL, then adds them to an ArrayList
+            List<URL> relativeLinks = new ArrayList<>();
+            Elements links = page.getElementsByTag("a");
+
+            
+            for(Element link : links){
+                System.out.println(link.toString());
+                //if link includes "/media/", add it to the list
+                if(link.attr("href").contains("/media")){
+                    try {
+                        String relativeURL = vscoImageToURL(link.attr("href"));
+                        toRip.add(baseURL + relativeURL);
+                    } catch (IOException ex) {
+                        logger.debug("Could not add \"" + link.toString() + "\" to list for ripping.");
+                    }
+                }
+            }
+            */
+            logger.debug("Sorry, RipMe currently only supports ripping single images.");
+            
+            
+        }
+
+        return toRip;
+    }
+
+    private String vscoImageToURL(String url) throws IOException{
+        Document page = Jsoup.connect(url).userAgent(USER_AGENT)
+                                          .get();
+        //create Elements filled only with Elements with the "meta" tag.
+        Elements metaTags = page.getElementsByTag("meta");
+        String result = "";
+
+        for(Element metaTag : metaTags){
+            //find URL inside meta-tag with property of "og:image"
+            if(metaTag.attr("property").equals("og:image")){
+                String givenURL = metaTag.attr("content");
+                givenURL = givenURL.replaceAll("\\?h=[0-9]+", "");//replace the "?h=xxx" tag at the end of the URL (where each x is a number)
+                
+                result = givenURL;
+                logger.debug("Found image URL: " + givenURL);
+                break;//immediatly stop after getting URL (there should only be 1 image to be downloaded)
+            }
+        }
+        
+        //Means website changed, things need to be fixed.
+        if(result.isEmpty()){
+            logger.error("Could not find image URL at: " + url);
+        }
+        
+        return result;
+        
+    }
+    
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        
+        //Single Image
+        Pattern p = Pattern.compile("^https?://vsco\\.co/([a-zA-Z0-9]+)/media/([a-zA-Z0-9]+)");
+        Matcher m = p.matcher(url.toExternalForm());
+        
+        if (m.matches()){
+            // Return the text contained between () in the regex
+            String user = m.group(1);
+            String imageNum = m.group(2).substring(0, 5);//first 5 characters should be enough to make each rip unique
+            return user + "/" + imageNum;
+        }
+        
+        //Member profile (Usernames should all be different, so this should work.
+        p = Pattern.compile("^https?://vsco.co/([a-zA-Z0-9]+)/images/[0-9]+");
+        m = p.matcher(url.toExternalForm());
+        
+        if(m.matches()){
+            String user = m.group(1);
+            return user;
+        }
+        
+        throw new MalformedURLException("Expected a URL to a single image or to a member profile, got " + url + " instead");
+            
+    }
+
+    @Override
+    public String getDomain() {
+        return DOMAIN;
+    }
+
+    @Override
+    public Document getFirstPage() throws IOException {
+        return Http.url(url).get();
+    }
+    
+    @Override
+    public Document getNextPage(Document doc) throws IOException {
+        return super.getNextPage(doc);
+    }
+    
+    @Override
+    public void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index));
+    }
+    
+}
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/VscoRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/VscoRipperTest.java
@ -0,0 +1,23 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import com.rarchives.ripme.ripper.rippers.VscoRipper;
+import java.io.IOException;
+import java.net.URL;
+
+/**
+ * Tests for {@link VscoRipper} against a single-image VSCO URL.
+ */
+public class VscoRipperTest extends RippersTest {
+
+    /**
+     * Builds a ripper for a single-image page and hands it to the
+     * {@code testRipper} helper.
+     */
+    public void testSingleImageRip() throws IOException {
+        URL singleImage = new URL("https://vsco.co/minijello/media/571cd612542220261a123441");
+        testRipper(new VscoRipper(singleImage));
+    }
+
+    /**
+     * The GID of a single-image URL is the user name, a slash, and the first
+     * five characters of the media id.
+     */
+    public void testGetGID() throws IOException {
+        URL singleImage = new URL("https://vsco.co/minijello/media/571cd612542220261a123441");
+        String actualGid = new VscoRipper(singleImage).getGID(singleImage);
+
+        assertEquals("Failed to get GID", "minijello/571cd", actualGid);
+    }
+
+}