From 9dbd566340aa7cfa29ab9173c664eedb88bc6a57 Mon Sep 17 00:00:00 2001
From: Kevin Jiang
Date: Thu, 11 Jan 2018 01:18:47 -0500
Subject: [PATCH 1/3] VSCO Ripper Start

Just started the VSCO ripper by emulating another ripper and following
the instructions from the wiki. Not yet functional; this commit is just
to back stuff up.
---
 .../ripme/ripper/rippers/VscoRipper.java      | 116 ++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java
new file mode 100644
index 00000000..47d14547
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java
@@ -0,0 +1,116 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.ripper.AlbumRipper;
+import com.rarchives.ripme.utils.Http;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+/**
+ * For ripping VSCO members' pages.
+ */
+public class VscoRipper extends AbstractHTMLRipper{
+
+    private static final String DOMAIN = "vsco.co",
+                                HOST = "vsco";
+
+    public VscoRipper(URL url) throws IOException{
+        super(url);
+    }
+
+    @Override
+    public boolean canRip(URL url) {
+        if (!url.getHost().endsWith(DOMAIN)) {
+            return false;
+        }
+        // Ignores personalized things (e.g. login, feed) and store page
+        // Allows links to user profiles and links to images.
+        //TODO: Add support for journals and collections.
+        String u = url.toExternalForm();
+        return !u.contains("/store") ||
+               !u.contains("/feed") ||
+               !u.contains("/login") ||
+               !u.contains("/journal") ||
+               !u.contains("/collection")||
+               u.contains("images") ||
+               u.contains("media");
+
+    }
+
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        //no sanitization needed.
+        return url;
+    }
+
+    @Override
+    public void rip() throws IOException {
+        throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+    }
+
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        Pattern p = Pattern.compile("^https?://vsco\\.co/([a-zA-Z0-9]+)/media/([a-zA-Z0-9]+)");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (!m.matches()){
+            throw new MalformedURLException("Expected " + DOMAIN + " URL format: " +
+                    "vsco.co/username/media/postNumber - got " + url + " instead");
+
+        }
+        // Return the text contained between () in the regex
+        String user = m.group(1);
+        String imageNum = m.group(2);
+
+        return user + "/" + imageNum;
+
+    }
+
+    @Override
+    public String getDomain() {
+        return DOMAIN;
+    }
+
+    @Override
+    public Document getFirstPage() throws IOException {
+        return Http.url(url).get();
+    }
+
+    @Override
+    public Document getNextPage(Document doc) throws IOException {
+        return super.getNextPage(doc);
+    }
+    @Override
+    public List<String> getURLsFromPage(Document page) {
+        List<String> result = new ArrayList<>();
+
+        //get them from page
+        for(Element el : page.select("meta.og:image")){
+            //MUST replace im.vsco instead of just "im" because the URL to image could contain string "im"
+            result.add(
+                    el.attr("content").replaceFirst("im.vsco", "images.vsco")); //sanitize
+
+        }
+
+
+
+        return result;
+    }
+
+    @Override
+    public void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index));
+    }
+
+}

From 64a6f9257b08a659426c065268d36cf0a6b836fe Mon Sep 17 00:00:00 2001
From: Kevin Jiang
Date: Sun, 14 Jan 2018 15:21:20 -0500
Subject: [PATCH 2/3] VSCO Ripper Single Image

Currently can only support ripping a single image. More study will be
needed to rip member profiles and collections. (Maybe JSON?)
---
 .../ripme/ripper/rippers/VscoRipper.java      | 131 +++++++++++++-----
 1 file changed, 96 insertions(+), 35 deletions(-)

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java
index 47d14547..5665ee84 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/VscoRipper.java
@@ -1,17 +1,23 @@
 package com.rarchives.ripme.ripper.rippers;
 
 import com.rarchives.ripme.ripper.AbstractHTMLRipper;
-import com.rarchives.ripme.ripper.AlbumRipper;
 import com.rarchives.ripme.utils.Http;
+
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import org.jsoup.Jsoup;
+
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 
 /**
  * For ripping VSCO members' pages.
@@ -32,15 +38,15 @@ public class VscoRipper extends AbstractHTMLRipper{
         }
         // Ignores personalized things (e.g. login, feed) and store page
         // Allows links to user profiles and links to images.
-        //TODO: Add support for journals and collections.
+        //@TODO: Add support for journals and collections.
         String u = url.toExternalForm();
-        return !u.contains("/store") ||
-               !u.contains("/feed") ||
-               !u.contains("/login") ||
-               !u.contains("/journal") ||
-               !u.contains("/collection")||
-               u.contains("images") ||
-               u.contains("media");
+        boolean unsupported = u.contains("/store/") ||
+                              u.contains("/feed/") ||
+                              u.contains("/login/") ||
+                              u.contains("/journal/") ||
+                              u.contains("/collection");
+        return !unsupported &&
+               (u.contains("/images/") || u.contains("/media/"));
 
     }
 
@@ -50,11 +56,71 @@ public class VscoRipper extends AbstractHTMLRipper{
         return url;
     }
 
+    /**
+     * Collects the image URLs to download for the URL being ripped.
+     * Only single-image ("/media/") URLs are supported so far.
+     *
+     * @param page the first page of the rip (not inspected yet)
+     * @return the direct image URLs; empty if nothing could be resolved
+     */
     @Override
-    public void rip() throws IOException {
-        throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
+    public List<String> getURLsFromPage(Document page){
+        List<String> toRip = new ArrayList<>();
+        //If user wanted to rip single image
+        if(url.toString().contains("/media/")){
+            try {
+                String imageURL = vscoImageToURL(url.toExternalForm());
+                //An empty string means no image was found; never queue it.
+                if(!imageURL.isEmpty()){
+                    toRip.add(imageURL);
+                }
+            } catch (IOException ex) {
+                logger.error("Failed to fetch image page " + url.toExternalForm(), ex);
+            }
+        }else{//want to rip a member profile
+            //@TODO: Collect the "/media/" links of a member profile.
+            logger.debug("Sorry, RipMe currently only supports ripping single images.");
+        }
+
+        return toRip;
     }
 
+    /**
+     * Looks up the direct image URL advertised by a single-image page.
+     *
+     * @param url address of a VSCO "/media/" page
+     * @return the "og:image" URL without its "?h=..." suffix, or an
+     *         empty string if the page has no such meta tag
+     * @throws IOException if the page cannot be fetched
+     */
+    private String vscoImageToURL(String url) throws IOException{
+        Document page = Jsoup.connect(url).userAgent(USER_AGENT)
+                             .get();
+        //create Elements filled only with Elements with the "meta" tag.
+        Elements metaTags = page.getElementsByTag("meta");
+        String result = "";
+
+        for(Element metaTag : metaTags){
+            //find URL inside meta-tag with property of "og:image"
+            if(metaTag.attr("property").equals("og:image")){
+                String givenURL = metaTag.attr("content");
+                givenURL = givenURL.replaceAll("\\?h=[0-9]+", "");//replace the "?h=xxx" tag at the end of the URL (where each x is a number)
+
+                result = givenURL;
+                logger.debug("Found image URL: " + givenURL);
+                break;//immediately stop after getting URL (there should only be 1 image to be downloaded)
+            }
+        }
+
+        //Means website changed, things need to be fixed.
+        if(result.isEmpty()){
+            logger.error("Could not find image URL at: " + url);
+        }
+
+        return result;
+
+    }
+
     @Override
     public String getHost() {
         return HOST;
@@ -62,19 +128,30 @@ public class VscoRipper extends AbstractHTMLRipper{
 
     @Override
     public String getGID(URL url) throws MalformedURLException {
+
+        //Single Image
         Pattern p = Pattern.compile("^https?://vsco\\.co/([a-zA-Z0-9]+)/media/([a-zA-Z0-9]+)");
         Matcher m = p.matcher(url.toExternalForm());
-        if (!m.matches()){
-            throw new MalformedURLException("Expected " + DOMAIN + " URL format: " +
-                    "vsco.co/username/media/postNumber - got " + url + " instead");
-
+
+        if (m.matches()){
+            // Return the text contained between () in the regex
+            String user = m.group(1);
+            String imageId = m.group(2);
+            //first 5 characters should be enough to make each rip unique (shorter ids are kept whole)
+            String imageNum = imageId.substring(0, Math.min(5, imageId.length()));
+            return user + "/" + imageNum;
         }
-        // Return the text contained between () in the regex
-        String user = m.group(1);
-        String imageNum = m.group(2);
 
-        return user + "/" + imageNum;
+        //Member profile (usernames should all be different, so this should work).
+        p = Pattern.compile("^https?://vsco\\.co/([a-zA-Z0-9]+)/images/[0-9]+");
+        m = p.matcher(url.toExternalForm());
+        if(m.matches()){
+            String user = m.group(1);
+            return user;
+        }
+
+        throw new MalformedURLException("Expected a URL to a single image or to a member profile, got " + url + " instead");
 
     }
 
     @Override
@@ -91,23 +168,7 @@ public class VscoRipper extends AbstractHTMLRipper{
     public Document getNextPage(Document doc) throws IOException {
         return super.getNextPage(doc);
     }
-    @Override
-    public List<String> getURLsFromPage(Document page) {
-        List<String> result = new ArrayList<>();
-
-        //get them from page
-        for(Element el : page.select("meta.og:image")){
-            //MUST replace im.vsco instead of just "im" because the URL to image could contain string "im"
-            result.add(
-                    el.attr("content").replaceFirst("im.vsco", "images.vsco")); //sanitize
-
-        }
-
-
-
-        return result;
-    }
 
     @Override
     public void downloadURL(URL url, int index) {
         addURLToDownload(url, getPrefix(index));

From c14f26feb1727742f84e264bf0e24a535b75138a Mon Sep 17 00:00:00 2001
From: Kevin Jiang
Date: Sun, 14 Jan 2018 15:21:26 -0500
Subject: [PATCH 3/3] VSCO Ripper Single Image Tests

Adds tests for ripping a single image and for GID extraction from
single-image and member-profile URLs. Ripping member profiles and
collections is not supported yet.
---
 .../tst/ripper/rippers/VscoRipperTest.java    | 39 +++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/VscoRipperTest.java

diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/VscoRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/VscoRipperTest.java
new file mode 100644
index 00000000..15a5eeef
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/VscoRipperTest.java
@@ -0,0 +1,39 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import com.rarchives.ripme.ripper.rippers.VscoRipper;
+import java.io.IOException;
+import java.net.URL;
+
+public class VscoRipperTest extends RippersTest {
+
+
+    public void testSingleImageRip() throws IOException{
+        VscoRipper ripper = new VscoRipper(new URL("https://vsco.co/minijello/media/571cd612542220261a123441"));
+        testRipper(ripper);
+    }
+
+    public void testGetGID() throws IOException{
+        URL url = new URL("https://vsco.co/minijello/media/571cd612542220261a123441");
+
+        VscoRipper ripper = new VscoRipper(url);
+
+        assertEquals("Failed to get GID", "minijello/571cd", ripper.getGID(url));
+    }
+
+    public void testGetGIDWithShortImageId() throws IOException{
+        URL url = new URL("https://vsco.co/minijello/media/abc");
+
+        VscoRipper ripper = new VscoRipper(url);
+
+        assertEquals("Failed to get GID", "minijello/abc", ripper.getGID(url));
+    }
+
+    public void testGetGIDForMemberProfile() throws IOException{
+        URL url = new URL("https://vsco.co/minijello/images/1");
+
+        VscoRipper ripper = new VscoRipper(url);
+
+        assertEquals("Failed to get GID", "minijello", ripper.getGID(url));
+    }
+
+}