From acc38e22fcf03f82552b06e5d7a2662190123f4b Mon Sep 17 00:00:00 2001 From: MetaPrime Date: Mon, 13 Jan 2025 14:01:49 -0800 Subject: [PATCH 1/3] Implement FapwizRipper --- .../ripme/ripper/rippers/FapwizRipper.java | 112 ++++++++++++++++++ .../tst/ripper/rippers/FapwizRipperTest.java | 27 +++++ 2 files changed, 139 insertions(+) create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/FapwizRipper.java create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/FapwizRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/FapwizRipper.java new file mode 100644 index 000000000..ad2013830 --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/FapwizRipper.java @@ -0,0 +1,112 @@ +package com.rarchives.ripme.ripper.rippers; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.jsoup.nodes.Document; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import com.rarchives.ripme.ripper.AbstractHTMLRipper; + +public class FapwizRipper extends AbstractHTMLRipper { + + private static final Logger logger = LogManager.getLogger(FapwizRipper.class); + + private static final Pattern CATEGORY_PATTERN = + Pattern.compile("https?://fapwiz.com/category/([a-zA-Z1-9_-]+)/?$"); + + private static final Pattern USER_PATTERN = + Pattern.compile("https?://fapwiz.com/([a-zA-Z1-9_-]+)/?$"); + + private static final Pattern POST_PATTERN = + Pattern.compile("https?://fapwiz.com/([a-zA-Z1-9_-]+)/([a-zA-Z1-9_-]+)/?$"); + + public FapwizRipper(URL url) throws IOException { + super(url); + } + + @Override + public String getHost() { + return "fapwiz"; + } + + @Override + public String getDomain() { 
+ return "fapwiz.com"; + } + + @Override + public String getGID(URL url) throws MalformedURLException { + Matcher m; + + m = CATEGORY_PATTERN.matcher(url.toExternalForm()); + if (m.matches()) { + return "category_" + m.group(1); + } + + m = USER_PATTERN.matcher(url.toExternalForm()); + if (m.matches()) { + return "user_" + m.group(1); + } + + m = POST_PATTERN.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1) + "_" + m.group(2); + } + + throw new MalformedURLException("Expected fapwiz URL format: " + + "fapwiz.com/NAME - got " + url + " instead"); + } + + @Override + public List getURLsFromPage(Document doc) { + List result = new ArrayList<>(); + + Matcher m; + + m = CATEGORY_PATTERN.matcher(url.toExternalForm()); + if (m.matches()) { + // structure of user page - does it work for category pages too? + doc.select(".post-items-holder img").forEach(e -> { + String imgSrc = e.attr("src"); + // Replace -thumbnail.jpg with .mp4 + String videoSrc = imgSrc.replace("-thumbnail.jpg", ".mp4"); + result.add(videoSrc); + }); + } + + m = USER_PATTERN.matcher(url.toExternalForm()); + if (m.matches()) { + // structure of user page + doc.select(".post-items-holder img").forEach(e -> { + String imgSrc = e.attr("src"); + // Replace -thumbnail.jpg with .mp4 + String videoSrc = imgSrc.replace("-thumbnail.jpg", ".mp4"); + result.add(videoSrc); + }); + } + + m = POST_PATTERN.matcher(url.toExternalForm()); + if (m.matches()) { + doc.select("video source").forEach(video -> { + result.add(video.attr("src")); + }); + } + + return result; + } + + @Override + public void downloadURL(URL url, int index) { + sleep(2000); + addURLToDownload(url, getPrefix(index)); + } +} diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java new file mode 100644 index 000000000..7b417df75 --- /dev/null +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java 
@@ -0,0 +1,27 @@ +package com.rarchives.ripme.tst.ripper.rippers; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; + +import com.rarchives.ripme.ripper.rippers.FapwizRipper; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +public class FapwizRipperTest extends RippersTest { + @Test + @Tag("flaky") + public void testRip() throws IOException, URISyntaxException { + FapwizRipper ripper = new FapwizRipper(new URI("https://Fapwiz.com/blowjob-bunny-puts-on-a-show/").toURL()); + testRipper(ripper); + } + + @Test + public void testGetGID() throws IOException, URISyntaxException { + URL url = new URI("https://Fapwiz.com/blowjob-bunny-puts-on-a-show/").toURL(); + FapwizRipper ripper = new FapwizRipper(url); + Assertions.assertEquals("blowjob-bunny-puts-on-a-show", ripper.getGID(url)); + } +} From 9b4675719000d6669dba6862fd6f9050f3e7b9c7 Mon Sep 17 00:00:00 2001 From: MetaPrime Date: Sun, 2 Feb 2025 22:53:48 -0800 Subject: [PATCH 2/3] Add FapwizRipperTest, fix bugs, add getNextPage --- .../ripme/ripper/rippers/FapwizRipper.java | 102 ++++++++---- .../tst/ripper/rippers/FapwizRipperTest.java | 148 +++++++++++++++++- 2 files changed, 212 insertions(+), 38 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/FapwizRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/FapwizRipper.java index ad2013830..472ff445c 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/FapwizRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/FapwizRipper.java @@ -8,26 +8,26 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.jsoup.nodes.Document; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.jsoup.nodes.Element; +import org.jsoup.nodes.Document; import org.jsoup.select.Elements; import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import 
com.rarchives.ripme.utils.Http; public class FapwizRipper extends AbstractHTMLRipper { private static final Logger logger = LogManager.getLogger(FapwizRipper.class); - private static final Pattern CATEGORY_PATTERN = - Pattern.compile("https?://fapwiz.com/category/([a-zA-Z1-9_-]+)/?$"); + private static final Pattern CATEGORY_PATTERN = Pattern.compile("https?://fapwiz.com/category/([a-zA-Z0-9_-]+)/?$"); - private static final Pattern USER_PATTERN = - Pattern.compile("https?://fapwiz.com/([a-zA-Z1-9_-]+)/?$"); + private static final Pattern USER_PATTERN = Pattern.compile("https?://fapwiz.com/([a-zA-Z0-9_-]+)/?$"); - private static final Pattern POST_PATTERN = - Pattern.compile("https?://fapwiz.com/([a-zA-Z1-9_-]+)/([a-zA-Z1-9_-]+)/?$"); + // Note that the last part of the pattern can contain unicode emoji which + // get encoded as %-encoded UTF-8 bytes in the URL, so we allow % characters. + private static final Pattern POST_PATTERN = Pattern + .compile("https?://fapwiz.com/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_%-]+)/?$"); public FapwizRipper(URL url) throws IOException { super(url); @@ -59,49 +59,91 @@ public String getGID(URL url) throws MalformedURLException { m = POST_PATTERN.matcher(url.toExternalForm()); if (m.matches()) { - return m.group(1) + "_" + m.group(2); + return "post_" + m.group(1) + "_" + m.group(2); } throw new MalformedURLException("Expected fapwiz URL format: " + - "fapwiz.com/NAME - got " + url + " instead"); + "fapwiz.com/USER or fapwiz.com/USER/POST or " + + "fapwiz.com/CATEGORY - got " + url + " instead"); + } + + void processUserOrCategoryPage(Document doc, List results) { + // The category page looks a lot like the structure of a user page, + // so processUserOrCategoryPage is written to be compatible with both. 
+ doc.select(".post-items-holder img").forEach(e -> { + String imgSrc = e.attr("src"); + + // Skip the user profile picture thumbnail insets + if (imgSrc.endsWith("-thumbnail-icon.jpg")) { + return; + } + + // Replace -thumbnail.jpg with .mp4 + String videoSrc = imgSrc.replace("-thumbnail.jpg", ".mp4"); + results.add(videoSrc); + }); + } + + void processCategoryPage(Document doc, List results) { + logger.info("Processing category page: " + url); + processUserOrCategoryPage(doc, results); + } + + void processUserPage(Document doc, List results) { + logger.info("Processing user page: " + url); + processUserOrCategoryPage(doc, results); + } + + void processPostPage(Document doc, List results) { + logger.info("Processing post page: " + url); + doc.select("video source").forEach(video -> { + results.add(video.attr("src")); + }); } @Override public List getURLsFromPage(Document doc) { - List result = new ArrayList<>(); - + List results = new ArrayList<>(); Matcher m; m = CATEGORY_PATTERN.matcher(url.toExternalForm()); if (m.matches()) { - // structure of user page - does it work for category pages too? 
- doc.select(".post-items-holder img").forEach(e -> { - String imgSrc = e.attr("src"); - // Replace -thumbnail.jpg with .mp4 - String videoSrc = imgSrc.replace("-thumbnail.jpg", ".mp4"); - result.add(videoSrc); - }); + processCategoryPage(doc, results); } m = USER_PATTERN.matcher(url.toExternalForm()); if (m.matches()) { - // structure of user page - doc.select(".post-items-holder img").forEach(e -> { - String imgSrc = e.attr("src"); - // Replace -thumbnail.jpg with .mp4 - String videoSrc = imgSrc.replace("-thumbnail.jpg", ".mp4"); - result.add(videoSrc); - }); + processUserPage(doc, results); } m = POST_PATTERN.matcher(url.toExternalForm()); if (m.matches()) { - doc.select("video source").forEach(video -> { - result.add(video.attr("src")); - }); + processPostPage(doc, results); } - return result; + return results; + } + + private Document getDocument(String url, int retries) throws IOException { + return Http.url(url).userAgent(USER_AGENT).retries(retries).get(); + } + + private Document getDocument(String url) throws IOException { + return getDocument(url, 1); + } + + @Override + public Document getNextPage(Document page) throws IOException { + logger.info("Getting next page for url: " + url); + Elements next = page.select("a.next"); + if (!next.isEmpty()) { + String href = next.attr("href"); + logger.info("Found next page: " + href); + return getDocument(href); + } else { + logger.info("No more pages"); + throw new IOException("No more pages."); + } } @Override diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java index 7b417df75..6e26d4b17 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java @@ -5,23 +5,155 @@ import java.net.URISyntaxException; import java.net.URL; -import com.rarchives.ripme.ripper.rippers.FapwizRipper; +import 
org.jsoup.nodes.Document; import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; +import com.rarchives.ripme.ripper.AbstractRipper; +import com.rarchives.ripme.ripper.rippers.FapwizRipper; +import com.rarchives.ripme.utils.Http; +import com.rarchives.ripme.utils.Utils; + public class FapwizRipperTest extends RippersTest { @Test - @Tag("flaky") - public void testRip() throws IOException, URISyntaxException { - FapwizRipper ripper = new FapwizRipper(new URI("https://Fapwiz.com/blowjob-bunny-puts-on-a-show/").toURL()); + public void testGetNextPage_NoNextPage() throws IOException, URISyntaxException { + URL url = new URI("https://fapwiz.com/alison-esha/").toURL(); + FapwizRipper ripper = new FapwizRipper(url); + + Document firstPage = Http.url(url).userAgent(AbstractRipper.USER_AGENT).retries(1).get(); + try { + ripper.getNextPage(firstPage); + // If we don't throw, we failed the test because there *was* a next + // page even though there shouldn't be. + Assertions.fail(); + } catch (IOException exception) { + Assertions.assertTrue(true); + } + } + + @Test + public void testGetNextPage_HasNextPage() throws IOException, URISyntaxException { + URL url = new URI("https://fapwiz.com/miaipanema/").toURL(); + FapwizRipper ripper = new FapwizRipper(url); + + Document firstPage = Http.url(url).userAgent(AbstractRipper.USER_AGENT).retries(1).get(); + try { + Document doc = ripper.getNextPage(firstPage); + Assertions.assertNotNull(doc); + } catch (IOException exception) { + // We should have found a next page but didn't. 
+ Assertions.fail(); + } + } + + @Test + public void testRipPost() throws IOException, URISyntaxException { + URL url = new URI("https://fapwiz.com/petiteasiantravels/riding-at-9-months-pregnant/").toURL(); + FapwizRipper ripper = new FapwizRipper(url); + testRipper(ripper); + } + + @Test + public void testRipPostWithNumbersInUsername1() throws IOException, URISyntaxException { + URL url = new URI("https://fapwiz.com/desperate_bug_7776/lets-be-friends-that-secretly-fuck-thanks/").toURL(); + FapwizRipper ripper = new FapwizRipper(url); + testRipper(ripper); + } + + @Test + public void testRipPostWithEmojiInShortUrl() throws IOException, URISyntaxException { + URL url = new URI("https://fapwiz.com/miaipanema/my-grip-needs-a-name-%f0%9f%a4%ad%f0%9f%91%87%f0%9f%8f%bc/") + .toURL(); + FapwizRipper ripper = new FapwizRipper(url); + testRipper(ripper); + } + + @Test + public void testRipPostWithEmojiInLongUrlAtEnd() throws IOException, URISyntaxException { + URL url = new URI( + "https://fapwiz.com/bimeat1998/just-imagine-youre-out-with-your-girl-and-your-buddies-and-then-she-makes-this-move-%f0%9f%98%8d/") + .toURL(); + FapwizRipper ripper = new FapwizRipper(url); + testRipper(ripper); + } + + @Test + public void testRipPostWithEmojiInLongUrlInTheMiddle() throws IOException, URISyntaxException { + URL url = new URI( + "https://fapwiz.com/miaipanema/new-pov-couch-sex-with-perfect-cumshot-on-my-ass-%f0%9f%92%a6-you-know-where-to-get-it-%f0%9f%94%97%f0%9f%92%96/") + .toURL(); + FapwizRipper ripper = new FapwizRipper(url); testRipper(ripper); } + // TODO Test rip user + + // TODO Test rip category + + @Test + public void testPostGetGID1_Simple() throws IOException, URISyntaxException { + URL url = new URI("https://fapwiz.com/petiteasiantravels/riding-at-9-months-pregnant/").toURL(); + FapwizRipper ripper = new FapwizRipper(url); + Assertions.assertEquals("post_petiteasiantravels_riding-at-9-months-pregnant", ripper.getGID(url)); + } + + // Test Post pages GetGID + + @Test + 
public void testPostGetGID2_WithEmojiInLongUrlInTheMiddle() throws IOException, URISyntaxException { + URL url = new URI( + "https://fapwiz.com/miaipanema/new-pov-couch-sex-with-perfect-cumshot-on-my-ass-%f0%9f%92%a6-you-know-where-to-get-it-%f0%9f%94%97%f0%9f%92%96/") + .toURL(); + FapwizRipper ripper = new FapwizRipper(url); + + // In this case the filesystem safe version of the GID is + // "post_miaipanema_new-pov-couch-sex-with-perfect-cumshot-on-my-ass-f09f92a6-you-know-where-to-" + // but the GID doesn't truncate and doesn't remove non-filesystem-safe + // characters. + String gid = ripper.getGID(url); + Assertions.assertEquals( + "post_miaipanema_new-pov-couch-sex-with-perfect-cumshot-on-my-ass-%f0%9f%92%a6-you-know-where-to-get-it-%f0%9f%94%97%f0%9f%92%96", + gid); + + // Test directory name on disk (filesystem safe sanitized as the ripper will + // do). + String directoryName = Utils.filesystemSafe(ripper.getHost() + "_" + gid); + Assertions.assertEquals( + "fapwiz_post_miaipanema_new-pov-couch-sex-with-perfect-cumshot-on-my-ass-f09f92a6-you-know-where-to-", + directoryName); + } + + // Test User pages GetGID + + @Test + public void testUserGetGID1_Simple() throws IOException, URISyntaxException { + // Test a "simple" username that is all letters. + URL url = new URI("https://fapwiz.com/petiteasiantravels/").toURL(); + FapwizRipper ripper = new FapwizRipper(url); + Assertions.assertEquals("user_petiteasiantravels", ripper.getGID(url)); + } + + @Test + public void testUserGetGID2_Numbers() throws IOException, URISyntaxException { + // Test a more complex username that contains numbers. + URL url = new URI("https://fapwiz.com/bimeat1998/").toURL(); + FapwizRipper ripper = new FapwizRipper(url); + Assertions.assertEquals("user_bimeat1998", ripper.getGID(url)); + } + + @Test + public void testUserGetGID3_HyphensAndNumbers() throws IOException, URISyntaxException { + // Test a more complex username that contains hyphens and numbers. 
+ URL url = new URI("https://fapwiz.com/used-airport-4076/").toURL(); + FapwizRipper ripper = new FapwizRipper(url); + Assertions.assertEquals("user_used-airport-4076", ripper.getGID(url)); + } + @Test - public void testGetGID() throws IOException, URISyntaxException { - URL url = new URI("https://Fapwiz.com/blowjob-bunny-puts-on-a-show/").toURL(); + public void testUserGetGID4_Underscores() throws IOException, URISyntaxException { + // Test a more complex username that contains underscores. + URL url = new URI("https://fapwiz.com/desperate_bug_7776/").toURL(); FapwizRipper ripper = new FapwizRipper(url); - Assertions.assertEquals("blowjob-bunny-puts-on-a-show", ripper.getGID(url)); + Assertions.assertEquals("user_desperate_bug_7776", ripper.getGID(url)); } } From 376248c076fc2dd6f84ffed868fc3a77afc2e17b Mon Sep 17 00:00:00 2001 From: MetaPrime Date: Mon, 3 Feb 2025 02:03:48 -0800 Subject: [PATCH 3/3] Tag flaky FapwizRipperTest tests --- .../rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java index 6e26d4b17..5228e7cdb 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/FapwizRipperTest.java @@ -7,6 +7,7 @@ import org.jsoup.nodes.Document; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import com.rarchives.ripme.ripper.AbstractRipper; @@ -16,6 +17,7 @@ public class FapwizRipperTest extends RippersTest { @Test + @Tag("flaky") // It seems like fetching the document within the test can be flaky. 
public void testGetNextPage_NoNextPage() throws IOException, URISyntaxException { URL url = new URI("https://fapwiz.com/alison-esha/").toURL(); FapwizRipper ripper = new FapwizRipper(url); @@ -32,6 +34,7 @@ public void testGetNextPage_NoNextPage() throws IOException, URISyntaxException } @Test + @Tag("flaky") // It seems like fetching the document within the test can be flaky. public void testGetNextPage_HasNextPage() throws IOException, URISyntaxException { URL url = new URI("https://fapwiz.com/miaipanema/").toURL(); FapwizRipper ripper = new FapwizRipper(url);