Merge pull request #77 from leozqin/unretrievable

Detect unretrievable or banned sites and present a proper user agent
leozqin · Nov 21, 2024 · 104ff57 · 104ff57
2 parents 539ea80 + a975aea
commit 104ff57
Show file tree

Hide file tree

Showing 30 changed files with 479 additions and 272 deletions.
diff --git a/.github/workflows/integration_test.yml b/.github/workflows/integration_test.yml
@@ -5,12 +5,17 @@ on:
         storage_handler:
           required: true
           type: string
+        content_retrieval_handler:
+          required: true
+          type: string
+        llm_handler:
+          required: true
+          type: string
 
 env:
   IMAGE_NAME: precis
 #
 jobs:
-  # This pushes the image to GitHub Packages.
   integration_test:
     runs-on: ubuntu-latest
     permissions:
@@ -31,6 +36,8 @@ jobs:
       - name: test
         run: |
           source .venv/bin/activate
+          mv tests/integration/config/settings-${{ inputs.content_retrieval_handler }}-${{ inputs.llm_handler}}.yml tests/integration/config/settings.yml
+          precis load-settings
           precis load-feeds
           precis check-feeds
           make run-ci

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -5,15 +5,75 @@ on:
       - main
 
 jobs:
-  integration-test-tinydb:
+  integration-test-tinydb-playwright-null:
     uses: ./.github/workflows/integration_test.yml
     with:
       storage_handler: tinydb
-  integration-test-lmdb:
+      content_retrieval_handler: playwright
+      llm_handler: "null"
+  integration-test-lmdb-playwright-null:
     uses: ./.github/workflows/integration_test.yml
     with:
       storage_handler: lmdb
-  integration-test-hybrid:
+      content_retrieval_handler: playwright
+      llm_handler: "null"
+  integration-test-hybrid-playwright-null:
     uses: ./.github/workflows/integration_test.yml
     with:
       storage_handler: hybrid
+      content_retrieval_handler: playwright
+      llm_handler: "null"
+  integration-test-tinydb-requests-null:
+    uses: ./.github/workflows/integration_test.yml
+    with:
+      storage_handler: tinydb
+      content_retrieval_handler: requests
+      llm_handler: "null"
+  integration-test-lmdb-requests-null:
+    uses: ./.github/workflows/integration_test.yml
+    with:
+      storage_handler: lmdb
+      content_retrieval_handler: requests
+      llm_handler: "null"
+  integration-test-hybrid-requests-null:
+    uses: ./.github/workflows/integration_test.yml
+    with:
+      storage_handler: hybrid
+      content_retrieval_handler: requests
+      llm_handler: "null"
+  integration-test-tinydb-playwright-dummy:
+    uses: ./.github/workflows/integration_test.yml
+    with:
+      storage_handler: tinydb
+      content_retrieval_handler: playwright
+      llm_handler: dummy
+  integration-test-lmdb-playwright-dummy:
+    uses: ./.github/workflows/integration_test.yml
+    with:
+      storage_handler: lmdb
+      content_retrieval_handler: playwright
+      llm_handler: dummy
+  integration-test-hybrid-playwright-dummy:
+    uses: ./.github/workflows/integration_test.yml
+    with:
+      storage_handler: hybrid
+      content_retrieval_handler: playwright
+      llm_handler: dummy
+  integration-test-tinydb-requests-dummy:
+    uses: ./.github/workflows/integration_test.yml
+    with:
+      storage_handler: tinydb
+      content_retrieval_handler: requests
+      llm_handler: dummy
+  integration-test-lmdb-requests-dummy:
+    uses: ./.github/workflows/integration_test.yml
+    with:
+      storage_handler: lmdb
+      content_retrieval_handler: requests
+      llm_handler: dummy
+  integration-test-hybrid-requests-dummy:
+    uses: ./.github/workflows/integration_test.yml
+    with:
+      storage_handler: hybrid
+      content_retrieval_handler: requests
+      llm_handler: dummy
diff --git a/README.md b/README.md
@@ -107,7 +107,16 @@ Commands:
   restore        Restore a json-format backup of the Precis state
 ```
 
-## UI Tour
+# Content Ownership
+Precis is meant for use as a personal RSS reader. The content retrieval methodology is basic at best, and I do not have much interest in refining it. So, I think it is unlikely that Precis will become a nuisance content scraper.
+
+Furthermore, we pass a unique user agent of the form `Precis/{version}`, so if you, as a content owner, feel that Precis is acting disruptively, feel free to block that user agent. Doing so will not have a destructive impact on users; Precis should detect the rejection and display a link to your website instead of its content.
+
+Finally:
+1. If you'd like to opt out of content retrieval by Precis, [this file](https://github.com/leozqin/precis/blob/main/app/constants.py) contains a set of globs that should return as banned. Feel free to send a PR with your site, but I reserve final say as to whether your request will be accepted. Expect a more understanding and lenient decision-making process for small, independent media/publishers.
+2. If you're of the opinion that Precis should respect `robots.txt`, please thumbs up [this issue](https://github.com/leozqin/precis/issues/79).
+
+# UI Tour
 After initial onboarding, you'll be brought to the feeds page.
 ![The feeds page](app/assets/feeds.png)
 

diff --git a/app/app.py b/app/app.py
@@ -12,11 +12,11 @@
 from fastapi_utils.tasks import repeat_every
 
 from app.backend import PrecisBackend
-from app.context import GlobalSettings, Themes
+from app.impls import load_storage_config
 from app.logging import HealthCheckFilter
 from app.models import Feed, HealthCheck
 from app.rss import PrecisRSS
-from app.storage.engine import load_storage_config
+from app.settings import GlobalSettings, Themes
 
 JSON = "application/json"
 
@@ -332,6 +332,7 @@ async def update_feed(
     notify: Annotated[bool, Form()] = False,
     preview_only: Annotated[bool, Form()] = False,
     refresh_enabled: Annotated[bool, Form()] = False,
+    use_script: Annotated[bool, Form()] = False,
 ):
     try:
         feed = Feed(
@@ -342,6 +343,7 @@ async def update_feed(
             notify_destination=notify_destination,
             preview_only=preview_only,
             refresh_enabled=refresh_enabled,
+            use_script=use_script,
         )
 
         await bk.update_feed(feed=feed)

diff --git a/app/backend.py b/app/backend.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from importlib.metadata import version
 from json import dumps, loads
 from logging import INFO, getLogger
@@ -9,15 +11,15 @@
 from textstat import textstat as txt
 
 from app.constants import GITHUB_LINK, IS_DOCKER
-from app.context import GlobalSettings, StorageHandler
 from app.errors import InvalidFeedException
 from app.models import EntryContent, Feed, FeedEntry, HealthCheck
+from app.settings import GlobalSettings
 
 logger = getLogger("uvicorn.error")
 
 
 class PrecisBackend:
-    def __init__(self, db: Type[StorageHandler]):
+    def __init__(self, db):
         self.db = db
 
     @staticmethod
@@ -122,16 +124,18 @@ async def get_entry_content(self, feed_entry_id, redrive: bool = False):
             content: EntryContent = await self.db.get_entry_content(
                 entry=entry, redrive=redrive
             )
-            word_count = txt.lexicon_count(content.content)
+            logger.debug(f"Received EntryContent: {content}")
+            txt_content = content.content if content.content else ""
+            word_count = txt.lexicon_count(txt_content)
             return {
                 **base,
+                "unretrievable": content.unretrievable,
+                "banned": content.banned,
                 "preview": None,
                 "content": content.content,
                 "summary": content.summary,
                 "word_count": word_count,
-                "reading_level": int(
-                    txt.text_standard(content.content, float_output=True)
-                ),
+                "reading_level": int(txt.text_standard(txt_content, float_output=True)),
                 "reading_time": int(word_count / settings.reading_speed),
             }
 
@@ -215,18 +219,18 @@ async def delete_feed(self, feed_id: str):
 
     @staticmethod
     async def list_content_handler_choices():
-        from app.content import content_retrieval_handlers
+        from app.impls import content_retrieval_handlers
 
         return list(content_retrieval_handlers.keys())
 
     @staticmethod
     async def list_llm_handler_choices():
-        from app.llm import llm_handlers
+        from app.impls import llm_handlers
 
         return list(llm_handlers.keys())
 
     @staticmethod
     async def list_notification_handler_choices():
-        from app.notification import notification_handlers
+        from app.impls import notification_handlers
 
         return list(notification_handlers.keys())
diff --git a/app/constants.py b/app/constants.py
@@ -1,3 +1,4 @@
+from importlib.metadata import version
 from os import environ
 from pathlib import Path
 
@@ -6,3 +7,11 @@
 IS_DOCKER = bool(environ.get("IS_DOCKER", False))
 # override this if you feel it's important to point to your fork
 GITHUB_LINK = environ.get("GITHUB_LINK", "https://github.com/leozqin/precis")
+
+USER_AGENT = f"Precis/{version('precis')}"
+BANNED_GLOBS = [  # URL globs for sites that should be reported as banned
+    "*x.com/*",
+    "*twitter.com/*", "*reddit.com/*",  # two entries; a missing comma had fused them
+    "*youtube.com/*",
+    "*notion.site/*",
+]
diff --git a/app/content/__init__.py b/app/content/__init__.py
@@ -1,9 +0,0 @@
-from enum import Enum
-
-from app.content.playwright import PlaywrightContentRetriever
-from app.content.requests import RequestsContentRetriever
-
-content_retrieval_handlers = {
-    "requests": RequestsContentRetriever,
-    "playwright": PlaywrightContentRetriever,
-}

diff --git a/app/content/playwright.py b/app/content/playwright.py
@@ -1,31 +1,52 @@
+from __future__ import annotations
+
+from logging import getLogger
+
 from playwright.async_api import Playwright, Route, async_playwright
 
+from app.constants import USER_AGENT
 from app.handlers import ContentRetrievalHandler
 
+logger = getLogger("uvicorn.error")
+
 
 class PlaywrightContentRetriever(ContentRetrievalHandler):
     id = "playwright"
 
+    @staticmethod
+    async def _block_common_with_script(route: Route):
+        excluded_resource_types = ["stylesheet", "image", "font"]
+        if route.request.resource_type in excluded_resource_types:
+            await route.abort()
+        else:
+            await route.continue_()
+
     @staticmethod
     async def _block_common(route: Route):
-        excluded_resource_types = ["stylesheet", "script", "image", "font"]
+        excluded_resource_types = ["stylesheet", "image", "font", "script"]
         if route.request.resource_type in excluded_resource_types:
             await route.abort()
         else:
             await route.continue_()
 
     @staticmethod
-    async def _retrieve(url: str, playright: Playwright):
+    async def _retrieve(url: str, playright: Playwright, use_script: bool = False):
         browser = await playright.chromium.launch()
-        page = await browser.new_page()
+        page = await browser.new_page(user_agent=USER_AGENT)
+
+        retriever = (
+            PlaywrightContentRetriever._block_common_with_script
+            if use_script
+            else PlaywrightContentRetriever._block_common
+        )
 
-        await page.route("**/*", PlaywrightContentRetriever._block_common)
+        await page.route("**/*", retriever)
         await page.goto(url)
 
         await page.wait_for_load_state("domcontentloaded")
 
         return await page.content()
 
-    async def get_content(self, url: str) -> str:
+    async def get_html(self, url: str, use_script: bool = False) -> str:
         async with async_playwright() as pw:
-            return await self._retrieve(url=url, playright=pw)
+            return await self._retrieve(url=url, playright=pw, use_script=use_script)
diff --git a/app/content/requests.py b/app/content/requests.py
@@ -1,13 +1,21 @@
 import requests
 
+from app.constants import USER_AGENT
 from app.handlers import ContentRetrievalHandler
+from app.models import EntryContent, FeedEntry
 
 
 class RequestsContentRetriever(ContentRetrievalHandler):
     id = "requests"
+    headers = {"User-Agent": USER_AGENT}
 
-    async def get_content(self, url: str) -> str:
-
-        page = requests.get(url)
-
-        return page.text
+    # requests does not implement the use_script option so we'll just ignore it
+    async def get_html(self, url: str, use_script: bool = False) -> "str | None":
+        """Return the HTML at url, or None if the fetch fails or the body is empty."""
+        try:
+            page = requests.get(url, headers=self.headers)
+        except requests.RequestException:
+            # Only request-level failures mean "no content"; a bare except hid real bugs.
+            return None
+        # An empty body is reported the same way as a failed retrieval.
+        return page.text or None