diff --git a/.github/workflows/integration_test.yml b/.github/workflows/integration_test.yml index bbc1ca2..6af0938 100644 --- a/.github/workflows/integration_test.yml +++ b/.github/workflows/integration_test.yml @@ -5,12 +5,17 @@ on: storage_handler: required: true type: string + content_retrieval_handler: + required: true + type: string + llm_handler: + required: true + type: string env: IMAGE_NAME: precis # jobs: - # This pushes the image to GitHub Packages. integration_test: runs-on: ubuntu-latest permissions: @@ -31,6 +36,8 @@ jobs: - name: test run: | source .venv/bin/activate + mv tests/integration/config/settings-${{ inputs.content_retrieval_handler }}-${{ inputs.llm_handler}}.yml tests/integration/config/settings.yml + precis load-settings precis load-feeds precis check-feeds make run-ci diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 25b9c4b..4dc5f05 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,15 +5,75 @@ on: - main jobs: - integration-test-tinydb: + integration-test-tinydb-playwright-null: uses: ./.github/workflows/integration_test.yml with: storage_handler: tinydb - integration-test-lmdb: + content_retrieval_handler: playwright + llm_handler: "null" + integration-test-lmdb-playwright-null: uses: ./.github/workflows/integration_test.yml with: storage_handler: lmdb - integration-test-hybrid: + content_retrieval_handler: playwright + llm_handler: "null" + integration-test-hybrid-playwright-null: uses: ./.github/workflows/integration_test.yml with: storage_handler: hybrid + content_retrieval_handler: playwright + llm_handler: "null" + integration-test-tinydb-requests-null: + uses: ./.github/workflows/integration_test.yml + with: + storage_handler: tinydb + content_retrieval_handler: requests + llm_handler: "null" + integration-test-lmdb-requests-null: + uses: ./.github/workflows/integration_test.yml + with: + storage_handler: lmdb + content_retrieval_handler: requests + llm_handler: "null" + 
integration-test-hybrid-requests-null: + uses: ./.github/workflows/integration_test.yml + with: + storage_handler: hybrid + content_retrieval_handler: requests + llm_handler: "null" + integration-test-tinydb-playwright-dummy: + uses: ./.github/workflows/integration_test.yml + with: + storage_handler: tinydb + content_retrieval_handler: playwright + llm_handler: dummy + integration-test-lmdb-playwright-dummy: + uses: ./.github/workflows/integration_test.yml + with: + storage_handler: lmdb + content_retrieval_handler: playwright + llm_handler: dummy + integration-test-hybrid-playwright-dummy: + uses: ./.github/workflows/integration_test.yml + with: + storage_handler: hybrid + content_retrieval_handler: playwright + llm_handler: dummy + integration-test-tinydb-requests-dummy: + uses: ./.github/workflows/integration_test.yml + with: + storage_handler: tinydb + content_retrieval_handler: requests + llm_handler: dummy + integration-test-lmdb-requests-dummy: + uses: ./.github/workflows/integration_test.yml + with: + storage_handler: lmdb + content_retrieval_handler: requests + llm_handler: dummy + integration-test-hybrid-requests-dummy: + uses: ./.github/workflows/integration_test.yml + with: + storage_handler: hybrid + content_retrieval_handler: requests + llm_handler: dummy diff --git a/README.md b/README.md index 7691fce..52b3d10 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,16 @@ Commands: restore Restore a json-format backup of the Precis state ``` -## UI Tour +# Content Ownership +Precis is meant for use as a personal RSS reader. The content retrieval methodology is basic at best, and I do not have much interest in refining it. So, I think it is unlikely that Precis will become a nuisance content scraper. + +Furthermore, we pass a unique user agent of the form `Precis/{version}` so if as a content owner you feel that Precis is acting disruptively, feel free to block that user agent. 
Blocking it will not have a destructive impact on users; Precis should detect the rejection and display a link to your website instead of its content. + +Finally: +1. If you'd like to opt out of content retrieval by Precis, [this file](https://github.com/leozqin/precis/blob/main/app/constants.py) contains a set of URL globs that Precis treats as banned. Feel free to send a PR with your site, but I reserve final say as to whether your request will be accepted. Expect a more understanding and lenient decision-making process for small, independent media/publishers. +2. If you're of the opinion that Precis should respect `robots.txt`, please give a thumbs up to [this issue](https://github.com/leozqin/precis/issues/79). + +# UI Tour After initial onboarding, you'll be brought to the feeds page. ![The feeds page](app/assets/feeds.png) diff --git a/app/app.py b/app/app.py index 82ae053..a65bc4a 100644 --- a/app/app.py +++ b/app/app.py @@ -12,11 +12,11 @@ from fastapi_utils.tasks import repeat_every from app.backend import PrecisBackend -from app.context import GlobalSettings, Themes +from app.impls import load_storage_config from app.logging import HealthCheckFilter from app.models import Feed, HealthCheck from app.rss import PrecisRSS -from app.storage.engine import load_storage_config +from app.settings import GlobalSettings, Themes JSON = "application/json" @@ -332,6 +332,7 @@ async def update_feed( notify: Annotated[bool, Form()] = False, preview_only: Annotated[bool, Form()] = False, refresh_enabled: Annotated[bool, Form()] = False, + use_script: Annotated[bool, Form()] = False, ): try: feed = Feed( @@ -342,6 +343,7 @@ async def update_feed( notify_destination=notify_destination, preview_only=preview_only, refresh_enabled=refresh_enabled, + use_script=use_script, ) await bk.update_feed(feed=feed) diff --git a/app/backend.py b/app/backend.py index e65acf6..c7b1942 100644 --- a/app/backend.py +++ b/app/backend.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from importlib.metadata 
import version from json import dumps, loads from logging import INFO, getLogger @@ -9,15 +11,15 @@ from textstat import textstat as txt from app.constants import GITHUB_LINK, IS_DOCKER -from app.context import GlobalSettings, StorageHandler from app.errors import InvalidFeedException from app.models import EntryContent, Feed, FeedEntry, HealthCheck +from app.settings import GlobalSettings logger = getLogger("uvicorn.error") class PrecisBackend: - def __init__(self, db: Type[StorageHandler]): + def __init__(self, db): self.db = db @staticmethod @@ -122,16 +124,18 @@ async def get_entry_content(self, feed_entry_id, redrive: bool = False): content: EntryContent = await self.db.get_entry_content( entry=entry, redrive=redrive ) - word_count = txt.lexicon_count(content.content) + logger.debug(f"Received EntryContent: {content}") + txt_content = content.content if content.content else "" + word_count = txt.lexicon_count(txt_content) return { **base, + "unretrievable": content.unretrievable, + "banned": content.banned, "preview": None, "content": content.content, "summary": content.summary, "word_count": word_count, - "reading_level": int( - txt.text_standard(content.content, float_output=True) - ), + "reading_level": int(txt.text_standard(txt_content, float_output=True)), "reading_time": int(word_count / settings.reading_speed), } @@ -215,18 +219,18 @@ async def delete_feed(self, feed_id: str): @staticmethod async def list_content_handler_choices(): - from app.content import content_retrieval_handlers + from app.impls import content_retrieval_handlers return list(content_retrieval_handlers.keys()) @staticmethod async def list_llm_handler_choices(): - from app.llm import llm_handlers + from app.impls import llm_handlers return list(llm_handlers.keys()) @staticmethod async def list_notification_handler_choices(): - from app.notification import notification_handlers + from app.impls import notification_handlers return list(notification_handlers.keys()) diff --git 
a/app/constants.py b/app/constants.py index 22a49a2..afbd4cb 100644 --- a/app/constants.py +++ b/app/constants.py @@ -1,3 +1,4 @@ +from importlib.metadata import version from os import environ from pathlib import Path @@ -6,3 +7,13 @@ IS_DOCKER = bool(environ.get("IS_DOCKER", False)) # overrride this if you feel it's important to point to your fork GITHUB_LINK = environ.get("GITHUB_LINK", "https://github.com/leozqin/precis") + +USER_AGENT = f"Precis/{version('precis')}" +BANNED_GLOBS = [ + "*://x.com/*", + "*.x.com/*", + "*twitter.com/*", + "*reddit.com/*", + "*youtube.com/*", + "*notion.site/*", +] diff --git a/app/content/__init__.py b/app/content/__init__.py index a9c0ced..e69de29 100644 --- a/app/content/__init__.py +++ b/app/content/__init__.py @@ -1,9 +0,0 @@ -from enum import Enum - -from app.content.playwright import PlaywrightContentRetriever -from app.content.requests import RequestsContentRetriever - -content_retrieval_handlers = { - "requests": RequestsContentRetriever, - "playwright": PlaywrightContentRetriever, -} diff --git a/app/content/playwright.py b/app/content/playwright.py index 13c5fbc..8afe4fa 100644 --- a/app/content/playwright.py +++ b/app/content/playwright.py @@ -1,31 +1,52 @@ +from __future__ import annotations + +from logging import getLogger + from playwright.async_api import Playwright, Route, async_playwright +from app.constants import USER_AGENT from app.handlers import ContentRetrievalHandler +logger = getLogger("uvicorn.error") + class PlaywrightContentRetriever(ContentRetrievalHandler): id = "playwright" + @staticmethod + async def _block_common_with_script(route: Route): + excluded_resource_types = ["stylesheet", "image", "font"] + if route.request.resource_type in excluded_resource_types: + await route.abort() + else: + await route.continue_() + @staticmethod async def _block_common(route: Route): - excluded_resource_types = ["stylesheet", "script", "image", "font"] + excluded_resource_types = ["stylesheet", "image", "font", "script"] if 
route.request.resource_type in excluded_resource_types: await route.abort() else: await route.continue_() @staticmethod - async def _retrieve(url: str, playright: Playwright): + async def _retrieve(url: str, playright: Playwright, use_script: bool = False): browser = await playright.chromium.launch() - page = await browser.new_page() + page = await browser.new_page(user_agent=USER_AGENT) + + retriever = ( + PlaywrightContentRetriever._block_common_with_script + if use_script + else PlaywrightContentRetriever._block_common + ) - await page.route("**/*", PlaywrightContentRetriever._block_common) + await page.route("**/*", retriever) await page.goto(url) await page.wait_for_load_state("domcontentloaded") return await page.content() - async def get_content(self, url: str) -> str: + async def get_html(self, url: str, use_script: bool = False) -> str: async with async_playwright() as pw: - return await self._retrieve(url=url, playright=pw) + return await self._retrieve(url=url, playright=pw, use_script=use_script) diff --git a/app/content/requests.py b/app/content/requests.py index 3ab64cc..2c991b3 100644 --- a/app/content/requests.py +++ b/app/content/requests.py @@ -1,13 +1,21 @@ import requests +from app.constants import USER_AGENT from app.handlers import ContentRetrievalHandler +from app.models import EntryContent, FeedEntry class RequestsContentRetriever(ContentRetrievalHandler): id = "requests" + headers = {"User-Agent": USER_AGENT} - async def get_content(self, url: str) -> str: - - page = requests.get(url) - - return page.text + # requests does not implement the use_script option so we'll just ignore it + async def get_html(self, url: str, use_script: bool = False) -> str: + try: + page = requests.get(url, headers=self.headers) + if page.text == "": + return + else: + return page.text + except requests.RequestException: + return diff --git a/app/context.py b/app/db.py similarity index 58% rename from app/context.py rename to app/db.py index 9ac4b50..f8e91c0 100644 --- a/app/context.py 
+++ b/app/db.py @@ -1,109 +1,18 @@ from __future__ import annotations from abc import ABC, abstractmethod -from enum import Enum from logging import getLogger -from typing import Any, List, Mapping, Optional, Type - -from markdown2 import markdown -from pydantic import BaseModel, Field, validator -from readabilipy import simple_json_from_html_string - -from app.content import content_retrieval_handlers -from app.handlers import ( - ContentRetrievalHandler, - HandlerBase, - LLMHandler, - NotificationHandler, -) -from app.llm import llm_handlers +from typing import List, Mapping, Optional, Type + +from app.handlers import HandlerBase from app.models import * -from app.notification import notification_handlers - - -class Themes(str, Enum): - black = "black" - coffee = "coffee" - dark = "dark" - fantasy = "fantasy" - forest = "forest" - lemonade = "lemonade" - lofi = "lofi" - luxury = "luxury" - night = "night" - nord = "nord" - pastel = "pastel" - synthwave = "synthwave" - winter = "winter" - - -class GlobalSettings(BaseModel): - - send_notification: bool = True - theme: Themes = Themes.forest - refresh_interval: int = 5 - reading_speed: int = 238 - - notification_handler_key: str = "null_notification" - llm_handler_key: str = "null_llm" - content_retrieval_handler_key: str = "playwright" - recent_hours: int = 36 - - finished_onboarding: bool = False - - db: Any = Field(exclude=True) - - @validator("db") - def validate_db(cls, val): - if issubclass(type(val), StorageHandler): - return val - - raise TypeError("Wrong type for db, must be subclass of StorageHandler") - - @property - def notification_handler(self) -> NotificationHandler: - try: - return self.db.get_handler(id=self.notification_handler_key) - except IndexError: - return self.db.handler_map[self.notification_handler_key]() - - @property - def llm_handler(self) -> LLMHandler: - try: - return self.db.get_handler(id=self.llm_handler_key) - except IndexError: - return self.db.handler_map[self.llm_handler_key]() 
- - @property - def content_retrieval_handler(self) -> ContentRetrievalHandler: - try: - return self.db.get_handler(id=self.content_retrieval_handler_key) - except IndexError: - return self.db.handler_map[self.content_retrieval_handler_key]() +from app.settings import GlobalSettings class StorageHandler(ABC): logger = getLogger("uvicorn.error") - handler_map = { - **llm_handlers, - **notification_handlers, - **content_retrieval_handlers, - } - - engine_map = { - "llm": llm_handlers, - "notification": notification_handlers, - "content": content_retrieval_handlers, - } - - handler_type_map = { - **{k: "llm" for k in llm_handlers.keys()}, - **{k: "notification" for k in notification_handlers.keys()}, - **{k: "content" for k in content_retrieval_handlers.keys()}, - } - def reconfigure_handler(self, id: str, config: Mapping) -> Type[HandlerBase]: return self.handler_map[id](**config) @@ -211,24 +120,25 @@ def feed_entry_exists(self, id: str) -> bool: pass @abstractmethod - async def get_entry_content( - self, entry: FeedEntry, redrive: bool = False - ) -> EntryContent: + async def upsert_entry_content(self, content: EntryContent): """ - Given a feed entry, return the EntryContent object for that entry - if one exists. If the redrive argument is true or if none exists, - create a new one using the URL of the feed entry and add it to the - database using upsert_entry_content. Use the get_main_content - static method for the class to clean the content as needed. Use the - summarize static method for the class to build the summary. + Given an EntryContent object, insert it into the database. """ pass @abstractmethod - async def upsert_entry_content(self, content: EntryContent): + def entry_content_exists(self, entry: FeedEntry) -> bool: """ - Given an EntryContent object, insert it into the database. 
+ Return true if entry content exists for the FeedEntry else False """ + pass + + @abstractmethod + def retrieve_entry_content(self, entry: FeedEntry) -> EntryContent: + """ + Retrieve the content for the feed entry from storage + """ + pass @abstractmethod def upsert_handler( @@ -289,22 +199,33 @@ def delete_feed_entry(self, feed_entry: FeedEntry) -> None: """ pass - @staticmethod - async def get_entry_html(url: str, settings: GlobalSettings) -> str: - return await settings.content_retrieval_handler.get_content(url) + async def get_content(self, entry: FeedEntry) -> EntryContent: + + feed = self.get_feed(entry.feed_id) + self.logger.debug(f"Found feed {feed} for entry {entry}") + settings = self.get_settings() + summarizer = settings.llm_handler.summarize - @staticmethod - def get_main_content(content: str) -> str: - md = simple_json_from_html_string(html=content, use_readability=True) + content = await settings.content_retrieval_handler.get_content( + feed=feed, entry=entry, summarizer=summarizer + ) + self.logger.debug(f"Received content {content}") + + return content + + async def get_entry_content( + self, entry: FeedEntry, redrive: bool = False + ) -> EntryContent: - return md["plain_content"] + if self.entry_content_exists(entry) and not redrive: + return self.retrieve_entry_content(entry=entry) - @staticmethod - def summarize( - feed: Feed, entry: FeedEntry, mk: str, settings: GlobalSettings - ) -> str: + else: + if redrive: + self.logger.info(f"starting redrive for feed entry {entry.id}") - summary = settings.llm_handler.summarize(feed=feed, entry=entry, mk=mk) + self.logger.debug(f"Getting content for entry {type(entry)}: {entry}") + entry_content = await self.get_content(entry=entry) + await self.upsert_entry_content(entry_content) - if summary: - return markdown(summary) + return entry_content diff --git a/app/handlers.py b/app/handlers.py index 477aadc..7c3556b 100644 --- a/app/handlers.py +++ b/app/handlers.py @@ -1,10 +1,19 @@ +from __future__ import 
annotations + from abc import ABC, abstractmethod +from fnmatch import fnmatch +from logging import getLogger from os import environ -from typing import ClassVar +from typing import Callable, ClassVar +from markdown2 import markdown from pydantic import BaseModel +from readabilipy import simple_json_from_html_string + +from app.constants import BANNED_GLOBS +from app.models import EntryContent, Feed, FeedEntry -from app.models import Feed, FeedEntry +logger = getLogger("uvicorn.error") class HandlerBase(BaseModel, ABC): @@ -15,9 +24,50 @@ class ContentRetrievalHandler(HandlerBase): id: ClassVar[str] = "generic_content_retrieval_handler" @abstractmethod - async def get_content(self, url: str) -> str: + async def get_html(self, url, use_script: bool) -> str: pass + async def get_content( + self, + entry: FeedEntry, + feed: Feed, + summarizer: Callable[[Feed, FeedEntry, str], str], + ) -> EntryContent: + if await self.is_banned(entry.url): + logger.info(f"Found banned entry from url {entry.url}") + return EntryContent(url=entry.url, banned=True) + + try: + html = await self.get_html(url=entry.url, use_script=feed.use_script) + content = self.get_main_content(content=html) + if not html or not content: + return EntryContent(url=entry.url, unretrievable=True) + else: + summary = summarizer(feed=feed, entry=entry, mk=content) + + return EntryContent( + url=entry.url, + content=content, + summary=markdown(summary) if summary else None, + unretrievable=False, + ) + + except Exception as e: + logger.warning( + f"Encountered retrieval exception, returning unretrievable: {e}" + ) + return EntryContent(url=entry.url, unretrievable=True) + + @staticmethod + async def is_banned(url) -> bool: + return any(fnmatch(url, i) for i in BANNED_GLOBS) + + @staticmethod + def get_main_content(content: str) -> str: + md = simple_json_from_html_string(html=content, use_readability=True) + + return md["plain_content"] + class NotificationHandler(HandlerBase): id: 
ClassVar[str] = "generic_notification_handler" diff --git a/app/impls.py b/app/impls.py new file mode 100644 index 0000000..c95936c --- /dev/null +++ b/app/impls.py @@ -0,0 +1,89 @@ +from logging import getLogger +from os import environ +from typing import Union + +from app.content.playwright import PlaywrightContentRetriever +from app.content.requests import RequestsContentRetriever +from app.llm.dummy import DummyLLMHandler +from app.llm.null import NullLLMHandler +from app.llm.ollama import OllamaLLMHandler +from app.llm.openai import OpenAILLMHandler +from app.notification.jira import JiraNotificationHandler +from app.notification.matrix import MatrixNotificationHandler +from app.notification.ntfy import NtfyNotificationHandler +from app.notification.null import NullNotificationHandler +from app.notification.slack import SlackNotificationHandler +from app.storage.hybrid import HybridLMDBOfflineStorageHandler +from app.storage.lmdb import LMDBStorageHandler +from app.storage.tinydb import TinyDBStorageHandler + +logger = getLogger("uvicorn.error") + +storage_handlers = { + "tinydb": TinyDBStorageHandler, + "lmdb": LMDBStorageHandler, + "hybrid": HybridLMDBOfflineStorageHandler, +} + +notification_handlers = { + "matrix": MatrixNotificationHandler, + "null_notification": NullNotificationHandler, + "slack": SlackNotificationHandler, + "jira": JiraNotificationHandler, + "ntfy": NtfyNotificationHandler, +} + +content_retrieval_handlers = { + "requests": RequestsContentRetriever, + "playwright": PlaywrightContentRetriever, +} + +llm_handlers = { + NullLLMHandler.id: NullLLMHandler, + OllamaLLMHandler.id: OllamaLLMHandler, + OpenAILLMHandler.id: OpenAILLMHandler, + DummyLLMHandler.id: DummyLLMHandler, + # redirect null summarization handler to null llm + # TODO: Deprecate + "null_summarization": NullLLMHandler, +} + + +class ImplMixin: + handler_map = { + **llm_handlers, + **notification_handlers, + **content_retrieval_handlers, + } + + engine_map = { + "llm": 
llm_handlers, + "notification": notification_handlers, + "content": content_retrieval_handlers, + } + + handler_type_map = { + **{k: "llm" for k in llm_handlers.keys()}, + **{k: "notification" for k in notification_handlers.keys()}, + **{k: "content" for k in content_retrieval_handlers.keys()}, + } + + +def load_storage_config() -> ( + Union[TinyDBStorageHandler, LMDBStorageHandler, HybridLMDBOfflineStorageHandler] +): + + config_type = environ.get("PRECIS_STORAGE_HANDLER", "tinydb") + handler_type = storage_handlers.get(config_type) + handler = handler_type() + + # for the purpose of managing settings, the db handler needs to know about + # implementations of other handlers. Here, we modify the signature of the chosen + # handler to include the other handler impls. Doing it this way avoids creating a + # circular dependency + handler_cls_name = handler.__class__.__name__ + handler.__class__ = type(handler_cls_name, (handler_type, ImplMixin), {}) + + logger.info(f"loading storage handler of type {config_type}") + + return handler diff --git a/app/llm/__init__.py b/app/llm/__init__.py index 01c91a7..e69de29 100644 --- a/app/llm/__init__.py +++ b/app/llm/__init__.py @@ -1,12 +0,0 @@ -from app.llm.null import NullLLMHandler -from app.llm.ollama import OllamaLLMHandler -from app.llm.openai import OpenAILLMHandler - -llm_handlers = { - NullLLMHandler.id: NullLLMHandler, - OllamaLLMHandler.id: OllamaLLMHandler, - OpenAILLMHandler.id: OpenAILLMHandler, - # redirect null summarization handler to null llm - # TODO: Deprecate - "null_summarization": NullLLMHandler, -} diff --git a/app/llm/dummy.py b/app/llm/dummy.py new file mode 100644 index 0000000..b9efb89 --- /dev/null +++ b/app/llm/dummy.py @@ -0,0 +1,14 @@ +from typing import ClassVar + +from pydantic import BaseModel + +from app.handlers import Feed, FeedEntry, LLMHandler + + +# An LLM Handler that doesn't return anything meaningful, but does return +# something, unlike null. 
Useful for testing or pranking your friends. +class DummyLLMHandler(LLMHandler, BaseModel): + id: ClassVar[str] = "dummy_llm" + + def summarize(self, feed: Feed, entry: FeedEntry, mk: str): + return "cool story bro" diff --git a/app/models.py b/app/models.py index 85ada65..a591057 100644 --- a/app/models.py +++ b/app/models.py @@ -14,6 +14,7 @@ class Feed(BaseModel): notify: bool = True preview_only: bool = False refresh_enabled: bool = True + use_script: bool = False @property def rss(self) -> Type[FeedParserDict]: @@ -45,6 +46,8 @@ class EntryContent(BaseModel): url: str content: str = None summary: str = None + unretrievable: bool = False + banned: bool = False @property def id(self) -> str: diff --git a/app/notification/__init__.py b/app/notification/__init__.py index 2ac46b4..e69de29 100644 --- a/app/notification/__init__.py +++ b/app/notification/__init__.py @@ -1,15 +0,0 @@ -from enum import Enum - -from app.notification.jira import JiraNotificationHandler -from app.notification.matrix import MatrixNotificationHandler -from app.notification.ntfy import NtfyNotificationHandler -from app.notification.null import NullNotificationHandler -from app.notification.slack import SlackNotificationHandler - -notification_handlers = { - "matrix": MatrixNotificationHandler, - "null_notification": NullNotificationHandler, - "slack": SlackNotificationHandler, - "jira": JiraNotificationHandler, - "ntfy": NtfyNotificationHandler, -} diff --git a/app/rss.py b/app/rss.py index 6ab0c3d..8c71b39 100644 --- a/app/rss.py +++ b/app/rss.py @@ -10,14 +10,14 @@ from ruamel.yaml import YAML from app.constants import CONFIG_DIR, DATA_DIR -from app.context import GlobalSettings, StorageHandler from app.models import EntryContent, Feed, FeedEntry +from app.settings import GlobalSettings logger = getLogger("uvicorn.error") class PrecisRSS: - def __init__(self, db: Type[StorageHandler]) -> None: + def __init__(self, db) -> None: self.db = db def load_feeds(self) -> None: diff --git 
a/app/settings.py b/app/settings.py new file mode 100644 index 0000000..f2b43de --- /dev/null +++ b/app/settings.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from enum import Enum +from typing import Any + +from pydantic import BaseModel, Field, validator + +from app.handlers import ContentRetrievalHandler, LLMHandler, NotificationHandler + + +class Themes(str, Enum): + black = "black" + coffee = "coffee" + dark = "dark" + fantasy = "fantasy" + forest = "forest" + lemonade = "lemonade" + lofi = "lofi" + luxury = "luxury" + night = "night" + nord = "nord" + pastel = "pastel" + synthwave = "synthwave" + winter = "winter" + + +class GlobalSettings(BaseModel): + + send_notification: bool = True + theme: Themes = Themes.forest + refresh_interval: int = 5 + reading_speed: int = 238 + + notification_handler_key: str = "null_notification" + llm_handler_key: str = "null_llm" + content_retrieval_handler_key: str = "playwright" + recent_hours: int = 36 + + finished_onboarding: bool = False + + db: Any = Field(exclude=True) + + @validator("db") + def validate_db(cls, val): + from app.db import StorageHandler + + if issubclass(type(val), StorageHandler): + return val + + raise TypeError("Wrong type for db, must be subclass of StorageHandler") + + @property + def notification_handler(self) -> NotificationHandler: + try: + return self.db.get_handler(id=self.notification_handler_key) + except IndexError: + return self.db.handler_map[self.notification_handler_key]() + + @property + def llm_handler(self) -> LLMHandler: + try: + return self.db.get_handler(id=self.llm_handler_key) + except IndexError: + return self.db.handler_map[self.llm_handler_key]() + + @property + def content_retrieval_handler(self) -> ContentRetrievalHandler: + try: + return self.db.get_handler(id=self.content_retrieval_handler_key) + except IndexError: + return self.db.handler_map[self.content_retrieval_handler_key]() diff --git a/app/storage/__init__.py b/app/storage/__init__.py index 
16c9470..09f093c 100644 --- a/app/storage/__init__.py +++ b/app/storage/__init__.py @@ -1,16 +1,4 @@ -from pathlib import Path - from app.constants import DATA_DIR # ensure data dir exists DATA_DIR.mkdir(parents=True, exist_ok=True) - -from app.storage.hybrid import HybridLMDBOfflineStorageHandler -from app.storage.lmdb import LMDBStorageHandler -from app.storage.tinydb import TinyDBStorageHandler - -storage_handlers = { - "tinydb": TinyDBStorageHandler, - "lmdb": LMDBStorageHandler, - "hybrid": HybridLMDBOfflineStorageHandler, -} diff --git a/app/storage/engine.py b/app/storage/engine.py index b8a20ff..f588c1b 100644 --- a/app/storage/engine.py +++ b/app/storage/engine.py @@ -2,8 +2,8 @@ from os import environ from typing import Type -from app.context import StorageHandler -from app.storage import storage_handlers +from app.db import StorageHandler +from app.impls import storage_handlers logger = getLogger("uvicorn.error") diff --git a/app/storage/hybrid.py b/app/storage/hybrid.py index 40ffc60..0a7de7e 100644 --- a/app/storage/hybrid.py +++ b/app/storage/hybrid.py @@ -1,3 +1,4 @@ +from logging import getLogger from os import remove from pathlib import Path from pickle import dump, load @@ -7,6 +8,8 @@ from app.models import EntryContent, FeedEntry from app.storage.lmdb import LMDBStorageHandler +logger = getLogger("uvicorn.error") + class HybridLMDBOfflineStorageHandler(LMDBStorageHandler): def __init__(self) -> None: @@ -16,6 +19,7 @@ def __init__(self) -> None: self.offline_media_path.mkdir(parents=True, exist_ok=True) def _content_path(self, content: Union[EntryContent, FeedEntry]): + logger.debug(f"Making content path for {content}") return self.offline_media_path.joinpath(content.id).with_suffix(".pickle") async def upsert_entry_content(self, content: EntryContent): diff --git a/app/storage/lmdb.py b/app/storage/lmdb.py index 294199f..78fffeb 100644 --- a/app/storage/lmdb.py +++ b/app/storage/lmdb.py @@ -1,16 +1,19 @@ +from __future__ import annotations + 
from enum import Enum from json import JSONDecodeError, dumps, loads from logging import getLogger from pathlib import Path -from typing import Any, List, Mapping, Type +from typing import Any, List, Mapping -from lmdb import Environment, Transaction +from lmdb import Environment from pydantic import BaseModel from app.constants import DATA_DIR -from app.context import GlobalSettings, StorageHandler +from app.db import StorageHandler from app.handlers import HandlerBase -from app.models import EntryContent, Feed, FeedEntry, Type +from app.models import EntryContent, Feed, FeedEntry +from app.settings import GlobalSettings logger = getLogger("uvicorn.error") @@ -210,38 +213,6 @@ def retrieve_entry_content(self, entry: FeedEntry): content = txn.get(self._serialize(entry.id)) return EntryContent(**self._deserialize(content)) - async def get_entry_content( - self, entry: FeedEntry, redrive: bool = False - ) -> EntryContent: - - if self.entry_content_exists(entry) and not redrive: - return self.retrieve_entry_content(entry=entry) - - else: - if redrive: - self.logger.info(f"starting redrive for feed entry {entry.id}") - - settings = self.get_settings() - - raw_content = await self.get_entry_html(entry.url, settings=settings) - content = self.get_main_content(content=raw_content) - - feed = self.get_feed(entry.feed_id) - - summary = self.summarize( - feed=feed, entry=entry, mk=content, settings=settings - ) - - entry_content = EntryContent( - url=entry.url, - content=content, - summary=summary if summary else None, - ) - - await self.upsert_entry_content(entry_content) - - return entry_content - async def upsert_entry_content(self, content: EntryContent): with self.db.begin(db=self._db(Named.entry_content), write=True) as txn: diff --git a/app/storage/tinydb.py b/app/storage/tinydb.py index 3ee7f92..2a9c75a 100644 --- a/app/storage/tinydb.py +++ b/app/storage/tinydb.py @@ -5,9 +5,10 @@ from tinydb import Query, TinyDB from app.constants import DATA_DIR -from app.context 
import GlobalSettings, StorageHandler +from app.db import StorageHandler from app.handlers import ContentRetrievalHandler, LLMHandler, NotificationHandler from app.models import EntryContent, Feed, FeedEntry +from app.settings import GlobalSettings logger = getLogger("uvicorn.error") @@ -126,40 +127,19 @@ def feed_entry_exists(self, id: str): else: return False - async def get_entry_content( - self, entry: FeedEntry, redrive: bool = False - ) -> EntryContent: + def retrieve_entry_content(self, entry: FeedEntry) -> EntryContent: table = self.db.table("entry_contents") query = Query().id.matches(entry.id) + existing = table.search(query)[0] - existing = table.search(query) - if existing and not redrive: - return EntryContent(**existing[0]["entry_contents"]) - - else: - if redrive: - self.logger.info(f"starting redrive for feed entry {entry.id}") - - settings = self.get_settings() + return EntryContent(**existing["entry_contents"]) - raw_content = await self.get_entry_html(entry.url, settings=settings) - content = self.get_main_content(content=raw_content) - - feed = self.get_feed(entry.feed_id) - - summary = self.summarize( - feed=feed, entry=entry, mk=content, settings=settings - ) - - entry_content = EntryContent( - url=entry.url, - content=content, - summary=summary if summary else None, - ) - - await self.upsert_entry_content(content=entry_content) + def entry_content_exists(self, entry: FeedEntry) -> bool: + table = self.db.table("entry_contents") + query = Query().id.matches(entry.id) + existing = table.search(query) - return entry_content + return bool(existing) async def upsert_entry_content(self, content: EntryContent): table = self.db.table("entry_contents") diff --git a/app/templates/feed_config.html b/app/templates/feed_config.html index d6771ee..5872da8 100644 --- a/app/templates/feed_config.html +++ b/app/templates/feed_config.html @@ -92,6 +92,11 @@