Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Atom Support #47

Merged
merged 19 commits into from
Feb 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ select = [
"**/__init__.py" = [
"F401"
]
"rss_parser/models/atom/**" = [
"A003"
]


[build-system]
Expand Down
4 changes: 3 additions & 1 deletion rss_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from ._parser import Parser
from ._parser import AtomParser, BaseParser, Parser, RSSParser

__all__ = ("BaseParser", "Parser", "AtomParser", "RSSParser")
62 changes: 46 additions & 16 deletions rss_parser/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,72 @@

from xmltodict import parse

from rss_parser.custom_decorators import abstract_class_attributes
from rss_parser.models import XMLBaseModel
from rss_parser.models.atom import Atom
from rss_parser.models.rss import RSS

# >>> FUTURE
# TODO: Maybe support a generator-based approach for big RSS feeds
# TODO: Add cli to parse to json
# TODO: Possibly bundle as deb/rpm/exe
# TODO: Atom support
# TODO: Older RSS versions?
# TODO: Older Atom versions
# TODO: Older RSS versions


class Parser:
"""Parser for rss files."""
@abstract_class_attributes("schema")
class BaseParser:
"""Parser for rss/atom files."""

schema: ClassVar[Type[XMLBaseModel]] = RSS
ddkasa marked this conversation as resolved.
Show resolved Hide resolved

@staticmethod
def _check_atom(root: dict):
if "feed" in root:
raise NotImplementedError("ATOM feed is not currently supported")
schema: ClassVar[Type[XMLBaseModel]]
root_key: Optional[str] = None
dhvcc marked this conversation as resolved.
Show resolved Hide resolved

@staticmethod
def to_xml(data: str, *args, **kwargs):
return parse(str(data), *args, **kwargs)

@classmethod
def parse(cls, data: str, *, schema: Optional[Type[XMLBaseModel]] = None) -> XMLBaseModel:
def parse(
cls,
data: str,
*,
schema: Optional[Type[XMLBaseModel]] = None,
root_key: Optional[str] = None,
) -> XMLBaseModel:
"""
Parse XML data into schema (default: RSS 2.0).

Parse XML data into schema.
:param data: string of XML data that needs to be parsed
:return: "schema" object
"""
root = cls.to_xml(data)
cls._check_atom(root)

schema = schema or cls.schema
schema = schema if schema else cls.schema

root_key = root_key if root_key else cls.root_key

if root_key:
root = root.get(root_key, root)

return schema.parse_obj(root)


# Parser preconfigured with the Atom schema. ``root_key`` is not overridden,
# so it keeps the BaseParser default of None and the parsed document is
# handed to the schema without descending into a root element first.
class AtomParser(BaseParser):
    schema = Atom


# Parser preconfigured for RSS documents: the content found under the "rss"
# root key is validated against the RSS schema.
class RSSParser(BaseParser):
    schema = RSS
    root_key = "rss"


class Parser(RSSParser):
@classmethod
def parse(cls, data: str, *, schema: Optional[Type[XMLBaseModel]] = None) -> XMLBaseModel:
import warnings

return schema.parse_obj(root["rss"])
warnings.warn(
"Class Parser was renamed to RSSParser " "and will be removed in the next major update",
DeprecationWarning,
stacklevel=2,
)
return RSSParser.parse(data, schema=schema)
43 changes: 43 additions & 0 deletions rss_parser/custom_decorators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
def abstract_class_attributes(*names):
    """Class decorator that marks one or more class attributes as abstract.

    Each named attribute is set to ``NotImplemented`` on the decorated class,
    and the class's ``__init_subclass__`` is replaced so that defining a
    subclass which leaves any of those attributes as ``NotImplemented``
    raises ``NotImplementedError``.

    :param names: names of the class attributes every subclass must set
    :return: a decorator that patches the given class in place and returns it
    :raises NotImplementedError: at subclass definition time, when a subclass
        does not provide a value for one of ``names``
    """

    def _decorate(cls):
        """Patch ``cls`` so that its subclasses are checked for ``names``."""

        # Placeholder value: anything still NotImplemented counts as "not set".
        for name in names:
            setattr(cls, name, NotImplemented)

        # Look only at a hook defined directly on ``cls``. Reading
        # ``cls.__init_subclass__`` would return a method already bound to
        # ``cls``, which cannot be re-targeted at the subclass being created.
        own_hook = cls.__dict__.get("__init_subclass__")
        if own_hook is not None and not isinstance(own_hook, classmethod):
            # Hook assigned after class creation as a plain function or a
            # staticmethod: normalise it so it can be bound like the others.
            own_hook = classmethod(getattr(own_hook, "__func__", own_hook))

        def new_init_subclass(subcls, **kwargs):
            """Run the original hook for ``subcls``, then verify ``names``."""

            if own_hook is None:
                # No hook on ``cls`` itself: continue along the MRO exactly as
                # an implicit ``__init_subclass__`` would have done.
                super(cls, subcls).__init_subclass__(**kwargs)
            else:
                # Bind the original hook to the *new* subclass so it sees the
                # right ``cls`` and is called exactly once.
                own_hook.__get__(None, subcls)(**kwargs)

            # Check that each attribute is defined.
            for name in names:
                if getattr(subcls, name, NotImplemented) is NotImplemented:
                    raise NotImplementedError(f"Class attribute {name} must be set for class {subcls}")

        # Assigning after class creation does not get the implicit classmethod
        # wrapping that a ``def __init_subclass__`` in a class body receives,
        # so it has to be applied explicitly.
        cls.__init_subclass__ = classmethod(new_init_subclass)

        return cls

    return _decorate
3 changes: 3 additions & 0 deletions rss_parser/models/atom/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .atom import Atom

__all__ = ("Atom",)
15 changes: 15 additions & 0 deletions rss_parser/models/atom/atom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.atom.feed import Feed
from rss_parser.models.types.tag import Tag
from rss_parser.pydantic_proxy import import_v1_pydantic

pydantic = import_v1_pydantic()


# Root model for a parsed Atom document: an optional version tag plus the
# feed itself.
class Atom(XMLBaseModel):
    """Atom 1.0"""

    # Read from the "@version" key of the input (alias); Optional, so it may
    # be absent.
    # NOTE(review): presumably the "@" prefix is how the XML-to-dict step
    # marks attributes - confirm against the parser output.
    version: Optional[Tag[str]] = pydantic.Field(alias="@version")
    # Declared without a default and not Optional.
    feed: Tag[Feed]
56 changes: 56 additions & 0 deletions rss_parser/models/atom/entry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.atom.person import Person
from rss_parser.models.types.date import DateTimeOrStr
from rss_parser.models.types.only_list import OnlyList
from rss_parser.models.types.tag import Tag
from rss_parser.pydantic_proxy import import_v1_pydantic

pydantic = import_v1_pydantic()


# Entry fields that are declared without defaults and are not Optional
# (the "required" group of an Atom entry).
class RequiredAtomEntryMixin(XMLBaseModel):
    id: Tag[str]
    "Identifier for the entry."

    title: Tag[str]
    "The title of the entry."

    updated: Tag[DateTimeOrStr]
    "Indicates when the entry was updated."


# Entry fields from the "recommended" group; every field here has a default.
class RecommendedAtomEntryMixin(XMLBaseModel):
    # Input key is "author" (alias); exposed on the model as ``authors``.
    authors: Optional[OnlyList[Tag[Person]]] = pydantic.Field(
        default=[],
        alias="author",
    )
    "Entry authors."

    # Input key is "link" (alias); exposed on the model as ``links``.
    links: Optional[OnlyList[Tag[str]]] = pydantic.Field(
        default=[],
        alias="link",
    )
    "The URL of the entry."

    content: Optional[Tag[str]] = None
    "The main content of the entry."

    summary: Optional[Tag[str]] = None
    "Conveys a short summary, abstract, or excerpt of the entry. Some feeds use this tag as the main content."


# Entry fields from the "optional" group; every field here has a default.
class OptionalAtomEntryMixin(XMLBaseModel):
    # Input key is "category" (alias); exposed on the model as ``categories``.
    categories: Optional[OnlyList[Tag[dict]]] = pydantic.Field(
        default=[],
        alias="category",
    )
    "Specifies a categories that the entry belongs to."

    # Input key is "contributor" (alias); exposed as ``contributors``.
    contributors: Optional[OnlyList[Tag[Person]]] = pydantic.Field(
        default=[],
        alias="contributor",
    )
    "Entry contributors."

    rights: Optional[Tag[str]] = None
    "The copyright of the entry."

    published: Optional[Tag[DateTimeOrStr]] = None
    "Indicates when the entry was published."

    # NOTE(review): a dedicated ``Source`` model is defined in
    # rss_parser/models/atom/source.py - confirm whether this field should be
    # typed with it instead of ``str``.
    source: Optional[Tag[str]] = None
    "Contains metadata from the source feed if this entry is a copy."


# Complete Atom entry model: combines the required, recommended and optional
# field groups. No fields are declared on this class itself.
class Entry(RequiredAtomEntryMixin, RecommendedAtomEntryMixin, OptionalAtomEntryMixin, XMLBaseModel):
    """https://validator.w3.org/feed/docs/atom.html"""
61 changes: 61 additions & 0 deletions rss_parser/models/atom/feed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.atom.entry import Entry
from rss_parser.models.atom.person import Person
from rss_parser.models.types.date import DateTimeOrStr
from rss_parser.models.types.only_list import OnlyList
from rss_parser.models.types.tag import Tag
from rss_parser.pydantic_proxy import import_v1_pydantic

pydantic = import_v1_pydantic()


# Feed fields that are declared without defaults and are not Optional
# (the "required" group of an Atom feed).
class RequiredAtomFeedMixin(XMLBaseModel):
    id: Tag[str]
    "Identifies the feed using a universally unique and permanent URI."

    title: Tag[str]
    "Contains a human readable title for the feed."

    updated: Tag[DateTimeOrStr]
    "Indicates the last time the feed was modified in a significant way."


# Feed fields from the "recommended" group; both default to an empty list.
class RecommendedAtomFeedMixin(XMLBaseModel):
    # Input key is "author" (alias); exposed on the model as ``authors``.
    authors: Optional[OnlyList[Tag[Person]]] = pydantic.Field(
        default=[],
        alias="author",
    )
    "Names one author of the feed. A feed may have multiple author elements."

    # Input key is "link" (alias); exposed on the model as ``links``.
    links: Optional[OnlyList[Tag[str]]] = pydantic.Field(
        default=[],
        alias="link",
    )
    "The URL to the feed. A feed may have multiple link elements."


# Feed fields from the "optional" group; every field here has a default.
class OptionalAtomFeedMixin(XMLBaseModel):
    # Input key is "entry" (alias); exposed on the model as ``entries``.
    entries: Optional[OnlyList[Tag[Entry]]] = pydantic.Field(
        default=[],
        alias="entry",
    )
    "The entries in the feed. A feed may have multiple entry elements."

    # Input key is "category" (alias); exposed on the model as ``categories``.
    categories: Optional[OnlyList[Tag[dict]]] = pydantic.Field(
        default=[],
        alias="category",
    )
    "Specifies a categories that the feed belongs to. The feed may have multiple categories elements."

    # Input key is "contributor" (alias); exposed as ``contributors``.
    contributors: Optional[OnlyList[Tag[Person]]] = pydantic.Field(
        default=[],
        alias="contributor",
    )
    "Feed contributors."

    generator: Optional[Tag[str]] = None
    "Identifies the software used to generate the feed, for debugging and other purposes."

    icon: Optional[Tag[str]] = None
    "Identifies a small image which provides iconic visual identification for the feed. Icons should be square."

    logo: Optional[Tag[str]] = None
    (
        "Identifies a larger image which provides visual identification for the feed. "
        "Images should be twice as wide as they are tall."
    )

    rights: Optional[Tag[str]] = None
    "The copyright of the feed."

    subtitle: Optional[Tag[str]] = None
    "Contains a human readable description or subtitle for the feed."


# Complete Atom feed model: combines the required, recommended and optional
# field groups. No fields are declared on this class itself.
class Feed(RequiredAtomFeedMixin, RecommendedAtomFeedMixin, OptionalAtomFeedMixin, XMLBaseModel):
    """https://validator.w3.org/feed/docs/atom.html"""
18 changes: 18 additions & 0 deletions rss_parser/models/atom/person.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.types.tag import Tag
from rss_parser.pydantic_proxy import import_v1_pydantic

pydantic = import_v1_pydantic()


# Person model; the Atom entry and feed models use it for their author and
# contributor lists.
class Person(XMLBaseModel):
    # Declared without a default and not Optional.
    name: Tag[str]
    "Conveys a human-readable name for the person."

    # Defaults to None when not supplied.
    uri: Optional[Tag[str]] = None
    "Contains a home page for the person."

    # Defaults to None when not supplied.
    email: Optional[Tag[str]] = None
    "Contains an email address for the person."
19 changes: 19 additions & 0 deletions rss_parser/models/atom/source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.types.date import DateTimeOrStr
from rss_parser.models.types.tag import Tag
from rss_parser.pydantic_proxy import import_v1_pydantic

pydantic = import_v1_pydantic()


# Model for the metadata of a source feed; every field is Optional and
# defaults to None.
# NOTE(review): not referenced by the other Atom models visible in this
# change - confirm where it is meant to be used.
class Source(XMLBaseModel):
    id: Optional[Tag[str]] = None
    "Source id."

    title: Optional[Tag[str]] = None
    "Title of the source."

    updated: Optional[Tag[DateTimeOrStr]] = None
    "When source was updated."
3 changes: 3 additions & 0 deletions rss_parser/models/rss/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .rss import RSS

__all__ = ("RSS",)
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.image import Image
from rss_parser.models.item import Item
from rss_parser.models.text_input import TextInput
from rss_parser.models.rss.image import Image
from rss_parser.models.rss.item import Item
from rss_parser.models.rss.text_input import TextInput
from rss_parser.models.types.date import DateTimeOrStr
from rss_parser.models.types.only_list import OnlyList
from rss_parser.models.types.tag import Tag
Expand Down
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion rss_parser/models/rss.py → rss_parser/models/rss/rss.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.channel import Channel
from rss_parser.models.rss.channel import Channel
from rss_parser.models.types.tag import Tag
from rss_parser.pydantic_proxy import import_v1_pydantic

Expand Down
File renamed without changes.
15 changes: 7 additions & 8 deletions rss_parser/models/types/date.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from datetime import datetime
from email.utils import parsedate_to_datetime

from rss_parser.pydantic_proxy import import_v1_pydantic

pydantic_validators = import_v1_pydantic(".validators")


class DateTimeOrStr(datetime):
@classmethod
Expand All @@ -25,16 +29,11 @@ def validate_dt_or_str(value: str) -> datetime:
# Try to parse standard (RFC 822)
try:
return parsedate_to_datetime(value)
except ValueError:
pass
# Try ISO
try:
return datetime.fromisoformat(value)
except ValueError:
except (ValueError, TypeError): # https://github.com/python/cpython/issues/74866
pass
# Try timestamp
# Try ISO or timestamp
try:
return datetime.fromtimestamp(int(value))
return pydantic_validators.parse_datetime(value)
except ValueError:
pass

Expand Down
2 changes: 1 addition & 1 deletion rss_parser/models/types/tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class Tag(pydantic_generics.GenericModel, Generic[T]):
>>> m.width.content
48
>>> type(m.width), type(m.width.content)
(<class 'rss_parser.models.image.Tag[int]'>, <class 'int'>)
(<class 'rss_parser.models.rss.image.Tag[int]'>, <class 'int'>)
>>> # The attributes are empty by default
>>> m.width.attributes
{}
Expand Down
Loading
Loading