Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Atom Support #47

Merged
merged 19 commits into from
Feb 16, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 15 additions & 14 deletions rss_parser/_parser.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,44 @@
from typing import ClassVar, Optional, Type
from typing import Optional, Type

from xmltodict import parse

from rss_parser.models import XMLBaseModel
from rss_parser.models.atom import Atom
from rss_parser.models.rss import RSS

# >>> FUTURE
# TODO: Maybe support a generator-based approach for big RSS feeds
# TODO: Add cli to parse to json
# TODO: Possibly bundle as deb/rpm/exe
# TODO: Atom support
# TODO: Older RSS versions?
# TODO: Older Atom versions
# TODO: Older RSS versions


class Parser:
    """Parser for rss/atom files.

    The document type is detected from the parsed XML unless the caller
    supplies an explicit ``schema`` to :meth:`parse`.
    """

    @staticmethod
    def check_schema(root: dict) -> tuple[dict, type[XMLBaseModel]]:
        """
        Pick the payload and model class matching a parsed document.

        :param root: mapping produced by :meth:`to_xml`
        :return: ``(payload, schema)`` pair; a document with a top-level ``feed`` key is
            returned whole together with ``Atom`` (that model declares the ``feed`` field
            itself), anything else is unwrapped from its ``rss`` key and paired with ``RSS``
        :raises KeyError: if the document has neither a ``feed`` nor an ``rss`` key
        """
        if "feed" in root:
            return root, Atom
        return root["rss"], RSS

    @staticmethod
    def to_xml(data: str, *args, **kwargs):
        """
        Convert XML text into a mapping using ``xmltodict.parse``.

        :param data: XML document; coerced with ``str()`` before parsing
        :param args: extra positional arguments forwarded to ``xmltodict.parse``
        :param kwargs: extra keyword arguments forwarded to ``xmltodict.parse``
        :return: whatever ``xmltodict.parse`` returns for the document
        """
        return parse(str(data), *args, **kwargs)

    @classmethod
    def parse(cls, data: str, *, schema: Optional[Type[XMLBaseModel]] = None, root_key: str = "") -> XMLBaseModel:
        """
        Parse XML data into schema (default: RSS 2.0 or Atom).

        :param data: string of XML data that needs to be parsed
        :param schema: model class to validate against; detected from the document when omitted
        :param root_key: top-level key to unwrap before validating with a custom ``schema``;
            the whole document is used when the key is absent. Ignored when ``schema`` is omitted.
        :return: "schema" object
        """
        root = cls.to_xml(data)

        # ``schema`` is a model *class*, so an ``isinstance(schema, XMLBaseModel)`` test can
        # never succeed for it; checking for ``None`` is what actually distinguishes
        # "caller supplied a schema" from "auto-detect".
        if schema is None:
            root, schema = cls.check_schema(root)
        else:
            root = root.get(root_key, root)

        return schema.parse_obj(root)
3 changes: 3 additions & 0 deletions rss_parser/models/atom/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .atom import Atom

__all__ = ("Atom",)
15 changes: 15 additions & 0 deletions rss_parser/models/atom/atom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.atom.feed import Feed
from rss_parser.models.types.tag import Tag
from rss_parser.pydantic_proxy import import_v1_pydantic

pydantic = import_v1_pydantic()


class Atom(XMLBaseModel):
    """Atom 1.0

    Root model for a parsed Atom document: wraps the top-level ``feed`` element.
    """

    # Optional ``version`` attribute on the root; None when the document has none.
    version: Optional[Tag[str]] = pydantic.Field(default=None, alias="@version")
    # The ``feed`` element together with everything nested under it.
    feed: Tag[Feed]
54 changes: 54 additions & 0 deletions rss_parser/models/atom/entry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.types.date import DateTimeOrStr
from rss_parser.models.types.tag import Tag
from rss_parser.pydantic_proxy import import_v1_pydantic

pydantic = import_v1_pydantic()


class RequiredAtomEntryMixin(XMLBaseModel):
    """Non-optional entry fields: ``id`` (exposed as ``entry_id``), ``title`` and ``updated``."""

    entry_id: Tag[str] = pydantic.Field(..., alias="id")
    "Identifier for the entry."

    title: Tag[str]
    "The title of the entry."

    # NOTE(review): kept as a plain string, while ``published`` on the entry and
    # ``updated`` on the feed use DateTimeOrStr - confirm this is intentional.
    updated: Tag[str]
    "Indicates when the entry was updated."


class RecommendedAtomEntryMixin(XMLBaseModel):
    """Entry fields that may be absent: ``author``, ``link``, ``content`` and ``summary``.

    Every field defaults to ``None``.
    """

    # Nested element kept as a raw mapping rather than a dedicated model.
    author: Optional[Tag[dict]] = None
    "Email, name, and URI of the author of the entry."

    # Typed as a list of link elements.
    # NOTE(review): confirm how an entry with a single <link> is handled.
    link: Optional[Tag[list]] = None
    "The URL of the entry."

    # Nested element kept as a raw mapping rather than a dedicated model.
    content: Optional[Tag[dict]] = None
    "The main content of the entry."

    summary: Optional[Tag[str]] = None
    "Conveys a short summary, abstract, or excerpt of the entry. Some feeds use this tag as the main content."


class OptionalAtomEntryMixin(XMLBaseModel):
    """Rarely used entry fields; every one defaults to ``None``."""

    category: Optional[Tag[dict]] = None
    "Specifies a category that the entry belongs to."

    contributor: Optional[Tag[dict]] = None
    "Email, name, and URI of the contributors of the entry."

    rights: Optional[Tag[str]] = None
    "The copyright of the entry."

    published: Optional[Tag[DateTimeOrStr]] = None
    "Indicates when the entry was published."

    # Atom's <source> is a container of child elements (id, title, updated, ...),
    # not text, so it is typed as a mapping like the other nested elements here.
    source: Optional[Tag[dict]] = None
    "Contains metadata from the source feed if this entry is a copy."


class Entry(RequiredAtomEntryMixin, RecommendedAtomEntryMixin, OptionalAtomEntryMixin, XMLBaseModel):
    """Atom ``entry`` element, assembled from the required, recommended and optional field mixins.

    Reference: https://validator.w3.org/feed/docs/atom.html
    """
61 changes: 61 additions & 0 deletions rss_parser/models/atom/feed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.atom.entry import Entry
from rss_parser.models.image import Image
from rss_parser.models.types.date import DateTimeOrStr
from rss_parser.models.types.only_list import OnlyList
from rss_parser.models.types.tag import Tag
from rss_parser.pydantic_proxy import import_v1_pydantic

pydantic = import_v1_pydantic()


class RequiredAtomFeedMixin(XMLBaseModel):
    """Non-optional feed fields: ``id`` (exposed as ``feed_id``), ``title`` and ``updated``."""

    feed_id: Tag[str] = pydantic.Field(..., alias="id")
    "Identifies the feed using a universally unique and permanent URI."

    title: Tag[str]
    "Contains a human readable title for the feed."

    updated: Tag[DateTimeOrStr]
    "Indicates the last time the feed was modified in a significant way."


class RecommendedAtomFeedMixin(XMLBaseModel):
    """Feed fields that may be absent: ``author`` and ``link``; both default to ``None``."""

    # An Atom author is a Person construct (name / uri / email child elements),
    # so it is typed as a mapping, the same way the entry model types it.
    # NOTE(review): a feed with several <author> elements likely arrives as a
    # list and is not covered by this type - confirm against the Tag implementation.
    author: Optional[Tag[dict]] = None
    "Names one author of the feed. A feed may have multiple author elements."

    link: Optional[Tag[list]] = None
    "The URL to the feed. A feed may have multiple link elements."


class OptionalAtomFeedMixin(XMLBaseModel):
    """Rarely used feed fields plus the feed's entries; every field has a default."""

    # Populated from the ``entry`` elements of the document; empty when there are none.
    entries: Optional[OnlyList[Tag[Entry]]] = pydantic.Field(alias="entry", default=[])
    "The entries in the feed. A feed may have multiple entry elements."

    category: Optional[Tag[str]] = None
    "Specifies a category that the feed belongs to. The feed may have multiple category elements."

    # A contributor is a Person construct (name / uri / email child elements),
    # so it is typed as a mapping, the same way the entry model types it.
    contributor: Optional[Tag[dict]] = None
    "Names one contributor to the feed. A feed may have multiple contributor elements."

    generator: Optional[Tag[str]] = None
    "Identifies the software used to generate the feed, for debugging and other purposes."

    # Atom's <icon> and <logo> carry a bare IRI as text, unlike the RSS <image>
    # element with its url/title/link children, so they are plain strings here.
    icon: Optional[Tag[str]] = None
    "Identifies a small image which provides iconic visual identification for the feed. Icons should be square."

    logo: Optional[Tag[str]] = None
    (
        "Identifies a larger image which provides visual identification for the feed. "
        "Images should be twice as wide as they are tall."
    )

    rights: Optional[Tag[str]] = None
    "The copyright of the feed."

    subtitle: Optional[Tag[str]] = None
    "Contains a human readable description or subtitle for the feed."


class Feed(RequiredAtomFeedMixin, RecommendedAtomFeedMixin, OptionalAtomFeedMixin, XMLBaseModel):
    """Atom ``feed`` element, assembled from the required, recommended and optional field mixins.

    Reference: https://validator.w3.org/feed/docs/atom.html
    """
3 changes: 3 additions & 0 deletions rss_parser/models/rss/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .rss import RSS

__all__ = ("RSS",)
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from rss_parser.models import XMLBaseModel
from rss_parser.models.image import Image
from rss_parser.models.item import Item
from rss_parser.models.text_input import TextInput
from rss_parser.models.rss.item import Item
from rss_parser.models.rss.text_input import TextInput
from rss_parser.models.types.date import DateTimeOrStr
from rss_parser.models.types.only_list import OnlyList
from rss_parser.models.types.tag import Tag
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion rss_parser/models/rss.py → rss_parser/models/rss/rss.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Optional

from rss_parser.models import XMLBaseModel
from rss_parser.models.channel import Channel
from rss_parser.models.rss.channel import Channel
from rss_parser.models.types.tag import Tag
from rss_parser.pydantic_proxy import import_v1_pydantic

Expand Down
File renamed without changes.
8 changes: 4 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def sample_and_result(request):
return sample.read(), loads(result.read())


@pytest.fixture
def atom_feed():
with open(sample_dir / "atom.xml") as f:
return f.read()
# @pytest.fixture
ddkasa marked this conversation as resolved.
Show resolved Hide resolved
# def atom_feed():
# with open(sample_dir / "atom.xml") as f:
# return f.read()
134 changes: 134 additions & 0 deletions tests/samples/atom.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
{
"feed": {
"attributes": {},
"content": {
"author": null,
"category": null,
"contributor": null,
"entries": [
{
"attributes": {},
"content": {
"author": {
"attributes": {},
"content": {
"email": "[email protected]",
"name": "John Doe",
"uri": "http://example.org/"
}
},
"category": null,
"content": {
"attributes": {
"type": "xhtml",
"xml:base": "http://diveintomark.org/",
"xml:lang": "en"
},
"content": {
"@type": "xhtml",
"@xml:base": "http://diveintomark.org/",
"@xml:lang": "en",
"div": {
"p": {
"i": "[Update: The Atom draft is finished.]"
}
}
}
},
"contributor": {
"attributes": {},
"content": {
"name": "John Doe"
}
},
"entry_id": {
"attributes": {},
"content": "tag:example.org,2003:3.2397"
},
"link": {
"attributes": {},
"content": [
{
"@href": "http://example.org/2005/04/02/atom",
"@rel": "alternate",
"@type": "text/html"
},
{
"@href": "http://example.org/audio/ph34r_my_podcast.mp3",
"@length": "1337",
"@rel": "enclosure",
"@type": "audio/mpeg"
}
]
},
"published": {
"attributes": {},
"content": "2003-12-13 08:29:29-04:00"
},
"rights": null,
"source": null,
"summary": null,
"title": {
"attributes": {},
"content": "Atom draft-07 snapshot"
},
"updated": {
"attributes": {},
"content": "2005-07-31T12:29:29Z"
}
}
}
],
"feed_id": {
"attributes": {},
"content": "tag:example.org,2003:3"
},
"generator": {
"attributes": {
"uri": "http://www.example.com/",
"version": "1.0"
},
"content": "Example Toolkit"
},
"icon": null,
"link": {
"attributes": {},
"content": [
{
"@href": "http://example.org/",
"@hreflang": "en",
"@rel": "alternate",
"@type": "text/html"
},
{
"@href": "http://example.org/feed.atom",
"@rel": "self",
"@type": "application/atom+xml"
}
]
},
"logo": null,
"rights": {
"attributes": {},
"content": "Copyright (c) 2003, John Doe"
},
"subtitle": {
"attributes": {
"type": "html"
},
"content": "A <em>lot</em> of effort\n went into making this effortless"
},
"title": {
"attributes": {
"type": "text"
},
"content": "Title"
},
"updated": {
"attributes": {},
"content": "2005-07-31 12:29:29+00:00"
}
}
},
"version": null
}
10 changes: 5 additions & 5 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

@pytest.mark.parametrize(
"sample_and_result",
[["rss_2"], ["rss_2_no_category_attr"], ["apology_line"], ["rss_2_with_1_item"]],
[["rss_2"], ["rss_2_no_category_attr"], ["apology_line"], ["rss_2_with_1_item"], ["atom"]],
indirect=True,
)
def test_parses_all_samples(sample_and_result):
Expand Down Expand Up @@ -38,7 +38,7 @@ def test_json_plain_ignores_attributes(sample_and_result):
assert left == right


def test_fails_atom_feed(atom_feed):
ddkasa marked this conversation as resolved.
Show resolved Hide resolved
# Expect ATOM feed to fail since it's not supported
with pytest.raises(NotImplementedError):
Parser.parse(atom_feed)
# def test_fails_atom_feed(atom_feed):
# # Expect ATOM feed to fail since it's not supported
# with pytest.raises(NotImplementedError):
# Parser.parse(atom_feed)
Loading