Skip to content

Commit

Permalink
Merge pull request #69 from Gallaecio/product-list-product-extractor
Browse files Browse the repository at this point in the history
Provide extractors
  • Loading branch information
kmike authored Jan 24, 2024
2 parents 347b851 + 8d57a99 commit 8c5d3ca
Show file tree
Hide file tree
Showing 8 changed files with 222 additions and 3 deletions.
22 changes: 22 additions & 0 deletions docs/reference/extractors.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
.. _extractor-api:

=============
Extractor API
=============

API reference of provided :ref:`extractors <extractors>`.

Product from list
=================

.. autoclass:: zyte_common_items.ProductFromListExtractor

.. autoclass:: zyte_common_items.ProductFromListSelectorExtractor


Product variant
===============

.. autoclass:: zyte_common_items.ProductVariantExtractor

.. autoclass:: zyte_common_items.ProductVariantSelectorExtractor
1 change: 1 addition & 0 deletions docs/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ Reference

items
pages
extractors
components
adapter
22 changes: 22 additions & 0 deletions docs/usage/pages.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,28 @@ extra fields. For example:
async def foo(self):
return "bar"
.. _extractors:

Extractors
==========

For some nested fields (:class:`~.ProductFromList`, :class:`~.ProductVariant`),
:ref:`base extractors <default-processors-nested>` exist that you can subclass
to write your own extractors.

They provide the following baseline:

- They declare the :ref:`item class <items>` that they return, allowing for
their ``to_item`` method to automatically build an instance of it from
``@field``-decorated methods. See :ref:`fields`.

- They also provide default :ref:`processors <processors>` for some
item-specific fields.

See :ref:`extractor-api`.

.. _processors:

Field processors
================

Expand Down
121 changes: 121 additions & 0 deletions tests/test_extractors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import attrs
import pytest
from parsel import Selector
from web_poet import field

from zyte_common_items import (
ProductFromList,
ProductFromListExtractor,
ProductFromListSelectorExtractor,
ProductVariant,
ProductVariantExtractor,
ProductVariantSelectorExtractor,
)

from .test_processors import gtin_expected, gtin_str


@pytest.mark.asyncio
async def test_product_from_list_extractor():
    """A ProductFromListExtractor subclass builds a processed ProductFromList.

    The ``price`` and ``regularPrice`` fields return raw selector matches;
    the item is expected to carry them as the strings "10.00" and "20.00".
    """

    @attrs.define
    class ListingExtractor(ProductFromListExtractor):
        # The plain Extractor base receives no input of its own here, so the
        # selector is declared as an attrs attribute.
        selector: Selector

        @field
        def price(self):
            matches = self.selector.css("price")
            return matches

        @field
        def regularPrice(self):
            matches = self.selector.css("oldPrice")
            return matches

    markup = "<data><price>10€</price><oldPrice>20€</oldPrice></data>"
    extractor = ListingExtractor(Selector(markup))
    item = await extractor.to_item()

    assert isinstance(item, ProductFromList)
    assert item.price == "10.00"
    assert item.regularPrice == "20.00"


@pytest.mark.asyncio
async def test_product_from_list_selector_extractor():
    """A ProductFromListSelectorExtractor subclass builds a processed item.

    The extractor is constructed directly from a selector and queried through
    its own ``css`` method; the resulting ProductFromList is expected to hold
    "10.00" and "20.00" for ``price`` and ``regularPrice``.
    """

    class ListingSelectorExtractor(ProductFromListSelectorExtractor):
        @field
        def price(self):
            matches = self.css("price")
            return matches

        @field
        def regularPrice(self):
            matches = self.css("oldPrice")
            return matches

    markup = "<data><price>10€</price><oldPrice>20€</oldPrice></data>"
    extractor = ListingSelectorExtractor(Selector(markup))
    item = await extractor.to_item()

    assert isinstance(item, ProductFromList)
    assert item.price == "10.00"
    assert item.regularPrice == "20.00"


@pytest.mark.asyncio
async def test_product_variant_extractor():
    """A ProductVariantExtractor subclass builds a processed ProductVariant.

    Besides the two price fields, the ``gtin`` field is expected to come out
    equal to ``gtin_expected`` when fed ``gtin_str``. The ``<unusedField>``
    element has no matching field and is not checked.
    """

    @attrs.define
    class VariantExtractor(ProductVariantExtractor):
        # The plain Extractor base receives no input of its own here, so the
        # selector is declared as an attrs attribute.
        selector: Selector

        @field
        def gtin(self):
            matches = self.selector.css("gtin")
            return matches

        @field
        def price(self):
            matches = self.selector.css("price")
            return matches

        @field
        def regularPrice(self):
            matches = self.selector.css("oldPrice")
            return matches

    # Only the GTIN element needs interpolation.
    markup = (
        "<data>"
        "<price>10€</price>"
        "<oldPrice>20€</oldPrice>"
        "<unusedField>foo</unusedField>"
        f"<gtin>{gtin_str}</gtin>"
        "</data>"
    )
    extractor = VariantExtractor(Selector(markup))
    item = await extractor.to_item()

    assert isinstance(item, ProductVariant)
    assert item.gtin == gtin_expected
    assert item.price == "10.00"
    assert item.regularPrice == "20.00"


@pytest.mark.asyncio
async def test_product_variant_selector_extractor():
    """A ProductVariantSelectorExtractor subclass builds a processed item.

    The extractor is constructed directly from a selector and queried through
    its own ``css`` method. The resulting ProductVariant is expected to carry
    the processed ``gtin``, ``price`` and ``regularPrice`` values; the
    ``<unusedField>`` element has no matching field and is not checked.
    """

    class VariantSelectorExtractor(ProductVariantSelectorExtractor):
        @field
        def gtin(self):
            matches = self.css("gtin")
            return matches

        @field
        def price(self):
            matches = self.css("price")
            return matches

        @field
        def regularPrice(self):
            matches = self.css("oldPrice")
            return matches

    # Only the GTIN element needs interpolation.
    markup = (
        "<data>"
        "<price>10€</price>"
        "<oldPrice>20€</oldPrice>"
        "<unusedField>foo</unusedField>"
        f"<gtin>{gtin_str}</gtin>"
        "</data>"
    )
    extractor = VariantSelectorExtractor(Selector(markup))
    item = await extractor.to_item()

    assert isinstance(item, ProductVariant)
    assert item.gtin == gtin_expected
    assert item.price == "10.00"
    assert item.regularPrice == "20.00"
4 changes: 3 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ setenv =
READTHEDOCS_PROJECT=zyte-common-items
READTHEDOCS_VERSION=main
commands =
sphinx-build -nW -b html . {envtmpdir}/html
sphinx-build -n -b html . {envtmpdir}/html
# Temporarily removed -W due to
# https://github.com/scrapinghub/web-poet/pull/194

[testenv:mypy]
basepython = python3.11
Expand Down
6 changes: 6 additions & 0 deletions zyte_common_items/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@
StarRating,
Video,
)
from .extractors import (
ProductFromListExtractor,
ProductFromListSelectorExtractor,
ProductVariantExtractor,
ProductVariantSelectorExtractor,
)
from .items import (
Article,
ArticleFromList,
Expand Down
43 changes: 43 additions & 0 deletions zyte_common_items/extractors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from web_poet import Extractor, SelectorExtractor

from .items import ProductFromList, ProductVariant
from .processors import gtin_processor, price_processor, simple_price_processor


class _ProductProcessors:
    """Default processors for the price fields of product-like items.

    Meant to be used as the base of the nested ``Processors`` class of an
    extractor. Each attribute is named after an item field and holds the
    list of processors for that field.
    """

    # Processors for the ``price`` field.
    price = [price_processor]

    # Processors for the ``regularPrice`` field.
    regularPrice = [simple_price_processor]


class ProductFromListExtractor(Extractor[ProductFromList]):
    """:class:`~web_poet.pages.Extractor` that returns
    :class:`ProductFromList` items.

    Its nested ``Processors`` class provides default processors for the
    ``price`` and ``regularPrice`` fields.
    """

    class Processors(_ProductProcessors):
        """Default field processors, inherited without changes."""


class ProductFromListSelectorExtractor(SelectorExtractor[ProductFromList]):
    """:class:`~web_poet.pages.SelectorExtractor` that returns
    :class:`ProductFromList` items.

    Its nested ``Processors`` class provides default processors for the
    ``price`` and ``regularPrice`` fields.
    """

    class Processors(_ProductProcessors):
        """Default field processors, inherited without changes."""


class _ProductVariantProcessors(_ProductProcessors):
    """Default processors for product variants.

    Extends the price processors inherited from ``_ProductProcessors`` with
    processors for the ``gtin`` field.
    """

    # Processors for the ``gtin`` field.
    gtin = [gtin_processor]


class ProductVariantExtractor(Extractor[ProductVariant]):
    """:class:`~web_poet.pages.Extractor` that returns
    :class:`ProductVariant` items.

    Its nested ``Processors`` class provides default processors for the
    ``gtin``, ``price`` and ``regularPrice`` fields.
    """

    class Processors(_ProductVariantProcessors):
        """Default field processors, inherited without changes."""


class ProductVariantSelectorExtractor(SelectorExtractor[ProductVariant]):
    """:class:`~web_poet.pages.SelectorExtractor` that returns
    :class:`ProductVariant` items.

    Its nested ``Processors`` class provides default processors for the
    ``gtin``, ``price`` and ``regularPrice`` fields.
    """

    class Processors(_ProductVariantProcessors):
        """Default field processors, inherited without changes."""
6 changes: 4 additions & 2 deletions zyte_common_items/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,8 @@ class ArticleList(Item):
class ProductVariant(Item):
""":class:`Product` variant.
See :attr:`Product.variants`.
See :attr:`Product.variants`, :class:`ProductVariantExtractor`,
:class:`ProductVariantSelectorExtractor`.
"""

#: List of name-value pairs of data about a specific, otherwise unmapped
Expand Down Expand Up @@ -634,7 +635,8 @@ class ProductFromList(Item):
"""Product from a product list from a product listing page of an e-commerce
webpage.
See :class:`ProductList`.
See :class:`ProductList`, :class:`ProductFromListExtractor`,
:class:`ProductFromListSelectorExtractor`.
"""

#: Price currency `ISO 4217`_ alphabetic code (e.g. ``"USD"``).
Expand Down

0 comments on commit 8c5d3ca

Please sign in to comment.