From c2f15e8947183ca4d1f5581bda72260da4894f7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Tue, 6 Feb 2024 11:58:15 +0100
Subject: [PATCH] Initial draft

Move the spider parameter definitions into a new
zyte_spider_templates.params module: AllParams declares every supported
parameter once, and the make_params() factory builds per-spider
parameter models from a chosen subset of those fields.
EcommerceSpiderParams is now generated through make_params(). A usage
sketch for make_params() follows the diff.

---
 docs/reference/index.rst                   |  20 +--
 docs/templates/e-commerce.rst              |   6 -
 setup.py                                   |   2 +-
 tests/test_ecommerce.py                    |  14 +-
 zyte_spider_templates/__init__.py          |   3 +-
 zyte_spider_templates/params.py            | 147 +++++++++++++++
 zyte_spider_templates/spiders/base.py      |  44 +-----
 zyte_spider_templates/spiders/ecommerce.py |  96 +++-----------
 8 files changed, 178 insertions(+), 154 deletions(-)
 create mode 100644 zyte_spider_templates/params.py

diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index 4fbf2c7..913df2a 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -5,13 +5,16 @@ Reference
 Base classes
 ============
 
-.. autopydantic_model:: zyte_spider_templates.spiders.base.BaseSpiderParams
+.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
+
+.. autopydantic_model:: zyte_spider_templates.params.AllParams
    :inherited-members: BaseModel
 
-.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
+.. autoenum:: zyte_spider_templates.params.CrawlStrategy
 
-.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
-   :noindex:
+.. autoenum:: zyte_spider_templates.params.ExtractFrom
+
+.. autoenum:: zyte_spider_templates.params.Geolocation
 
 E-commerce
 ==========
@@ -20,15 +23,6 @@
 .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams
    :noindex:
    :inherited-members: BaseModel
 
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
-   :noindex:
-
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-   :noindex:
-
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.EcommerceSpider
-   :noindex:
-
 Pages
 =====
diff --git a/docs/templates/e-commerce.rst b/docs/templates/e-commerce.rst
index e2a8684..efa5475 100644
--- a/docs/templates/e-commerce.rst
+++ b/docs/templates/e-commerce.rst
@@ -16,9 +16,3 @@ Parameters
 
 .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams
    :inherited-members: BaseModel
-
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
-
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-
-.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
diff --git a/setup.py b/setup.py
index ee39b27..9cdfb89 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
         "pydantic>=2",
         "scrapy>=2.11.0",
         "scrapy-poet>=0.20.1",
-        "scrapy-spider-metadata>=0.1.2",
+        "scrapy-spider-metadata @ git+https://github.com/Gallaecio/scrapy-spider-metadata.git@param-inheritance",
         "scrapy-zyte-api[provider]>=0.15.0",
         "zyte-common-items>=0.13.0",
     ],
diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py
index 41edff9..2ec7154 100644
--- a/tests/test_ecommerce.py
+++ b/tests/test_ecommerce.py
@@ -9,16 +9,13 @@
 from scrapy_spider_metadata import get_spider_metadata
 from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request
 
-from zyte_spider_templates import BaseSpiderParams
 from zyte_spider_templates._geolocations import (
     GEOLOCATION_OPTIONS,
     GEOLOCATION_OPTIONS_WITH_CODE,
     Geolocation,
 )
-from zyte_spider_templates.spiders.ecommerce import (
-    EcommerceCrawlStrategy,
-    EcommerceSpider,
-)
+from zyte_spider_templates.params import AllParams, CrawlStrategy
+from zyte_spider_templates.spiders.ecommerce import EcommerceSpider
 
 from . import get_crawler
 from .test_utils import URL_TO_DOMAIN
 
@@ -29,9 +26,7 @@ def test_parameters():
         EcommerceSpider()
 
     EcommerceSpider(url="https://example.com")
-    EcommerceSpider(
-        url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.full
-    )
+    EcommerceSpider(url="https://example.com", crawl_strategy=CrawlStrategy.full)
     EcommerceSpider(url="https://example.com", crawl_strategy="full")
 
     with pytest.raises(ValidationError):
@@ -354,6 +349,7 @@ def test_metadata():
         "title": "E-commerce",
         "description": "Template for spiders that extract product data from e-commerce websites.",
         "param_schema": {
+            "additionalProperties": False,
             "properties": {
                 "crawl_strategy": {
                     "default": "navigation",
@@ -494,7 +490,7 @@
     ],
 )
 def test_validation_url(url, valid):
-    url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern
+    url_re = AllParams.model_fields["url"].metadata[0].pattern
     assert bool(re.match(url_re, url)) == valid
diff --git a/zyte_spider_templates/__init__.py b/zyte_spider_templates/__init__.py
index e3de8c9..fd223db 100644
--- a/zyte_spider_templates/__init__.py
+++ b/zyte_spider_templates/__init__.py
@@ -1,2 +1,3 @@
-from .spiders.base import BaseSpider, BaseSpiderParams
+from .params import make_params
+from .spiders.base import BaseSpider
 from .spiders.ecommerce import EcommerceSpider
diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py
new file mode 100644
index 0000000..cbf7372
--- /dev/null
+++ b/zyte_spider_templates/params.py
@@ -0,0 +1,147 @@
+from copy import deepcopy
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict, Field, create_model
+from pydantic.fields import PydanticUndefined
+
+from zyte_spider_templates._geolocations import (
+    GEOLOCATION_OPTIONS_WITH_CODE,
+    Geolocation,
+)
+from zyte_spider_templates.documentation import document_enum
+
+
+@document_enum
+class CrawlStrategy(str, Enum):
+    full: str = "full"
+    """Follow most links within the domain of the start URL in an attempt to
+    discover and extract as many products as possible."""
+
+    navigation: str = "navigation"
+    """Follow pagination, subcategories, and product detail pages."""
+
+    pagination_only: str = "pagination_only"
+    """Follow pagination and product detail pages. Subcategory links are
+    ignored. Use this when subcategory links are misidentified by ML
+    extraction."""
+
+
+@document_enum
+class ExtractFrom(str, Enum):
+    httpResponseBody: str = "httpResponseBody"
+    """Use HTTP responses. A cost-efficient and fast extraction method that
+    works well on many websites."""
+
+    browserHtml: str = "browserHtml"
+    """Use browser rendering. Often provides the best quality."""
+
+
+class AllParams(BaseModel):
+    url: str = Field(
+        title="URL",
+        description="Initial URL for the crawl. Enter the full URL including http(s); "
+        "you can copy and paste it from your browser. Example: https://toscrape.com/",
+        pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
+    )
+    geolocation: Optional[Geolocation] = Field(
+        title="Geolocation",
+        description="ISO 3166-1 alpha-2 2-character string as specified in "
+        "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
+        default=None,
+        json_schema_extra={
+            "enumMeta": {
+                code: {
+                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
+                }
+                for code in Geolocation
+            }
+        },
+    )
+    max_requests: Optional[int] = Field(
+        description=(
+            "The maximum number of Zyte API requests allowed for the crawl.\n"
+            "\n"
+            "Requests with error responses that cannot be retried or exceed "
+            "their retry limit also count here, but they incur no costs "
+            "and do not increase the request count in Scrapy Cloud."
+        ),
+        default=100,
+        json_schema_extra={
+            "widget": "request-limit",
+        },
+    )
+    crawl_strategy: CrawlStrategy = Field(
+        title="Crawl strategy",
+        description="Determines how the start URL and follow-up URLs are crawled.",
+        default=CrawlStrategy.navigation,
+        json_schema_extra={
+            "enumMeta": {
+                CrawlStrategy.full: {
+                    "title": "Full",
+                    "description": "Follow most links within the domain of the start URL in an attempt to discover and extract as many products as possible.",
+                },
+                CrawlStrategy.navigation: {
+                    "title": "Navigation",
+                    "description": "Follow pagination, subcategories, and product detail pages.",
+                },
+                CrawlStrategy.pagination_only: {
+                    "title": "Pagination Only",
+                    "description": (
+                        "Follow pagination and product detail pages. Subcategory links are ignored. "
+                        "Use this when subcategory links are misidentified by ML extraction."
+                    ),
+                },
+            },
+        },
+    )
+    extract_from: Optional[ExtractFrom] = Field(
+        title="Extraction source",
+        description=(
+            "Whether to perform extraction using a browser request "
+            "(browserHtml) or an HTTP request (httpResponseBody)."
+        ),
+        default=None,
+        json_schema_extra={
+            "enumMeta": {
+                ExtractFrom.browserHtml: {
+                    "title": "browserHtml",
+                    "description": "Use browser rendering. Often provides the best quality.",
+                },
+                ExtractFrom.httpResponseBody: {
+                    "title": "httpResponseBody",
+                    "description": "Use HTTP responses. A cost-efficient and fast extraction method that works well on many websites.",
+                },
+            },
+        },
+    )
+
+
+def make_params(
+    cls_name,
+    params,
+    *,
+    default=None,
+    required=None,
+    set_args=None,
+):
+    fields = {}
+    default = default or {}
+    required = set(required) if required else set()
+    for param in params:
+        # Copy the field so that AllParams itself is never mutated.
+        field = deepcopy(AllParams.model_fields[param])
+        if param in required:
+            field.default = PydanticUndefined
+        else:
+            try:
+                field.default = default[param]
+            except KeyError:
+                pass
+        fields[param] = (field.annotation, field)
+    model = create_model(
+        cls_name,
+        __config__=ConfigDict(extra="forbid"),
+        **fields,
+    )
+    if set_args:
+        model.set_args = set_args
+    return model
diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py
index e64faa0..0cb9f9c 100644
--- a/zyte_spider_templates/spiders/base.py
+++ b/zyte_spider_templates/spiders/base.py
@@ -1,55 +1,13 @@
 from importlib.metadata import version
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
 import scrapy
-from pydantic import BaseModel, Field
 from scrapy.crawler import Crawler
 
-from zyte_spider_templates._geolocations import (
-    GEOLOCATION_OPTIONS_WITH_CODE,
-    Geolocation,
-)
-
 # Higher priority than command-line-defined settings (40).
 ARG_SETTING_PRIORITY: int = 50
 
 
-class BaseSpiderParams(BaseModel):
-    url: str = Field(
-        title="URL",
-        description="Initial URL for the crawl. Enter the full URL including http(s), "
-        "you can copy and paste it from your browser. Example: https://toscrape.com/",
-        pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
-    )
-    geolocation: Optional[Geolocation] = Field(
-        title="Geolocation",
-        description="ISO 3166-1 alpha-2 2-character string specified in "
-        "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
-        default=None,
-        json_schema_extra={
-            "enumMeta": {
-                code: {
-                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
-                }
-                for code in Geolocation
-            }
-        },
-    )
-    max_requests: Optional[int] = Field(
-        description=(
-            "The maximum number of Zyte API requests allowed for the crawl.\n"
-            "\n"
-            "Requests with error responses that cannot be retried or exceed "
-            "their retry limit also count here, but they incur in no costs "
-            "and do not increase the request count in Scrapy Cloud."
-        ),
-        default=100,
-        json_schema_extra={
-            "widget": "request-limit",
-        },
-    )
-
-
 class BaseSpider(scrapy.Spider):
     custom_settings: Dict[str, Any] = {
         "ZYTE_API_TRANSPARENT_MODE": True,
diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py
index 4de10ea..ee298d5 100644
--- a/zyte_spider_templates/spiders/ecommerce.py
+++ b/zyte_spider_templates/spiders/ecommerce.py
@@ -1,93 +1,27 @@
-from enum import Enum
 from typing import Any, Callable, Dict, Iterable, Optional, Union
 
 import scrapy
-from pydantic import Field
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy_poet import DummyResponse
 from scrapy_spider_metadata import Args
 from zyte_common_items import ProbabilityRequest, Product, ProductNavigation
 
-from zyte_spider_templates.documentation import document_enum
-from zyte_spider_templates.spiders.base import (
-    ARG_SETTING_PRIORITY,
-    BaseSpider,
-    BaseSpiderParams,
-)
+from zyte_spider_templates.params import CrawlStrategy, make_params
+from zyte_spider_templates.spiders.base import ARG_SETTING_PRIORITY, BaseSpider
 from zyte_spider_templates.utils import get_domain
 
-
-@document_enum
-class EcommerceCrawlStrategy(str, Enum):
-    full: str = "full"
-    """Follow most links within the domain of URL in an attempt to discover and
-    extract as many products as possible."""
-
-    navigation: str = "navigation"
-    """Follow pagination, subcategories, and product detail pages."""
-
-    pagination_only: str = "pagination_only"
-    """Follow pagination and product detail pages. SubCategory links are
-    ignored. Use this when some subCategory links are misidentified by
-    ML-extraction."""
-
-
-@document_enum
-class ExtractFrom(str, Enum):
-    httpResponseBody: str = "httpResponseBody"
-    """Use HTTP responses. Cost-efficient and fast extraction method, which
-    works well on many websites."""
-
-    browserHtml: str = "browserHtml"
-    """Use browser rendering. Often provides the best quality."""
-
-
-class EcommerceSpiderParams(BaseSpiderParams):
-    crawl_strategy: EcommerceCrawlStrategy = Field(
-        title="Crawl strategy",
-        description="Determines how the start URL and follow-up URLs are crawled.",
-        default=EcommerceCrawlStrategy.navigation,
-        json_schema_extra={
-            "enumMeta": {
-                EcommerceCrawlStrategy.full: {
-                    "title": "Full",
-                    "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
-                },
-                EcommerceCrawlStrategy.navigation: {
-                    "title": "Navigation",
-                    "description": "Follow pagination, subcategories, and product detail pages.",
-                },
-                EcommerceCrawlStrategy.pagination_only: {
-                    "title": "Pagination Only",
-                    "description": (
-                        "Follow pagination and product detail pages. SubCategory links are ignored. "
-                        "Use this when some subCategory links are misidentified by ML-extraction."
-                    ),
-                },
-            },
-        },
-    )
-    extract_from: Optional[ExtractFrom] = Field(
-        title="Extraction source",
-        description=(
-            "Whether to perform extraction using a browser request "
-            "(browserHtml) or an HTTP request (httpResponseBody)."
-        ),
-        default=None,
-        json_schema_extra={
-            "enumMeta": {
-                ExtractFrom.browserHtml: {
-                    "title": "browserHtml",
-                    "description": "Use browser rendering. Often provides the best quality.",
-                },
-                ExtractFrom.httpResponseBody: {
-                    "title": "httpResponseBody",
-                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
-                },
-            },
-        },
-    )
+EcommerceSpiderParams = make_params(
+    "EcommerceSpiderParams",
+    [
+        "url",
+        "geolocation",
+        "max_requests",
+        "crawl_strategy",
+        "extract_from",
+    ],
+)
 
 
 class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider):
@@ -144,7 +78,7 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
 
     def start_requests(self) -> Iterable[Request]:
         page_params = {}
-        if self.args.crawl_strategy == EcommerceCrawlStrategy.full:
+        if self.args.crawl_strategy == CrawlStrategy.full:
             page_params = {"full_domain": self.allowed_domains[0]}
 
         yield Request(
@@ -174,7 +108,7 @@ def parse_navigation(
         else:
             yield self.get_nextpage_request(navigation.nextPage)
 
-        if self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only:
+        if self.args.crawl_strategy != CrawlStrategy.pagination_only:
             for request in navigation.subCategories or []:
                 yield self.get_subcategory_request(request, page_params=page_params)
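
Usage sketch for make_params(): the spider name, the parameter subset,
and the default override below are illustrative assumptions, not code
from this patch:

    from scrapy_spider_metadata import Args

    from zyte_spider_templates.params import make_params
    from zyte_spider_templates.spiders.base import BaseSpider

    # Build a params model from three of the AllParams fields, lowering
    # the default of max_requests from 100 to 50 for this spider only.
    MyParams = make_params(
        "MyParams",
        ["url", "geolocation", "max_requests"],
        default={"max_requests": 50},
    )

    class MySpider(Args[MyParams], BaseSpider):
        name = "my_spider"

Because the generated model is created with extra="forbid", passing a
parameter outside the chosen subset (for example, crawl_strategy here)
raises a ValidationError instead of being silently ignored.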
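To check the schema change locally, something along these lines should
work (get_spider_metadata is the same helper the updated test imports;
the assertion mirrors the new expectation in test_metadata):

    from scrapy_spider_metadata import get_spider_metadata

    from zyte_spider_templates import EcommerceSpider

    schema = get_spider_metadata(EcommerceSpider)["param_schema"]
    # Unknown parameters must now be rejected by the generated model.
    assert schema["additionalProperties"] is False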