From c2f15e8947183ca4d1f5581bda72260da4894f7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Tue, 6 Feb 2024 11:58:15 +0100
Subject: [PATCH] Initial draft

Move the spider parameter definitions into a new
zyte_spider_templates.params module: AllParams declares every supported
parameter once, and the make_params() factory builds per-spider
parameter models from a chosen subset of those fields.
EcommerceSpiderParams is now generated through make_params(). A usage
sketch for make_params() follows the diff.

---
 docs/reference/index.rst                   |  20 +--
 docs/templates/e-commerce.rst              |   6 -
 setup.py                                   |   2 +-
 tests/test_ecommerce.py                    |  14 +-
 zyte_spider_templates/__init__.py          |   3 +-
 zyte_spider_templates/params.py            | 147 +++++++++++++++
 zyte_spider_templates/spiders/base.py      |  44 +-----
 zyte_spider_templates/spiders/ecommerce.py |  96 +++-----------
 8 files changed, 178 insertions(+), 154 deletions(-)
 create mode 100644 zyte_spider_templates/params.py

diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index 4fbf2c7..913df2a 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -5,13 +5,16 @@ Reference
 Base classes
 ============
 
-.. autopydantic_model:: zyte_spider_templates.spiders.base.BaseSpiderParams
+.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
+
+.. autopydantic_model:: zyte_spider_templates.params.AllParams
    :inherited-members: BaseModel
 
-.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
+.. autoenum:: zyte_spider_templates.params.CrawlStrategy
 
-.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
-   :noindex:
+.. autoenum:: zyte_spider_templates.params.ExtractFrom
+
+.. autoenum:: zyte_spider_templates.params.Geolocation
 
 E-commerce
 ==========
@@ -20,15 +23,6 @@
 .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams
    :noindex:
    :inherited-members: BaseModel
 
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
-   :noindex:
-
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-   :noindex:
-
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.EcommerceSpider
-   :noindex:
-
 Pages
 =====
diff --git a/docs/templates/e-commerce.rst b/docs/templates/e-commerce.rst
index e2a8684..efa5475 100644
--- a/docs/templates/e-commerce.rst
+++ b/docs/templates/e-commerce.rst
@@ -16,9 +16,3 @@ Parameters
 
 .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams
    :inherited-members: BaseModel
-
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
-
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-
-.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
diff --git a/setup.py b/setup.py
index ee39b27..9cdfb89 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
         "pydantic>=2",
         "scrapy>=2.11.0",
         "scrapy-poet>=0.20.1",
-        "scrapy-spider-metadata>=0.1.2",
+        "scrapy-spider-metadata @ git+https://github.com/Gallaecio/scrapy-spider-metadata.git@param-inheritance",
         "scrapy-zyte-api[provider]>=0.15.0",
         "zyte-common-items>=0.13.0",
     ],
diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py
index 41edff9..2ec7154 100644
--- a/tests/test_ecommerce.py
+++ b/tests/test_ecommerce.py
@@ -9,16 +9,13 @@
 from scrapy_spider_metadata import get_spider_metadata
 from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request
 
-from zyte_spider_templates import BaseSpiderParams
 from zyte_spider_templates._geolocations import (
     GEOLOCATION_OPTIONS,
     GEOLOCATION_OPTIONS_WITH_CODE,
     Geolocation,
 )
-from zyte_spider_templates.spiders.ecommerce import (
-    EcommerceCrawlStrategy,
-    EcommerceSpider,
-)
+from zyte_spider_templates.params import AllParams, CrawlStrategy
+from zyte_spider_templates.spiders.ecommerce import EcommerceSpider
 
 from . import get_crawler
 from .test_utils import URL_TO_DOMAIN
 
@@ -29,9 +26,7 @@ def test_parameters():
         EcommerceSpider()
 
     EcommerceSpider(url="https://example.com")
-    EcommerceSpider(
-        url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.full
-    )
+    EcommerceSpider(url="https://example.com", crawl_strategy=CrawlStrategy.full)
     EcommerceSpider(url="https://example.com", crawl_strategy="full")
 
     with pytest.raises(ValidationError):
@@ -354,6 +349,7 @@ def test_metadata():
         "title": "E-commerce",
         "description": "Template for spiders that extract product data from e-commerce websites.",
         "param_schema": {
+            "additionalProperties": False,
             "properties": {
                 "crawl_strategy": {
                     "default": "navigation",
@@ -494,7 +490,7 @@
     ],
 )
 def test_validation_url(url, valid):
-    url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern
+    url_re = AllParams.model_fields["url"].metadata[0].pattern
     assert bool(re.match(url_re, url)) == valid
diff --git a/zyte_spider_templates/__init__.py b/zyte_spider_templates/__init__.py
index e3de8c9..fd223db 100644
--- a/zyte_spider_templates/__init__.py
+++ b/zyte_spider_templates/__init__.py
@@ -1,2 +1,3 @@
-from .spiders.base import BaseSpider, BaseSpiderParams
+from .params import make_params
+from .spiders.base import BaseSpider
 from .spiders.ecommerce import EcommerceSpider
diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py
new file mode 100644
index 0000000..cbf7372
--- /dev/null
+++ b/zyte_spider_templates/params.py
@@ -0,0 +1,147 @@
+from copy import deepcopy
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict, Field, create_model
+from pydantic.fields import PydanticUndefined
+
+from zyte_spider_templates._geolocations import (
+    GEOLOCATION_OPTIONS_WITH_CODE,
+    Geolocation,
+)
+from zyte_spider_templates.documentation import document_enum
+
+
+@document_enum
+class CrawlStrategy(str, Enum):
+    full: str = "full"
+    """Follow most links within the domain of the start URL in an attempt to
+    discover and extract as many products as possible."""
+
+    navigation: str = "navigation"
+    """Follow pagination, subcategories, and product detail pages."""
+
+    pagination_only: str = "pagination_only"
+    """Follow pagination and product detail pages. Subcategory links are
+    ignored. Use this when subcategory links are misidentified by ML
+    extraction."""
+
+
+@document_enum
+class ExtractFrom(str, Enum):
+    httpResponseBody: str = "httpResponseBody"
+    """Use HTTP responses. A cost-efficient and fast extraction method that
+    works well on many websites."""
+
+    browserHtml: str = "browserHtml"
+    """Use browser rendering. Often provides the best quality."""
+
+
+class AllParams(BaseModel):
+    url: str = Field(
+        title="URL",
+        description="Initial URL for the crawl. Enter the full URL including http(s); "
+        "you can copy and paste it from your browser. Example: https://toscrape.com/",
+        pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
+    )
+    geolocation: Optional[Geolocation] = Field(
+        title="Geolocation",
+        description="ISO 3166-1 alpha-2 2-character string as specified in "
+        "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
+        default=None,
+        json_schema_extra={
+            "enumMeta": {
+                code: {
+                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
+                }
+                for code in Geolocation
+            }
+        },
+    )
+    max_requests: Optional[int] = Field(
+        description=(
+            "The maximum number of Zyte API requests allowed for the crawl.\n"
+            "\n"
+            "Requests with error responses that cannot be retried or exceed "
+            "their retry limit also count here, but they incur no costs "
+            "and do not increase the request count in Scrapy Cloud."
+        ),
+        default=100,
+        json_schema_extra={
+            "widget": "request-limit",
+        },
+    )
+    crawl_strategy: CrawlStrategy = Field(
+        title="Crawl strategy",
+        description="Determines how the start URL and follow-up URLs are crawled.",
+        default=CrawlStrategy.navigation,
+        json_schema_extra={
+            "enumMeta": {
+                CrawlStrategy.full: {
+                    "title": "Full",
+                    "description": "Follow most links within the domain of the start URL in an attempt to discover and extract as many products as possible.",
+                },
+                CrawlStrategy.navigation: {
+                    "title": "Navigation",
+                    "description": "Follow pagination, subcategories, and product detail pages.",
+                },
+                CrawlStrategy.pagination_only: {
+                    "title": "Pagination Only",
+                    "description": (
+                        "Follow pagination and product detail pages. Subcategory links are ignored. "
+                        "Use this when subcategory links are misidentified by ML extraction."
+                    ),
+                },
+            },
+        },
+    )
+    extract_from: Optional[ExtractFrom] = Field(
+        title="Extraction source",
+        description=(
+            "Whether to perform extraction using a browser request "
+            "(browserHtml) or an HTTP request (httpResponseBody)."
+        ),
+        default=None,
+        json_schema_extra={
+            "enumMeta": {
+                ExtractFrom.browserHtml: {
+                    "title": "browserHtml",
+                    "description": "Use browser rendering. Often provides the best quality.",
+                },
+                ExtractFrom.httpResponseBody: {
+                    "title": "httpResponseBody",
+                    "description": "Use HTTP responses. A cost-efficient and fast extraction method that works well on many websites.",
+                },
+            },
+        },
+    )
+
+
+def make_params(
+    cls_name,
+    params,
+    *,
+    default=None,
+    required=None,
+    set_args=None,
+):
+    fields = {}
+    default = default or {}
+    required = set(required) if required else set()
+    for param in params:
+        # Copy the field so that AllParams itself is never mutated.
+        field = deepcopy(AllParams.model_fields[param])
+        if param in required:
+            field.default = PydanticUndefined
+        else:
+            try:
+                field.default = default[param]
+            except KeyError:
+                pass
+        fields[param] = (field.annotation, field)
+    model = create_model(
+        cls_name,
+        __config__=ConfigDict(extra="forbid"),
+        **fields,
+    )
+    if set_args:
+        model.set_args = set_args
+    return model
diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py
index e64faa0..0cb9f9c 100644
--- a/zyte_spider_templates/spiders/base.py
+++ b/zyte_spider_templates/spiders/base.py
@@ -1,55 +1,13 @@
 from importlib.metadata import version
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
 import scrapy
-from pydantic import BaseModel, Field
 from scrapy.crawler import Crawler
 
-from zyte_spider_templates._geolocations import (
-    GEOLOCATION_OPTIONS_WITH_CODE,
-    Geolocation,
-)
-
 # Higher priority than command-line-defined settings (40).
 ARG_SETTING_PRIORITY: int = 50
 
 
-class BaseSpiderParams(BaseModel):
-    url: str = Field(
-        title="URL",
-        description="Initial URL for the crawl. Enter the full URL including http(s), "
-        "you can copy and paste it from your browser. Example: https://toscrape.com/",
-        pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
-    )
-    geolocation: Optional[Geolocation] = Field(
-        title="Geolocation",
-        description="ISO 3166-1 alpha-2 2-character string specified in "
-        "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
-        default=None,
-        json_schema_extra={
-            "enumMeta": {
-                code: {
-                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
-                }
-                for code in Geolocation
-            }
-        },
-    )
-    max_requests: Optional[int] = Field(
-        description=(
-            "The maximum number of Zyte API requests allowed for the crawl.\n"
-            "\n"
-            "Requests with error responses that cannot be retried or exceed "
-            "their retry limit also count here, but they incur in no costs "
-            "and do not increase the request count in Scrapy Cloud."
-        ),
-        default=100,
-        json_schema_extra={
-            "widget": "request-limit",
-        },
-    )
-
-
 class BaseSpider(scrapy.Spider):
     custom_settings: Dict[str, Any] = {
         "ZYTE_API_TRANSPARENT_MODE": True,
diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py
index 4de10ea..ee298d5 100644
--- a/zyte_spider_templates/spiders/ecommerce.py
+++ b/zyte_spider_templates/spiders/ecommerce.py
@@ -1,93 +1,27 @@
-from enum import Enum
 from typing import Any, Callable, Dict, Iterable, Optional, Union
 
 import scrapy
-from pydantic import Field
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy_poet import DummyResponse
 from scrapy_spider_metadata import Args
 from zyte_common_items import ProbabilityRequest, Product, ProductNavigation
 
-from zyte_spider_templates.documentation import document_enum
-from zyte_spider_templates.spiders.base import (
-    ARG_SETTING_PRIORITY,
-    BaseSpider,
-    BaseSpiderParams,
-)
+from zyte_spider_templates.params import CrawlStrategy, make_params
+from zyte_spider_templates.spiders.base import ARG_SETTING_PRIORITY, BaseSpider
 from zyte_spider_templates.utils import get_domain
 
-
-@document_enum
-class EcommerceCrawlStrategy(str, Enum):
-    full: str = "full"
-    """Follow most links within the domain of URL in an attempt to discover and
-    extract as many products as possible."""
-
-    navigation: str = "navigation"
-    """Follow pagination, subcategories, and product detail pages."""
-
-    pagination_only: str = "pagination_only"
-    """Follow pagination and product detail pages. SubCategory links are
-    ignored. Use this when some subCategory links are misidentified by
-    ML-extraction."""
-
-
-@document_enum
-class ExtractFrom(str, Enum):
-    httpResponseBody: str = "httpResponseBody"
-    """Use HTTP responses. Cost-efficient and fast extraction method, which
-    works well on many websites."""
-
-    browserHtml: str = "browserHtml"
-    """Use browser rendering. Often provides the best quality."""
-
-
-class EcommerceSpiderParams(BaseSpiderParams):
-    crawl_strategy: EcommerceCrawlStrategy = Field(
-        title="Crawl strategy",
-        description="Determines how the start URL and follow-up URLs are crawled.",
-        default=EcommerceCrawlStrategy.navigation,
-        json_schema_extra={
-            "enumMeta": {
-                EcommerceCrawlStrategy.full: {
-                    "title": "Full",
-                    "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
-                },
-                EcommerceCrawlStrategy.navigation: {
-                    "title": "Navigation",
-                    "description": "Follow pagination, subcategories, and product detail pages.",
-                },
-                EcommerceCrawlStrategy.pagination_only: {
-                    "title": "Pagination Only",
-                    "description": (
-                        "Follow pagination and product detail pages. SubCategory links are ignored. "
-                        "Use this when some subCategory links are misidentified by ML-extraction."
-                    ),
-                },
-            },
-        },
-    )
-    extract_from: Optional[ExtractFrom] = Field(
-        title="Extraction source",
-        description=(
-            "Whether to perform extraction using a browser request "
-            "(browserHtml) or an HTTP request (httpResponseBody)."
-        ),
-        default=None,
-        json_schema_extra={
-            "enumMeta": {
-                ExtractFrom.browserHtml: {
-                    "title": "browserHtml",
-                    "description": "Use browser rendering. Often provides the best quality.",
-                },
-                ExtractFrom.httpResponseBody: {
-                    "title": "httpResponseBody",
-                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
-                },
-            },
-        },
-    )
+EcommerceSpiderParams = make_params(
+    "EcommerceSpiderParams",
+    [
+        "url",
+        "geolocation",
+        "max_requests",
+        "crawl_strategy",
+        "extract_from",
+    ],
+)
 
 
 class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider):
@@ -144,7 +78,7 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
 
     def start_requests(self) -> Iterable[Request]:
         page_params = {}
-        if self.args.crawl_strategy == EcommerceCrawlStrategy.full:
+        if self.args.crawl_strategy == CrawlStrategy.full:
             page_params = {"full_domain": self.allowed_domains[0]}
 
         yield Request(
@@ -174,7 +108,7 @@ def parse_navigation(
         else:
             yield self.get_nextpage_request(navigation.nextPage)
 
-        if self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only:
+        if self.args.crawl_strategy != CrawlStrategy.pagination_only:
             for request in navigation.subCategories or []:
                 yield self.get_subcategory_request(request, page_params=page_params)
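
Usage sketch for make_params(): the spider name, the parameter subset,
and the default override below are illustrative assumptions, not code
from this patch:

    from scrapy_spider_metadata import Args

    from zyte_spider_templates.params import make_params
    from zyte_spider_templates.spiders.base import BaseSpider

    # Build a params model from three of the AllParams fields, lowering
    # the default of max_requests from 100 to 50 for this spider only.
    MyParams = make_params(
        "MyParams",
        ["url", "geolocation", "max_requests"],
        default={"max_requests": 50},
    )

    class MySpider(Args[MyParams], BaseSpider):
        name = "my_spider"

Because the generated model is created with extra="forbid", passing a
parameter outside the chosen subset (for example, crawl_strategy here)
raises a ValidationError instead of being silently ignored.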
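To check the schema change locally, something along these lines should
work (get_spider_metadata is the same helper the updated test imports;
the assertion mirrors the new expectation in test_metadata):

    from scrapy_spider_metadata import get_spider_metadata

    from zyte_spider_templates import EcommerceSpider

    schema = get_spider_metadata(EcommerceSpider)["param_schema"]
    # Unknown parameters must now be rejected by the generated model.
    assert schema["additionalProperties"] is False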