Define standard params #34

Closed · wants to merge 1 commit
20 changes: 7 additions & 13 deletions docs/reference/index.rst
@@ -5,13 +5,16 @@ Reference
 Base classes
 ============
 
-.. autopydantic_model:: zyte_spider_templates.spiders.base.BaseSpiderParams
+.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
+
+.. autopydantic_model:: zyte_spider_templates.params.AllParams
    :inherited-members: BaseModel
 
-.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
+.. autoenum:: zyte_spider_templates.params.CrawlStrategy
 
-.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
-   :noindex:
+.. autoenum:: zyte_spider_templates.params.ExtractFrom
+
+.. autoenum:: zyte_spider_templates.params.Geolocation
 
 E-commerce
 ==========
@@ -20,15 +23,6 @@ E-commerce
    :noindex:
    :inherited-members: BaseModel
 
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
-   :noindex:
-
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-   :noindex:
-
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.EcommerceSpider
-   :noindex:
-
 Pages
 =====
 
6 changes: 0 additions & 6 deletions docs/templates/e-commerce.rst
@@ -16,9 +16,3 @@ Parameters

 .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams
    :inherited-members: BaseModel
-
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
-
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-
-.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
2 changes: 1 addition & 1 deletion setup.py
@@ -15,7 +15,7 @@
"pydantic>=2",
"scrapy>=2.11.0",
"scrapy-poet>=0.20.1",
"scrapy-spider-metadata>=0.1.2",
"scrapy-spider-metadata @ git+https://github.com/Gallaecio/scrapy-spider-metadata.git@param-inheritance",
"scrapy-zyte-api[provider]>=0.15.0",
"zyte-common-items>=0.13.0",
],
14 changes: 5 additions & 9 deletions tests/test_ecommerce.py
@@ -9,16 +9,13 @@
 from scrapy_spider_metadata import get_spider_metadata
 from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request
 
-from zyte_spider_templates import BaseSpiderParams
 from zyte_spider_templates._geolocations import (
     GEOLOCATION_OPTIONS,
     GEOLOCATION_OPTIONS_WITH_CODE,
     Geolocation,
 )
-from zyte_spider_templates.spiders.ecommerce import (
-    EcommerceCrawlStrategy,
-    EcommerceSpider,
-)
+from zyte_spider_templates.params import AllParams, CrawlStrategy
+from zyte_spider_templates.spiders.ecommerce import EcommerceSpider
 
 from . import get_crawler
 from .test_utils import URL_TO_DOMAIN
@@ -29,9 +26,7 @@ def test_parameters():
         EcommerceSpider()
 
     EcommerceSpider(url="https://example.com")
-    EcommerceSpider(
-        url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.full
-    )
+    EcommerceSpider(url="https://example.com", crawl_strategy=CrawlStrategy.full)
     EcommerceSpider(url="https://example.com", crawl_strategy="full")
 
     with pytest.raises(ValidationError):
@@ -354,6 +349,7 @@ def test_metadata():
"title": "E-commerce",
"description": "Template for spiders that extract product data from e-commerce websites.",
"param_schema": {
"additionalProperties": False,
"properties": {
"crawl_strategy": {
"default": "navigation",
@@ -494,7 +490,7 @@ def test_metadata():
     ],
 )
 def test_validation_url(url, valid):
-    url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern
+    url_re = AllParams.model_fields["url"].metadata[0].pattern
     assert bool(re.match(url_re, url)) == valid


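Note: the url pattern read from AllParams here requires an http(s) scheme and a host, with optional port, path, and fragment. An illustrative check with made-up URLs:

import re

# Pattern copied from the AllParams.url field definition below.
URL_RE = r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$"

for url, expected in [
    ("https://toscrape.com/", True),
    ("http://example.com:8080/category#top", True),
    ("example.com", False),  # scheme is required
    ("ftp://example.com", False),  # only http(s) is allowed
]:
    assert bool(re.match(URL_RE, url)) is expected
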
3 changes: 2 additions & 1 deletion zyte_spider_templates/__init__.py
@@ -1,2 +1,3 @@
-from .spiders.base import BaseSpider, BaseSpiderParams
+from .params import make_params
+from .spiders.base import BaseSpider
 from .spiders.ecommerce import EcommerceSpider
147 changes: 147 additions & 0 deletions zyte_spider_templates/params.py
@@ -0,0 +1,147 @@
from copy import deepcopy
from enum import Enum
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field, create_model
from pydantic.fields import PydanticUndefined

from zyte_spider_templates._geolocations import (
    GEOLOCATION_OPTIONS_WITH_CODE,
    Geolocation,
)
from zyte_spider_templates.documentation import document_enum


@document_enum
class CrawlStrategy(str, Enum):
    full: str = "full"
    """Follow most links within the domain of the URL in an attempt to
    discover and extract as many products as possible."""

    navigation: str = "navigation"
    """Follow pagination, subcategories, and product detail pages."""

    pagination_only: str = "pagination_only"
    """Follow pagination and product detail pages. Subcategory links are
    ignored. Use this when some subcategory links are misidentified by ML
    extraction."""


@document_enum
class ExtractFrom(str, Enum):
    httpResponseBody: str = "httpResponseBody"
    """Use HTTP responses. Cost-efficient and fast extraction method, which
    works well on many websites."""

    browserHtml: str = "browserHtml"
    """Use browser rendering. Often provides the best quality."""


class AllParams(BaseModel):
    url: str = Field(
        title="URL",
        description="Initial URL for the crawl. Enter the full URL including http(s); "
        "you can copy and paste it from your browser. Example: https://toscrape.com/",
        pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
    )
    geolocation: Optional[Geolocation] = Field(
        title="Geolocation",
        description="ISO 3166-1 alpha-2 2-character string specified in "
        "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
        default=None,
        json_schema_extra={
            "enumMeta": {
                code: {
                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
                }
                for code in Geolocation
            }
        },
    )
    max_requests: Optional[int] = Field(
        description=(
            "The maximum number of Zyte API requests allowed for the crawl.\n"
            "\n"
            "Requests with error responses that cannot be retried or exceed "
            "their retry limit also count here, but they incur no costs "
            "and do not increase the request count in Scrapy Cloud."
        ),
        default=100,
        json_schema_extra={
            "widget": "request-limit",
        },
    )
    crawl_strategy: CrawlStrategy = Field(
        title="Crawl strategy",
        description="Determines how the start URL and follow-up URLs are crawled.",
        default=CrawlStrategy.navigation,
        json_schema_extra={
            "enumMeta": {
                CrawlStrategy.full: {
                    "title": "Full",
                    "description": "Follow most links within the domain of the URL in an attempt to discover and extract as many products as possible.",
                },
                CrawlStrategy.navigation: {
                    "title": "Navigation",
                    "description": "Follow pagination, subcategories, and product detail pages.",
                },
                CrawlStrategy.pagination_only: {
                    "title": "Pagination Only",
                    "description": (
                        "Follow pagination and product detail pages. Subcategory links are ignored. "
                        "Use this when some subcategory links are misidentified by ML extraction."
                    ),
                },
            },
        },
    )
    extract_from: Optional[ExtractFrom] = Field(
        title="Extraction source",
        description=(
            "Whether to perform extraction using a browser request "
            "(browserHtml) or an HTTP request (httpResponseBody)."
        ),
        default=None,
        json_schema_extra={
            "enumMeta": {
                ExtractFrom.browserHtml: {
                    "title": "browserHtml",
                    "description": "Use browser rendering. Often provides the best quality.",
                },
                ExtractFrom.httpResponseBody: {
                    "title": "httpResponseBody",
                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
                },
            },
        },
    )


def make_params(
    cls_name,
    params,
    *,
    default=None,
    required=None,
    set_args=None,
):
    fields = {}
    default = default or {}
    required = set(required) if required else set()
    for param in params:
        # Copy the field so that the default tweaks below do not mutate the
        # shared field definition on AllParams.
        field = deepcopy(AllParams.model_fields[param])
        if param in required:
            field.default = PydanticUndefined
        else:
            try:
                field.default = default[param]
            except KeyError:
                pass
        fields[param] = (field.annotation, field)
    model = create_model(
        cls_name,
        __config__=ConfigDict(extra="forbid"),
        **fields,
    )
    if set_args:
        model.set_args = set_args
    return model
Comment from the author on lines +39 to +147:
Before further progress, I would like to decide if this is the way we want to approach this, i.e. a single model with all param definitions and a function to create a model with a subset of that.
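
Note: a minimal sketch of how a template might consume make_params as defined above; the model name, field subset, and default override are illustrative, not part of this PR:

from pydantic import ValidationError

from zyte_spider_templates.params import CrawlStrategy, make_params

# Hypothetical subset model: url is required, geolocation stays optional,
# and the crawl_strategy default is overridden.
MyParams = make_params(
    "MyParams",
    ["url", "geolocation", "crawl_strategy"],
    required=["url"],
    default={"crawl_strategy": CrawlStrategy.full},
)

params = MyParams(url="https://toscrape.com/")
assert params.crawl_strategy is CrawlStrategy.full
assert params.geolocation is None

try:
    MyParams()  # url is required, so this fails validation
except ValidationError:
    pass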

44 changes: 1 addition & 43 deletions zyte_spider_templates/spiders/base.py
@@ -1,55 +1,13 @@
 from importlib.metadata import version
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
 import scrapy
-from pydantic import BaseModel, Field
 from scrapy.crawler import Crawler
 
-from zyte_spider_templates._geolocations import (
-    GEOLOCATION_OPTIONS_WITH_CODE,
-    Geolocation,
-)
-
 # Higher priority than command-line-defined settings (40).
 ARG_SETTING_PRIORITY: int = 50
 
 
-class BaseSpiderParams(BaseModel):
-    url: str = Field(
-        title="URL",
-        description="Initial URL for the crawl. Enter the full URL including http(s), "
-        "you can copy and paste it from your browser. Example: https://toscrape.com/",
-        pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
-    )
-    geolocation: Optional[Geolocation] = Field(
-        title="Geolocation",
-        description="ISO 3166-1 alpha-2 2-character string specified in "
-        "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
-        default=None,
-        json_schema_extra={
-            "enumMeta": {
-                code: {
-                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
-                }
-                for code in Geolocation
-            }
-        },
-    )
-    max_requests: Optional[int] = Field(
-        description=(
-            "The maximum number of Zyte API requests allowed for the crawl.\n"
-            "\n"
-            "Requests with error responses that cannot be retried or exceed "
-            "their retry limit also count here, but they incur in no costs "
-            "and do not increase the request count in Scrapy Cloud."
-        ),
-        default=100,
-        json_schema_extra={
-            "widget": "request-limit",
-        },
-    )
-
-
 class BaseSpider(scrapy.Spider):
     custom_settings: Dict[str, Any] = {
         "ZYTE_API_TRANSPARENT_MODE": True,