Define standard params #34

Closed · wants to merge 1 commit
20 changes: 7 additions & 13 deletions docs/reference/index.rst
@@ -5,13 +5,16 @@ Reference
 Base classes
 ============
 
-.. autopydantic_model:: zyte_spider_templates.spiders.base.BaseSpiderParams
+.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
+
+.. autopydantic_model:: zyte_spider_templates.params.AllParams
    :inherited-members: BaseModel
 
-.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
+.. autoenum:: zyte_spider_templates.params.CrawlStrategy
 
-.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
-   :noindex:
+.. autoenum:: zyte_spider_templates.params.ExtractFrom
+
+.. autoenum:: zyte_spider_templates.params.Geolocation
 
 E-commerce
 ==========
@@ -20,15 +23,6 @@ E-commerce
    :noindex:
    :inherited-members: BaseModel
 
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
-   :noindex:
-
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-   :noindex:
-
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.EcommerceSpider
-   :noindex:
-
 Pages
 =====
 
6 changes: 0 additions & 6 deletions docs/templates/e-commerce.rst
@@ -16,9 +16,3 @@ Parameters

 .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams
    :inherited-members: BaseModel
-
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
-
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-
-.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
2 changes: 1 addition & 1 deletion setup.py
@@ -15,7 +15,7 @@
"pydantic>=2",
"scrapy>=2.11.0",
"scrapy-poet>=0.20.1",
"scrapy-spider-metadata>=0.1.2",
"scrapy-spider-metadata @ git+https://github.com/Gallaecio/scrapy-spider-metadata.git@param-inheritance",
"scrapy-zyte-api[provider]>=0.15.0",
"zyte-common-items>=0.13.0",
],
14 changes: 5 additions & 9 deletions tests/test_ecommerce.py
@@ -9,16 +9,13 @@
 from scrapy_spider_metadata import get_spider_metadata
 from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request
 
-from zyte_spider_templates import BaseSpiderParams
 from zyte_spider_templates._geolocations import (
     GEOLOCATION_OPTIONS,
     GEOLOCATION_OPTIONS_WITH_CODE,
     Geolocation,
 )
-from zyte_spider_templates.spiders.ecommerce import (
-    EcommerceCrawlStrategy,
-    EcommerceSpider,
-)
+from zyte_spider_templates.params import AllParams, CrawlStrategy
+from zyte_spider_templates.spiders.ecommerce import EcommerceSpider
 
 from . import get_crawler
 from .test_utils import URL_TO_DOMAIN
@@ -29,9 +26,7 @@ def test_parameters():
         EcommerceSpider()
 
     EcommerceSpider(url="https://example.com")
-    EcommerceSpider(
-        url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.full
-    )
+    EcommerceSpider(url="https://example.com", crawl_strategy=CrawlStrategy.full)
     EcommerceSpider(url="https://example.com", crawl_strategy="full")
 
     with pytest.raises(ValidationError):
@@ -354,6 +349,7 @@ def test_metadata():
"title": "E-commerce",
"description": "Template for spiders that extract product data from e-commerce websites.",
"param_schema": {
"additionalProperties": False,
"properties": {
"crawl_strategy": {
"default": "navigation",
@@ -494,7 +490,7 @@ def test_metadata():
     ],
 )
 def test_validation_url(url, valid):
-    url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern
+    url_re = AllParams.model_fields["url"].metadata[0].pattern
     assert bool(re.match(url_re, url)) == valid


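Note: the url pattern read from AllParams here requires an http(s) scheme and a host, with optional port, path, and fragment. An illustrative check with made-up URLs:

import re

# Pattern copied from the AllParams.url field definition below.
URL_RE = r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$"

for url, expected in [
    ("https://toscrape.com/", True),
    ("http://example.com:8080/category#top", True),
    ("example.com", False),  # scheme is required
    ("ftp://example.com", False),  # only http(s) is allowed
]:
    assert bool(re.match(URL_RE, url)) is expected
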
3 changes: 2 additions & 1 deletion zyte_spider_templates/__init__.py
@@ -1,2 +1,3 @@
-from .spiders.base import BaseSpider, BaseSpiderParams
+from .params import make_params
+from .spiders.base import BaseSpider
 from .spiders.ecommerce import EcommerceSpider
147 changes: 147 additions & 0 deletions zyte_spider_templates/params.py
@@ -0,0 +1,147 @@
from copy import deepcopy
from enum import Enum
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field, create_model
from pydantic.fields import PydanticUndefined

from zyte_spider_templates._geolocations import (
    GEOLOCATION_OPTIONS_WITH_CODE,
    Geolocation,
)
from zyte_spider_templates.documentation import document_enum


@document_enum
class CrawlStrategy(str, Enum):
    full: str = "full"
    """Follow most links within the domain of the URL in an attempt to
    discover and extract as many products as possible."""

    navigation: str = "navigation"
    """Follow pagination, subcategories, and product detail pages."""

    pagination_only: str = "pagination_only"
    """Follow pagination and product detail pages. Subcategory links are
    ignored. Use this when some subcategory links are misidentified by ML
    extraction."""


@document_enum
class ExtractFrom(str, Enum):
    httpResponseBody: str = "httpResponseBody"
    """Use HTTP responses. Cost-efficient and fast extraction method, which
    works well on many websites."""

    browserHtml: str = "browserHtml"
    """Use browser rendering. Often provides the best quality."""


class AllParams(BaseModel):
    url: str = Field(
        title="URL",
        description="Initial URL for the crawl. Enter the full URL including http(s); "
        "you can copy and paste it from your browser. Example: https://toscrape.com/",
        pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
    )
    geolocation: Optional[Geolocation] = Field(
        title="Geolocation",
        description="ISO 3166-1 alpha-2 2-character string specified in "
        "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
        default=None,
        json_schema_extra={
            "enumMeta": {
                code: {
                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
                }
                for code in Geolocation
            }
        },
    )
    max_requests: Optional[int] = Field(
        description=(
            "The maximum number of Zyte API requests allowed for the crawl.\n"
            "\n"
            "Requests with error responses that cannot be retried or exceed "
            "their retry limit also count here, but they incur no costs "
            "and do not increase the request count in Scrapy Cloud."
        ),
        default=100,
        json_schema_extra={
            "widget": "request-limit",
        },
    )
    crawl_strategy: CrawlStrategy = Field(
        title="Crawl strategy",
        description="Determines how the start URL and follow-up URLs are crawled.",
        default=CrawlStrategy.navigation,
        json_schema_extra={
            "enumMeta": {
                CrawlStrategy.full: {
                    "title": "Full",
                    "description": "Follow most links within the domain of the URL in an attempt to discover and extract as many products as possible.",
                },
                CrawlStrategy.navigation: {
                    "title": "Navigation",
                    "description": "Follow pagination, subcategories, and product detail pages.",
                },
                CrawlStrategy.pagination_only: {
                    "title": "Pagination Only",
                    "description": (
                        "Follow pagination and product detail pages. Subcategory links are ignored. "
                        "Use this when some subcategory links are misidentified by ML extraction."
                    ),
                },
            },
        },
    )
    extract_from: Optional[ExtractFrom] = Field(
        title="Extraction source",
        description=(
            "Whether to perform extraction using a browser request "
            "(browserHtml) or an HTTP request (httpResponseBody)."
        ),
        default=None,
        json_schema_extra={
            "enumMeta": {
                ExtractFrom.browserHtml: {
                    "title": "browserHtml",
                    "description": "Use browser rendering. Often provides the best quality.",
                },
                ExtractFrom.httpResponseBody: {
                    "title": "httpResponseBody",
                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
                },
            },
        },
    )


def make_params(
    cls_name,
    params,
    *,
    default=None,
    required=None,
    set_args=None,
):
    fields = {}
    default = default or {}
    required = set(required) if required else set()
    for param in params:
        # Copy the field so that the default tweaks below do not mutate the
        # shared field definition on AllParams.
        field = deepcopy(AllParams.model_fields[param])
        if param in required:
            field.default = PydanticUndefined
        else:
            try:
                field.default = default[param]
            except KeyError:
                pass
        fields[param] = (field.annotation, field)
    model = create_model(
        cls_name,
        __config__=ConfigDict(extra="forbid"),
        **fields,
    )
    if set_args:
        model.set_args = set_args
    return model
Comment from the author on lines +39 to +147:
Before further progress, I would like to decide if this is the way we want to approach this, i.e. a single model with all param definitions and a function to create a model with a subset of that.
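
Note: a minimal sketch of how a template might consume make_params as defined above; the model name, field subset, and default override are illustrative, not part of this PR:

from pydantic import ValidationError

from zyte_spider_templates.params import CrawlStrategy, make_params

# Hypothetical subset model: url is required, geolocation stays optional,
# and the crawl_strategy default is overridden.
MyParams = make_params(
    "MyParams",
    ["url", "geolocation", "crawl_strategy"],
    required=["url"],
    default={"crawl_strategy": CrawlStrategy.full},
)

params = MyParams(url="https://toscrape.com/")
assert params.crawl_strategy is CrawlStrategy.full
assert params.geolocation is None

try:
    MyParams()  # url is required, so this fails validation
except ValidationError:
    pass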

44 changes: 1 addition & 43 deletions zyte_spider_templates/spiders/base.py
@@ -1,55 +1,13 @@
 from importlib.metadata import version
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
 import scrapy
-from pydantic import BaseModel, Field
 from scrapy.crawler import Crawler
 
-from zyte_spider_templates._geolocations import (
-    GEOLOCATION_OPTIONS_WITH_CODE,
-    Geolocation,
-)
-
 # Higher priority than command-line-defined settings (40).
 ARG_SETTING_PRIORITY: int = 50
 
 
-class BaseSpiderParams(BaseModel):
-    url: str = Field(
-        title="URL",
-        description="Initial URL for the crawl. Enter the full URL including http(s), "
-        "you can copy and paste it from your browser. Example: https://toscrape.com/",
-        pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
-    )
-    geolocation: Optional[Geolocation] = Field(
-        title="Geolocation",
-        description="ISO 3166-1 alpha-2 2-character string specified in "
-        "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
-        default=None,
-        json_schema_extra={
-            "enumMeta": {
-                code: {
-                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
-                }
-                for code in Geolocation
-            }
-        },
-    )
-    max_requests: Optional[int] = Field(
-        description=(
-            "The maximum number of Zyte API requests allowed for the crawl.\n"
-            "\n"
-            "Requests with error responses that cannot be retried or exceed "
-            "their retry limit also count here, but they incur in no costs "
-            "and do not increase the request count in Scrapy Cloud."
-        ),
-        default=100,
-        json_schema_extra={
-            "widget": "request-limit",
-        },
-    )
-
-
 class BaseSpider(scrapy.Spider):
     custom_settings: Dict[str, Any] = {
         "ZYTE_API_TRANSPARENT_MODE": True,