Skip to content

Commit

Permalink
Full crawl by default, reword strategies (#40)
Browse files (browse the repository at this point in the history)
  • Loading branch information
Gallaecio authored Feb 15, 2024
1 parent f326a39 commit 5fcb399
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 13 deletions.
16 changes: 11 additions & 5 deletions tests/test_ecommerce.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -39,8 +39,9 @@ def test_parameters():


def test_start_requests():
crawler = get_crawler()
url = "https://example.com"
spider = EcommerceSpider(url=url)
spider = EcommerceSpider.from_crawler(crawler, url=url)
requests = list(spider.start_requests())
assert len(requests) == 1
assert requests[0].url == url
Expand Down Expand Up @@ -356,7 +357,7 @@ def test_metadata():
"param_schema": {
"properties": {
"crawl_strategy": {
"default": "navigation",
"default": "full",
"title": "Crawl strategy",
"description": "Determines how the start URL and follow-up URLs are crawled.",
"type": "string",
Expand All @@ -367,13 +368,18 @@ def test_metadata():
"title": "Full",
},
"navigation": {
"description": "Follow pagination, subcategories, and product detail pages.",
"description": (
"Follow pagination, subcategories, and "
"product detail pages. Pagination Only is a "
"better choice if the target URL does not "
"have subcategories, or if Zyte API is "
"misidentifying some URLs as subcategories."
),
"title": "Navigation",
},
"pagination_only": {
"description": (
"Follow pagination and product detail pages. SubCategory links are ignored. "
"Use this when some subCategory links are misidentified by ML-extraction."
"Follow pagination and product detail pages. Subcategory links are ignored."
),
"title": "Pagination Only",
},
Expand Down
23 changes: 15 additions & 8 deletions zyte_spider_templates/spiders/ecommerce.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -25,19 +25,22 @@ class EcommerceCrawlStrategy(str, Enum):
extract as many products as possible."""

navigation: str = "navigation"
"""Follow pagination, subcategories, and product detail pages."""
"""Follow pagination, subcategories, and product detail pages.
Pagination Only is a better choice if the target URL does not have
subcategories, or if Zyte API is misidentifying some URLs as subcategories.
"""

pagination_only: str = "pagination_only"
"""Follow pagination and product detail pages. SubCategory links are
ignored. Use this when some subCategory links are misidentified by
ML-extraction."""
"""Follow pagination and product detail pages. Subcategory links are
ignored."""


class EcommerceSpiderParams(BaseSpiderParams):
crawl_strategy: EcommerceCrawlStrategy = Field(
title="Crawl strategy",
description="Determines how the start URL and follow-up URLs are crawled.",
default=EcommerceCrawlStrategy.navigation,
default=EcommerceCrawlStrategy.full,
json_schema_extra={
"enumMeta": {
EcommerceCrawlStrategy.full: {
Expand All @@ -46,13 +49,17 @@ class EcommerceSpiderParams(BaseSpiderParams):
},
EcommerceCrawlStrategy.navigation: {
"title": "Navigation",
"description": "Follow pagination, subcategories, and product detail pages.",
"description": (
"Follow pagination, subcategories, and product detail "
"pages. Pagination Only is a better choice if the "
"target URL does not have subcategories, or if Zyte "
"API is misidentifying some URLs as subcategories."
),
},
EcommerceCrawlStrategy.pagination_only: {
"title": "Pagination Only",
"description": (
"Follow pagination and product detail pages. SubCategory links are ignored. "
"Use this when some subCategory links are misidentified by ML-extraction."
"Follow pagination and product detail pages. Subcategory links are ignored."
),
},
},
Expand Down

0 comments on commit 5fcb399

Please sign in to comment.