From dfb4df0826c1cc1e2e490d2ba9e80081044e734b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Jan 2025 18:34:39 +0100 Subject: [PATCH 1/2] Basic e-commerce sitemap support --- setup.py | 1 + tests/.gitignore | 1 + tests/mockserver.py | 114 +++++++++- tests/test_ecommerce.py | 248 ++++++++++++++++++++- tox.ini | 1 + zyte_spider_templates/spiders/ecommerce.py | 185 ++++++++++++++- 6 files changed, 525 insertions(+), 25 deletions(-) create mode 100644 tests/.gitignore diff --git a/setup.py b/setup.py index 64532cb..0d52461 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ "form2request>=0.2.0", "formasaurus>=0.10.0", "jmespath>=0.9.5", + "protego>=0.3.0", "pydantic>=2.1", "requests>=2.31.0", "scrapinghub >= 2.4.0", diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000..e6b9a97 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1 @@ +/fs.example/ diff --git a/tests/mockserver.py b/tests/mockserver.py index fe2ff20..b7afdcf 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -3,9 +3,12 @@ import socket import sys import time +from base64 import b64encode from importlib import import_module +from pathlib import Path from subprocess import PIPE, Popen from typing import Any, Dict +from urllib.parse import urlparse from scrapy_zyte_api.responses import _API_RESPONSE from twisted.internet import reactor @@ -36,20 +39,30 @@ class DefaultResource(Resource): https://example.com/category/1 https://example.com/category/1/page/2 https://example.com/non-navigation + https://example.com/sitemap-category + https://example.com/sitemap-product/1 + https://example.com/sitemap-product/2 ``` When browserHtml is requested (for any URL, listed above or not), it is a minimal HTML with an anchor tag pointing to https://example.com/non-navigation. - When productNavigation is requested, nextPage and subCategories are filled - accordingly. productNavigation.items always has 2 product URLs, which are - the result of appending ``/product/`` to the request URL. - https://example.com/non-navigation is not reachable through - productNavigation. + When productNavigation is requested, nextPage and subCategories are + filled accordingly. productNavigation.items always has 2 product URLs, + which are the result of appending ``/product/`` to the request URL. - When product or productList is requested, an item with the current URL is - always returned. + The following pages are not reachable through productNavigation: + + - https://example.com/non-navigation is in an a element of the HTML + of every page. + + - https://example.com/sitemap-* URLs are linked from + https://example.com/sitemap*.xml, which are linked from + https://example.com/robots.txt. + + When product or productList is requested, an item with the current URL + is always returned. All output also includes unsupported links (mailto:…). @@ -61,6 +74,16 @@ class DefaultResource(Resource): - https://jobs.offsite.example/jobs/1 (jobPosting) - https://jobs.offsite.example/jobs/2 (jobPosting) + + - For fs.example subdomains, a matching file is looked for in the file + system. If found, its content is base64-encoded and returned as + httpResponseBody. Else, a product response is returned. + + For example, for the URL https://abcdefg.fs.example/foo, if a file + exists at tests/fs.example/abcdefg/foo, its content is returned as + httpResponseBody. Otherwise, the response is as empty as possible based + on input parameters, except when requesting productNavigation, which + always includes an item at /p. 
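+
+      For example, a productNavigation request for
+      https://abcdefg.fs.example/foo with no matching file on disk gets a
+      response along these lines (abridged, only the relevant keys shown):
+
+      ```
+      {
+          "url": "https://abcdefg.fs.example/foo",
+          "productNavigation": {
+              "url": "https://abcdefg.fs.example/foo",
+              "items": [{"url": "https://abcdefg.fs.example/foo/p"}]
+          }
+      }
+      ```
+
+      Requesting productNavigation for the bare domain root instead yields an
+      empty items list.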
""" def getChild(self, path, request): @@ -99,6 +122,82 @@ def render_POST(self, request): } return json.dumps(response_data).encode() + if request_data["url"] == "https://example.com/robots.txt": + assert request_data["httpResponseBody"] is True + body = b""" + Sitemap: https://example.com/sitemap.xml # Link to category + SiTeMaP: https://example.com/sitemap-index.xml # Links to products + """ + response_data["httpResponseBody"] = b64encode(body).decode() + return json.dumps(response_data).encode() + + if request_data["url"] == "https://example.com/sitemap.xml": + assert request_data["httpResponseBody"] is True + body = b""" + + + + https://example.com/sitemap-category + + + """ + response_data["httpResponseBody"] = b64encode(body).decode() + return json.dumps(response_data).encode() + + if request_data["url"] == "https://example.com/sitemap-index.xml": + assert request_data["httpResponseBody"] is True + body = b""" + + + + https://example.com/sitemap-products.xml + + + """ + response_data["httpResponseBody"] = b64encode(body).decode() + return json.dumps(response_data).encode() + + if request_data["url"] == "https://example.com/sitemap-products.xml": + assert request_data["httpResponseBody"] is True + body = b""" + + + + https://example.com/sitemap-product/1 + + + https://example.com/sitemap-product/2 + + + """ + response_data["httpResponseBody"] = b64encode(body).decode() + return json.dumps(response_data).encode() + + if "fs.example" in request_data["url"]: + parsed_url = urlparse(request_data["url"]) + subdir_name = parsed_url.netloc[: -len(".fs.example")] + root_dir = Path(__file__).parent / "fs.example" + subdir = root_dir / subdir_name + filepath = subdir / parsed_url.path.lstrip("/") + if filepath != subdir and filepath.exists(): + response_data["httpResponseBody"] = b64encode( + filepath.read_bytes() + ).decode() + else: + if request_data.get("browserHtml", False) is True: + response_data["browserHtml"] = "a" + if request_data.get("product", False) is True: + response_data["product"] = {"url": response_data["url"]} + if request_data.get("productNavigation", False) is True: + items = [] + if filepath != subdir: + items = [{"url": f"{request_data['url'].rstrip('/')}/p"}] + response_data["productNavigation"] = { + "url": response_data["url"], + "items": items, + } + return json.dumps(response_data).encode() + non_navigation_url = "https://example.com/non-navigation" html = f"""""" if request_data.get("browserHtml", False) is True: @@ -119,6 +218,7 @@ def render_POST(self, request): if ( "/page/" not in request_data["url"] and "/non-navigation" not in request_data["url"] + and "/sitemap" not in request_data["url"] ): kwargs["nextPage"] = { "url": f"{request_data['url'].rstrip('/')}/page/2" diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index b64fcb4..6f40606 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -1,6 +1,11 @@ import logging +from pathlib import Path +from random import choice +from shutil import rmtree +from string import ascii_lowercase from typing import Iterable, List, cast from unittest.mock import MagicMock, call, patch +from urllib.parse import urlparse import pytest import requests @@ -35,9 +40,11 @@ def test_start_requests(): crawler = get_crawler() spider = EcommerceSpider.from_crawler(crawler, url=url) requests = list(spider.start_requests()) - assert len(requests) == 1 - assert requests[0].url == url - assert requests[0].callback == spider.parse_navigation + assert len(requests) == 2 + assert requests[0].url == 
"https://example.com/robots.txt" + assert requests[0].callback == spider.parse_robotstxt + assert requests[1].url == url + assert requests[1].callback == spider.parse_navigation def test_start_requests_crawling_logs_page_type(): @@ -46,7 +53,8 @@ def test_start_requests_crawling_logs_page_type(): spider = EcommerceSpider.from_crawler(crawler, url=url) requests = list(spider.start_requests()) - assert requests[0].meta["crawling_logs"]["page_type"] == "productNavigation" + assert requests[0].meta["crawling_logs"]["page_type"] == "robots.txt" + assert requests[1].meta["crawling_logs"]["page_type"] == "productNavigation" spider = EcommerceSpider.from_crawler( crawler, url=url, crawl_strategy="direct_item" @@ -261,6 +269,10 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1/page/2/product/2", "https://example.com/non-navigation/product/1", "https://example.com/non-navigation/product/2", + "https://example.com/sitemap-category/product/1", + "https://example.com/sitemap-category/product/2", + "https://example.com/sitemap-product/1", + "https://example.com/sitemap-product/2", }, ) for crawl_strategy_args in ({}, {"crawl_strategy": "automatic"}) @@ -283,6 +295,22 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: for crawl_strategy_args in ({}, {"crawl_strategy": "automatic"}) for extract_args in ({}, {"extract": "product"}) ), + # automatic works like direct_item for product-like URLs when + # extracting products. + *( + ( + { + "url": "https://example.com/product/1", + **crawl_strategy_args, + **extract_args, + }, + { + "https://example.com/product/1", + }, + ) + for crawl_strategy_args in ({}, {"crawl_strategy": "automatic"}) + for extract_args in ({}, {"extract": "product"}) + ), *( ( { @@ -301,6 +329,10 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1/page/2/product/2", "https://example.com/non-navigation/product/1", "https://example.com/non-navigation/product/2", + "https://example.com/sitemap-category/product/1", + "https://example.com/sitemap-category/product/2", + "https://example.com/sitemap-product/1", + "https://example.com/sitemap-product/2", }, ) for extract_args in ({}, {"extract": "product"}) @@ -319,6 +351,10 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1/page/2/product/2", "https://example.com/non-navigation/product/1", "https://example.com/non-navigation/product/2", + "https://example.com/sitemap-category/product/1", + "https://example.com/sitemap-category/product/2", + "https://example.com/sitemap-product/1", + "https://example.com/sitemap-product/2", }, ) for extract_args in ({}, {"extract": "product"}) @@ -417,6 +453,15 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: ) for extract_args in ({}, {"extract": "product"}) ), + # automatic = direct_item for product-like URLs + ( + { + "url": "https://example.com/product/1", + }, + { + "https://example.com/product/1", + }, + ), *( ( { @@ -430,6 +475,7 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1", "https://example.com/category/1/page/2", "https://example.com/non-navigation", + "https://example.com/sitemap-category", }, ) for crawl_strategy_args in ({}, {"crawl_strategy": "automatic"}) @@ -460,6 +506,7 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1", 
"https://example.com/category/1/page/2", "https://example.com/non-navigation", + "https://example.com/sitemap-category", }, ), ( @@ -472,6 +519,7 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1", "https://example.com/category/1/page/2", "https://example.com/non-navigation", + "https://example.com/sitemap-category", }, ), ( @@ -1046,13 +1094,17 @@ def test_urls(caplog): crawler = get_crawler() url = "https://example.com" - spider = EcommerceSpider.from_crawler(crawler, urls=[url]) + spider = EcommerceSpider.from_crawler( + crawler, urls=[url], crawl_strategy="navigation" + ) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url assert start_requests[0].callback == spider.parse_navigation - spider = EcommerceSpider.from_crawler(crawler, urls=url) + spider = EcommerceSpider.from_crawler( + crawler, urls=url, crawl_strategy="navigation" + ) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url @@ -1062,6 +1114,7 @@ def test_urls(caplog): spider = EcommerceSpider.from_crawler( crawler, urls="https://a.example\n \nhttps://b.example\nhttps://c.example\nfoo\n\n", + crawl_strategy="navigation", ) assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text start_requests = list(spider.start_requests()) @@ -1093,7 +1146,9 @@ def test_urls_file(): b"https://a.example\n \nhttps://b.example\nhttps://c.example\n\n" ) mock_get.return_value = response - spider = EcommerceSpider.from_crawler(crawler, urls_file=url) + spider = EcommerceSpider.from_crawler( + crawler, urls_file=url, crawl_strategy="navigation" + ) mock_get.assert_called_with(url) start_requests = list(spider.start_requests()) @@ -1205,3 +1260,182 @@ def test_modify_page_params_for_heuristics(crawl_strategy, expected_page_params) ) page_params = spider._modify_page_params_for_heuristics(page_params) assert page_params == expected_page_params + + +@pytest.mark.parametrize( + "tree,output", + ( + ( + {"sitemap.xml": ["product/1"]}, + {"product/1"}, + ), + ( + {"blog-sitemap.xml": ["product/1"]}, + set(), + ), + ( + { + "sitemap.xml": { + "news-sitemap.xml": ["product/1"], + "foo-sitemap.xml": ["product/2"], + }, + }, + {"product/2"}, + ), + ( + { + "sitemap.xml": ["a"], + }, + {"a/p"}, + ), + ( + { + "listing.xml": ["a"], + }, + {"a/p"}, + ), + ( + { + "products.xml": ["a"], + }, + {"a"}, + ), + ( + { + "foo.xml": {"products.xml": ["a"]}, + }, + {"a"}, + ), + ( + { + "category.xml": {"products.xml": ["a"]}, + }, + {"a"}, + ), + ( + { + "products.xml": {"foo.xml": ["a"]}, + }, + {"a"}, + ), + ( + { + "products.xml": {"category.xml": ["a"]}, + }, + {"a"}, + ), + ), +) +@ensureDeferred +async def test_sitemap_filtering(tree, output, mockserver): + root_dir = Path(__file__).parent / "fs.example" + subdir_name = "".join(choice(ascii_lowercase) for _ in range(7)) + base_url = f"https://{subdir_name}.fs.example" + subdir = root_dir / subdir_name + subdir.mkdir(parents=True, exist_ok=True) + + def write_sitemap(sitemap, paths): + urls = "".join(f"{base_url}/{path}" for path in paths) + body = f""" + + {urls} + """ + with (subdir / sitemap).open("w") as f: + f.write(body) + + try: + with (subdir / "robots.txt").open("w") as f: + f.write("\n".join(f"sitemap: {base_url}/{sitemap}" for sitemap in tree)) + for sitemap, content in tree.items(): + if isinstance(content, list): + write_sitemap(sitemap, content) + else: + assert isinstance(content, dict) + 
urls = "".join( + f"{base_url}/{nested_sitemap}" + for nested_sitemap in content + ) + body = f""" + + {urls} + """ + with (subdir / sitemap).open("w") as f: + f.write(body) + for nested_sitemap, urls in content.items(): + assert isinstance(urls, list) + write_sitemap(nested_sitemap, urls) + + settings = { + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_KEY": "a", + "ADDONS": {"scrapy_zyte_api.Addon": 500}, + } + crawler = get_crawler(settings=settings, spider_cls=EcommerceSpider) + actual_output = set() + + def track_item(item, response, spider): + actual_output.add(urlparse(item.url).path.lstrip("/")) + + crawler.signals.connect(track_item, signal=signals.item_scraped) + await crawler.crawl(url=base_url) + assert actual_output == output + + finally: + rmtree(subdir) + + +@pytest.mark.parametrize( + "args,robotstxt_urls", + ( + ( + {"url": "https://example.com"}, + {"https://example.com/robots.txt"}, + ), + # A trailing / is not an issue. + ( + {"url": "https://example.com/"}, + {"https://example.com/robots.txt"}, + ), + # The URL scheme is respected. + ( + {"url": "http://example.com"}, + {"http://example.com/robots.txt"}, + ), + # There is de-duplication. HTTPS is preferred where both are seen in + # input URLs. Order does not matter. + # http, https → https + ( + {"urls": ["http://example.com", "https://example.com"]}, + {"https://example.com/robots.txt"}, + ), + # https, http → https + ( + {"urls": ["https://example.com", "http://example.com"]}, + {"https://example.com/robots.txt"}, + ), + # When using crawl_strategy=auto, only domains of homepages are + # targeted. When crawl_strategy=full, all domains are targeted. + ( + {"urls": ["https://a.example", "https://b.example/foo"]}, + {"https://a.example/robots.txt"}, + ), + ( + { + "urls": ["https://a.example", "https://b.example/foo"], + "crawl_strategy": "full", + }, + {"https://a.example/robots.txt", "https://b.example/robots.txt"}, + ), + ), +) +def test_robotstxt_urls(args, robotstxt_urls): + spider = EcommerceSpider.from_crawler(get_crawler(), **args) + requests = list(spider.start_requests()) + start_urls = args.get("urls") or [args["url"]] + actual_robotstxt_url_list = [ + request.url for request in requests if request.url not in start_urls + ] + count = len(actual_robotstxt_url_list) + actual_robotstxt_urls = set(actual_robotstxt_url_list) + assert len(actual_robotstxt_urls) == count + assert actual_robotstxt_urls == robotstxt_urls diff --git a/tox.ini b/tox.ini index 2a65da7..9ab5406 100644 --- a/tox.ini +++ b/tox.ini @@ -27,6 +27,7 @@ deps = form2request==0.2.0 formasaurus==0.10.0 jmespath==0.9.5 + protego==0.3.0 pydantic==2.1 requests==2.31.0 scrapinghub==2.4.0 diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index bddd108..739516d 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -13,10 +13,13 @@ Union, cast, ) +from urllib.parse import urlparse import scrapy +from protego import Protego from pydantic import BaseModel, ConfigDict, Field, model_validator from scrapy.crawler import Crawler +from scrapy.utils.sitemap import Sitemap from scrapy_poet import DummyResponse, DynamicDeps from scrapy_spider_metadata import Args from web_poet.page_inputs.browser import BrowserResponse @@ -57,6 +60,48 @@ from typing_extensions import Self +_SKIP_KEYWORDS = { + "blog", + "news", + "magazine", + "image", + "media", +} + + +def _is_ecommerce_sitemap(url: str) -> bool: + return all(keyword not in url for keyword in 
_SKIP_KEYWORDS) + + +_is_ecommerce_url = _is_ecommerce_sitemap + + +_PRODUCT_KEYWORDS = { + "product", + "produkt", + "pdp", +} +_NAVIGATION_KEYWORDS = { + "search", + "filter", + "tag", + "cat", + "section", + "listing", +} + + +def _is_product_sitemap(url: str) -> bool: + return ( + not is_homepage(url) + and any(keyword in url for keyword in _PRODUCT_KEYWORDS) + and not any(keyword in url for keyword in _NAVIGATION_KEYWORDS) + ) + + +_is_product_url = _is_product_sitemap + + ItemTV = TypeVar("ItemTV", bound=Item) @@ -290,23 +335,25 @@ def _init_extract_from(self): priority=ARG_SETTING_PRIORITY, ) - def get_start_request(self, url): - callback = ( - self.parse_product - if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item - and self.args.extract == EcommerceExtract.product - else self.parse_navigation + def get_start_request(self, url: str) -> scrapy.Request: + targets_product = self.args.extract == EcommerceExtract.product and ( + self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item + or ( + self.args.crawl_strategy == EcommerceCrawlStrategy.automatic + and _is_product_url(url) + ) ) + callback = self.parse_product if targets_product else self.parse_navigation meta: Dict[str, Any] = { "crawling_logs": { "page_type": self.args.extract.value if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item + or targets_product else "productNavigation" }, } if ( - self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item - or self.args.extract == EcommerceExtract.productList + targets_product or self.args.extract == EcommerceExtract.productList ) and self._custom_attrs_dep: meta.setdefault("inject", []).append(self._custom_attrs_dep) if self.args.extract == EcommerceExtract.productList: @@ -347,10 +394,126 @@ def start_requests(self) -> Iterable[scrapy.Request]: callback=self.parse_search_request_template, meta=meta, ) - else: - for url in self.start_urls: + return + yield from self.robotstxt_requests() + for url in self.start_urls: + with self._log_request_exception: + yield self.get_start_request(url) + + def robotstxt_requests(self) -> Iterable[scrapy.Request]: + if self.args.crawl_strategy not in { + EcommerceCrawlStrategy.full, + EcommerceCrawlStrategy.automatic, + }: + return + domains: dict[str, str] = {} # domain: scheme, e.g. {"example.com": "https"} + for url in self.start_urls: + if ( + self.args.crawl_strategy == EcommerceCrawlStrategy.automatic + and not is_homepage(url) + ): + continue + parsed_url = urlparse(url) + domain = parsed_url.netloc + scheme = parsed_url.scheme + if domain not in domains: + domains[domain] = scheme + elif scheme == "https": + # If both http and https URLs are found for the same domain, + # prefer https. 
+ domains[domain] = scheme + for domain, scheme in domains.items(): + with self._log_request_exception: + yield scrapy.Request( + url=f"{scheme}://{domain}/robots.txt", + callback=self.parse_robotstxt, + meta={ + "crawling_logs": { + "page_type": "robots.txt", + }, + }, + ) + + def get_sitemap_request( + self, url: str, likely_leaf_type: str | None = None + ) -> scrapy.Request: + return scrapy.Request( + url=url, + callback=self.parse_sitemap, + cb_kwargs={ + "likely_leaf_type": likely_leaf_type, + }, + meta={ + "crawling_logs": { + "page_type": "sitemap", + }, + }, + ) + + def parse_robotstxt(self, response) -> Iterable[scrapy.Request]: + rp = Protego.parse(response.body.decode()) + for url in rp.sitemaps: + if _is_ecommerce_sitemap(url): with self._log_request_exception: - yield self.get_start_request(url) + yield self.get_sitemap_request(url) + + def parse_sitemap( + self, response, likely_leaf_type: str | None = None + ) -> Iterable[scrapy.Request]: + if not likely_leaf_type and _is_product_sitemap(response.url): + likely_leaf_type = "product" + sitemap = Sitemap(response.body) + if sitemap.type == "sitemapindex": + for entry in sitemap: + url = entry["loc"] + if _is_ecommerce_sitemap(url): + with self._log_request_exception: + yield self.get_sitemap_request(url, likely_leaf_type) + elif sitemap.type == "urlset": + for entry in sitemap: + url = entry["loc"] + if not _is_ecommerce_url(url): + continue + if likely_leaf_type == "product" or _is_product_url(url): + if self.args.extract != EcommerceExtract.product: + continue + with self._log_request_exception: + yield self.get_sitemap_product_request(url) + else: + with self._log_request_exception: + yield self.get_sitemap_navigation_request(url) + + def get_sitemap_product_request(self, url: str) -> scrapy.Request: + meta: dict[str, Any] = { + "crawling_logs": { + "page_type": "product", + }, + "page_params": {"full_domain": get_domain(url)}, + } + if self._custom_attrs_dep: + meta.setdefault("inject", []).append(self._custom_attrs_dep) + return scrapy.Request( + url=url, + callback=self.parse_product, + meta=meta, + ) + + def get_sitemap_navigation_request(self, url: str) -> scrapy.Request: + meta: Dict[str, Any] = { + "crawling_logs": { + "page_type": "productNavigation", + }, + "page_params": {"full_domain": get_domain(url)}, + } + if self.args.extract == EcommerceExtract.productList: + meta.setdefault("inject", []).append(ProductList) + if self._custom_attrs_dep: + meta["inject"].append(self._custom_attrs_dep) + return scrapy.Request( + url=url, + callback=self.parse_navigation, + meta=meta, + ) def parse_search_request_template( self, From e39828ef087b899d3e743dcc03e29de0157b6317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 Jan 2025 10:42:22 +0100 Subject: [PATCH 2/2] Fix Python 9 support --- zyte_spider_templates/spiders/ecommerce.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index 739516d..7038ca9 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -435,7 +435,7 @@ def robotstxt_requests(self) -> Iterable[scrapy.Request]: ) def get_sitemap_request( - self, url: str, likely_leaf_type: str | None = None + self, url: str, likely_leaf_type: Optional[str] = None ) -> scrapy.Request: return scrapy.Request( url=url, @@ -458,7 +458,7 @@ def parse_robotstxt(self, response) -> Iterable[scrapy.Request]: yield self.get_sitemap_request(url) def 
parse_sitemap( - self, response, likely_leaf_type: str | None = None + self, response, likely_leaf_type: Optional[str] = None ) -> Iterable[scrapy.Request]: if not likely_leaf_type and _is_product_sitemap(response.url): likely_leaf_type = "product"
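
A minimal, self-contained sketch of how the sitemap URL heuristics introduced
above classify URLs. The keyword sets and the two predicates mirror the ones
added in zyte_spider_templates/spiders/ecommerce.py; the example URLs are made
up, and is_homepage() is a simplified stand-in for the helper the spider calls.

```
from urllib.parse import urlparse

_SKIP_KEYWORDS = {"blog", "news", "magazine", "image", "media"}
_PRODUCT_KEYWORDS = {"product", "produkt", "pdp"}
_NAVIGATION_KEYWORDS = {"search", "filter", "tag", "cat", "section", "listing"}


def is_homepage(url: str) -> bool:
    # Simplified stand-in: a URL with an empty or "/" path is a homepage.
    return urlparse(url).path in ("", "/")


def _is_ecommerce_sitemap(url: str) -> bool:
    # Skip sitemaps whose URL suggests non-product content (blogs, news, media).
    return all(keyword not in url for keyword in _SKIP_KEYWORDS)


def _is_product_sitemap(url: str) -> bool:
    # A non-homepage URL that contains a product keyword and no navigation
    # keyword is assumed to list product pages directly.
    return (
        not is_homepage(url)
        and any(keyword in url for keyword in _PRODUCT_KEYWORDS)
        and not any(keyword in url for keyword in _NAVIGATION_KEYWORDS)
    )


assert not _is_ecommerce_sitemap("https://shop.example/blog-sitemap.xml")
assert _is_ecommerce_sitemap("https://shop.example/sitemap-products.xml")
assert _is_product_sitemap("https://shop.example/sitemap-products.xml")
# Keyword matching is plain substring matching on the whole URL, so "cat"
# inside "category" counts as a navigation keyword and blocks the product
# classification.
assert not _is_product_sitemap("https://shop.example/product-category.xml")
```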