From dfb4df0826c1cc1e2e490d2ba9e80081044e734b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Jan 2025 18:34:39 +0100 Subject: [PATCH 1/2] Basic e-commerce sitemap support --- setup.py | 1 + tests/.gitignore | 1 + tests/mockserver.py | 114 +++++++++- tests/test_ecommerce.py | 248 ++++++++++++++++++++- tox.ini | 1 + zyte_spider_templates/spiders/ecommerce.py | 185 ++++++++++++++- 6 files changed, 525 insertions(+), 25 deletions(-) create mode 100644 tests/.gitignore diff --git a/setup.py b/setup.py index 64532cb..0d52461 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ "form2request>=0.2.0", "formasaurus>=0.10.0", "jmespath>=0.9.5", + "protego>=0.3.0", "pydantic>=2.1", "requests>=2.31.0", "scrapinghub >= 2.4.0", diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000..e6b9a97 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1 @@ +/fs.example/ diff --git a/tests/mockserver.py b/tests/mockserver.py index fe2ff20..b7afdcf 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -3,9 +3,12 @@ import socket import sys import time +from base64 import b64encode from importlib import import_module +from pathlib import Path from subprocess import PIPE, Popen from typing import Any, Dict +from urllib.parse import urlparse from scrapy_zyte_api.responses import _API_RESPONSE from twisted.internet import reactor @@ -36,20 +39,30 @@ class DefaultResource(Resource): https://example.com/category/1 https://example.com/category/1/page/2 https://example.com/non-navigation + https://example.com/sitemap-category + https://example.com/sitemap-product/1 + https://example.com/sitemap-product/2 ``` When browserHtml is requested (for any URL, listed above or not), it is a minimal HTML with an anchor tag pointing to https://example.com/non-navigation. - When productNavigation is requested, nextPage and subCategories are filled - accordingly. productNavigation.items always has 2 product URLs, which are - the result of appending ``/product/`` to the request URL. - https://example.com/non-navigation is not reachable through - productNavigation. + When productNavigation is requested, nextPage and subCategories are + filled accordingly. productNavigation.items always has 2 product URLs, + which are the result of appending ``/product/`` to the request URL. - When product or productList is requested, an item with the current URL is - always returned. + The following pages are not reachable through productNavigation: + + - https://example.com/non-navigation is in an a element of the HTML + of every page. + + - https://example.com/sitemap-* URLs are linked from + https://example.com/sitemap*.xml, which are linked from + https://example.com/robots.txt. + + When product or productList is requested, an item with the current URL + is always returned. All output also includes unsupported links (mailto:…). @@ -61,6 +74,16 @@ class DefaultResource(Resource): - https://jobs.offsite.example/jobs/1 (jobPosting) - https://jobs.offsite.example/jobs/2 (jobPosting) + + - For fs.example subdomains, a matching file is looked for in the file + system. If found, its content is base64-encoded and returned as + httpResponseBody. Else, a product response is returned. + + For example, for the URL https://abcdefg.fs.example/foo, if a file + exists at tests/fs.example/abcdefg/foo, its content is returned as + httpResponseBody. Otherwise, the response is as empty as possible based + on input parameters, except when requesting productNavigation, which + always includes an item at /p. 
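+
+      For example, a productNavigation request for
+      https://abcdefg.fs.example/foo with no matching file on disk gets a
+      response along these lines (abridged, only the relevant keys shown):
+
+      ```
+      {
+          "url": "https://abcdefg.fs.example/foo",
+          "productNavigation": {
+              "url": "https://abcdefg.fs.example/foo",
+              "items": [{"url": "https://abcdefg.fs.example/foo/p"}]
+          }
+      }
+      ```
+
+      Requesting productNavigation for the bare domain root instead yields an
+      empty items list.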
""" def getChild(self, path, request): @@ -99,6 +122,82 @@ def render_POST(self, request): } return json.dumps(response_data).encode() + if request_data["url"] == "https://example.com/robots.txt": + assert request_data["httpResponseBody"] is True + body = b""" + Sitemap: https://example.com/sitemap.xml # Link to category + SiTeMaP: https://example.com/sitemap-index.xml # Links to products + """ + response_data["httpResponseBody"] = b64encode(body).decode() + return json.dumps(response_data).encode() + + if request_data["url"] == "https://example.com/sitemap.xml": + assert request_data["httpResponseBody"] is True + body = b""" + + + + https://example.com/sitemap-category + + + """ + response_data["httpResponseBody"] = b64encode(body).decode() + return json.dumps(response_data).encode() + + if request_data["url"] == "https://example.com/sitemap-index.xml": + assert request_data["httpResponseBody"] is True + body = b""" + + + + https://example.com/sitemap-products.xml + + + """ + response_data["httpResponseBody"] = b64encode(body).decode() + return json.dumps(response_data).encode() + + if request_data["url"] == "https://example.com/sitemap-products.xml": + assert request_data["httpResponseBody"] is True + body = b""" + + + + https://example.com/sitemap-product/1 + + + https://example.com/sitemap-product/2 + + + """ + response_data["httpResponseBody"] = b64encode(body).decode() + return json.dumps(response_data).encode() + + if "fs.example" in request_data["url"]: + parsed_url = urlparse(request_data["url"]) + subdir_name = parsed_url.netloc[: -len(".fs.example")] + root_dir = Path(__file__).parent / "fs.example" + subdir = root_dir / subdir_name + filepath = subdir / parsed_url.path.lstrip("/") + if filepath != subdir and filepath.exists(): + response_data["httpResponseBody"] = b64encode( + filepath.read_bytes() + ).decode() + else: + if request_data.get("browserHtml", False) is True: + response_data["browserHtml"] = "a" + if request_data.get("product", False) is True: + response_data["product"] = {"url": response_data["url"]} + if request_data.get("productNavigation", False) is True: + items = [] + if filepath != subdir: + items = [{"url": f"{request_data['url'].rstrip('/')}/p"}] + response_data["productNavigation"] = { + "url": response_data["url"], + "items": items, + } + return json.dumps(response_data).encode() + non_navigation_url = "https://example.com/non-navigation" html = f"""""" if request_data.get("browserHtml", False) is True: @@ -119,6 +218,7 @@ def render_POST(self, request): if ( "/page/" not in request_data["url"] and "/non-navigation" not in request_data["url"] + and "/sitemap" not in request_data["url"] ): kwargs["nextPage"] = { "url": f"{request_data['url'].rstrip('/')}/page/2" diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index b64fcb4..6f40606 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -1,6 +1,11 @@ import logging +from pathlib import Path +from random import choice +from shutil import rmtree +from string import ascii_lowercase from typing import Iterable, List, cast from unittest.mock import MagicMock, call, patch +from urllib.parse import urlparse import pytest import requests @@ -35,9 +40,11 @@ def test_start_requests(): crawler = get_crawler() spider = EcommerceSpider.from_crawler(crawler, url=url) requests = list(spider.start_requests()) - assert len(requests) == 1 - assert requests[0].url == url - assert requests[0].callback == spider.parse_navigation + assert len(requests) == 2 + assert requests[0].url == 
"https://example.com/robots.txt" + assert requests[0].callback == spider.parse_robotstxt + assert requests[1].url == url + assert requests[1].callback == spider.parse_navigation def test_start_requests_crawling_logs_page_type(): @@ -46,7 +53,8 @@ def test_start_requests_crawling_logs_page_type(): spider = EcommerceSpider.from_crawler(crawler, url=url) requests = list(spider.start_requests()) - assert requests[0].meta["crawling_logs"]["page_type"] == "productNavigation" + assert requests[0].meta["crawling_logs"]["page_type"] == "robots.txt" + assert requests[1].meta["crawling_logs"]["page_type"] == "productNavigation" spider = EcommerceSpider.from_crawler( crawler, url=url, crawl_strategy="direct_item" @@ -261,6 +269,10 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1/page/2/product/2", "https://example.com/non-navigation/product/1", "https://example.com/non-navigation/product/2", + "https://example.com/sitemap-category/product/1", + "https://example.com/sitemap-category/product/2", + "https://example.com/sitemap-product/1", + "https://example.com/sitemap-product/2", }, ) for crawl_strategy_args in ({}, {"crawl_strategy": "automatic"}) @@ -283,6 +295,22 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: for crawl_strategy_args in ({}, {"crawl_strategy": "automatic"}) for extract_args in ({}, {"extract": "product"}) ), + # automatic works like direct_item for product-like URLs when + # extracting products. + *( + ( + { + "url": "https://example.com/product/1", + **crawl_strategy_args, + **extract_args, + }, + { + "https://example.com/product/1", + }, + ) + for crawl_strategy_args in ({}, {"crawl_strategy": "automatic"}) + for extract_args in ({}, {"extract": "product"}) + ), *( ( { @@ -301,6 +329,10 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1/page/2/product/2", "https://example.com/non-navigation/product/1", "https://example.com/non-navigation/product/2", + "https://example.com/sitemap-category/product/1", + "https://example.com/sitemap-category/product/2", + "https://example.com/sitemap-product/1", + "https://example.com/sitemap-product/2", }, ) for extract_args in ({}, {"extract": "product"}) @@ -319,6 +351,10 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1/page/2/product/2", "https://example.com/non-navigation/product/1", "https://example.com/non-navigation/product/2", + "https://example.com/sitemap-category/product/1", + "https://example.com/sitemap-category/product/2", + "https://example.com/sitemap-product/1", + "https://example.com/sitemap-product/2", }, ) for extract_args in ({}, {"extract": "product"}) @@ -417,6 +453,15 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: ) for extract_args in ({}, {"extract": "product"}) ), + # automatic = direct_item for product-like URLs + ( + { + "url": "https://example.com/product/1", + }, + { + "https://example.com/product/1", + }, + ), *( ( { @@ -430,6 +475,7 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1", "https://example.com/category/1/page/2", "https://example.com/non-navigation", + "https://example.com/sitemap-category", }, ) for crawl_strategy_args in ({}, {"crawl_strategy": "automatic"}) @@ -460,6 +506,7 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1", 
"https://example.com/category/1/page/2", "https://example.com/non-navigation", + "https://example.com/sitemap-category", }, ), ( @@ -472,6 +519,7 @@ def _get_requests(navigation: ProductNavigation) -> List[scrapy.Request]: "https://example.com/category/1", "https://example.com/category/1/page/2", "https://example.com/non-navigation", + "https://example.com/sitemap-category", }, ), ( @@ -1046,13 +1094,17 @@ def test_urls(caplog): crawler = get_crawler() url = "https://example.com" - spider = EcommerceSpider.from_crawler(crawler, urls=[url]) + spider = EcommerceSpider.from_crawler( + crawler, urls=[url], crawl_strategy="navigation" + ) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url assert start_requests[0].callback == spider.parse_navigation - spider = EcommerceSpider.from_crawler(crawler, urls=url) + spider = EcommerceSpider.from_crawler( + crawler, urls=url, crawl_strategy="navigation" + ) start_requests = list(spider.start_requests()) assert len(start_requests) == 1 assert start_requests[0].url == url @@ -1062,6 +1114,7 @@ def test_urls(caplog): spider = EcommerceSpider.from_crawler( crawler, urls="https://a.example\n \nhttps://b.example\nhttps://c.example\nfoo\n\n", + crawl_strategy="navigation", ) assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text start_requests = list(spider.start_requests()) @@ -1093,7 +1146,9 @@ def test_urls_file(): b"https://a.example\n \nhttps://b.example\nhttps://c.example\n\n" ) mock_get.return_value = response - spider = EcommerceSpider.from_crawler(crawler, urls_file=url) + spider = EcommerceSpider.from_crawler( + crawler, urls_file=url, crawl_strategy="navigation" + ) mock_get.assert_called_with(url) start_requests = list(spider.start_requests()) @@ -1205,3 +1260,182 @@ def test_modify_page_params_for_heuristics(crawl_strategy, expected_page_params) ) page_params = spider._modify_page_params_for_heuristics(page_params) assert page_params == expected_page_params + + +@pytest.mark.parametrize( + "tree,output", + ( + ( + {"sitemap.xml": ["product/1"]}, + {"product/1"}, + ), + ( + {"blog-sitemap.xml": ["product/1"]}, + set(), + ), + ( + { + "sitemap.xml": { + "news-sitemap.xml": ["product/1"], + "foo-sitemap.xml": ["product/2"], + }, + }, + {"product/2"}, + ), + ( + { + "sitemap.xml": ["a"], + }, + {"a/p"}, + ), + ( + { + "listing.xml": ["a"], + }, + {"a/p"}, + ), + ( + { + "products.xml": ["a"], + }, + {"a"}, + ), + ( + { + "foo.xml": {"products.xml": ["a"]}, + }, + {"a"}, + ), + ( + { + "category.xml": {"products.xml": ["a"]}, + }, + {"a"}, + ), + ( + { + "products.xml": {"foo.xml": ["a"]}, + }, + {"a"}, + ), + ( + { + "products.xml": {"category.xml": ["a"]}, + }, + {"a"}, + ), + ), +) +@ensureDeferred +async def test_sitemap_filtering(tree, output, mockserver): + root_dir = Path(__file__).parent / "fs.example" + subdir_name = "".join(choice(ascii_lowercase) for _ in range(7)) + base_url = f"https://{subdir_name}.fs.example" + subdir = root_dir / subdir_name + subdir.mkdir(parents=True, exist_ok=True) + + def write_sitemap(sitemap, paths): + urls = "".join(f"{base_url}/{path}" for path in paths) + body = f""" + + {urls} + """ + with (subdir / sitemap).open("w") as f: + f.write(body) + + try: + with (subdir / "robots.txt").open("w") as f: + f.write("\n".join(f"sitemap: {base_url}/{sitemap}" for sitemap in tree)) + for sitemap, content in tree.items(): + if isinstance(content, list): + write_sitemap(sitemap, content) + else: + assert isinstance(content, dict) + 
urls = "".join( + f"{base_url}/{nested_sitemap}" + for nested_sitemap in content + ) + body = f""" + + {urls} + """ + with (subdir / sitemap).open("w") as f: + f.write(body) + for nested_sitemap, urls in content.items(): + assert isinstance(urls, list) + write_sitemap(nested_sitemap, urls) + + settings = { + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_KEY": "a", + "ADDONS": {"scrapy_zyte_api.Addon": 500}, + } + crawler = get_crawler(settings=settings, spider_cls=EcommerceSpider) + actual_output = set() + + def track_item(item, response, spider): + actual_output.add(urlparse(item.url).path.lstrip("/")) + + crawler.signals.connect(track_item, signal=signals.item_scraped) + await crawler.crawl(url=base_url) + assert actual_output == output + + finally: + rmtree(subdir) + + +@pytest.mark.parametrize( + "args,robotstxt_urls", + ( + ( + {"url": "https://example.com"}, + {"https://example.com/robots.txt"}, + ), + # A trailing / is not an issue. + ( + {"url": "https://example.com/"}, + {"https://example.com/robots.txt"}, + ), + # The URL scheme is respected. + ( + {"url": "http://example.com"}, + {"http://example.com/robots.txt"}, + ), + # There is de-duplication. HTTPS is preferred where both are seen in + # input URLs. Order does not matter. + # http, https → https + ( + {"urls": ["http://example.com", "https://example.com"]}, + {"https://example.com/robots.txt"}, + ), + # https, http → https + ( + {"urls": ["https://example.com", "http://example.com"]}, + {"https://example.com/robots.txt"}, + ), + # When using crawl_strategy=auto, only domains of homepages are + # targeted. When crawl_strategy=full, all domains are targeted. + ( + {"urls": ["https://a.example", "https://b.example/foo"]}, + {"https://a.example/robots.txt"}, + ), + ( + { + "urls": ["https://a.example", "https://b.example/foo"], + "crawl_strategy": "full", + }, + {"https://a.example/robots.txt", "https://b.example/robots.txt"}, + ), + ), +) +def test_robotstxt_urls(args, robotstxt_urls): + spider = EcommerceSpider.from_crawler(get_crawler(), **args) + requests = list(spider.start_requests()) + start_urls = args.get("urls") or [args["url"]] + actual_robotstxt_url_list = [ + request.url for request in requests if request.url not in start_urls + ] + count = len(actual_robotstxt_url_list) + actual_robotstxt_urls = set(actual_robotstxt_url_list) + assert len(actual_robotstxt_urls) == count + assert actual_robotstxt_urls == robotstxt_urls diff --git a/tox.ini b/tox.ini index 2a65da7..9ab5406 100644 --- a/tox.ini +++ b/tox.ini @@ -27,6 +27,7 @@ deps = form2request==0.2.0 formasaurus==0.10.0 jmespath==0.9.5 + protego==0.3.0 pydantic==2.1 requests==2.31.0 scrapinghub==2.4.0 diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index bddd108..739516d 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -13,10 +13,13 @@ Union, cast, ) +from urllib.parse import urlparse import scrapy +from protego import Protego from pydantic import BaseModel, ConfigDict, Field, model_validator from scrapy.crawler import Crawler +from scrapy.utils.sitemap import Sitemap from scrapy_poet import DummyResponse, DynamicDeps from scrapy_spider_metadata import Args from web_poet.page_inputs.browser import BrowserResponse @@ -57,6 +60,48 @@ from typing_extensions import Self +_SKIP_KEYWORDS = { + "blog", + "news", + "magazine", + "image", + "media", +} + + +def _is_ecommerce_sitemap(url: str) -> bool: + return all(keyword not in url for keyword in 
_SKIP_KEYWORDS) + + +_is_ecommerce_url = _is_ecommerce_sitemap + + +_PRODUCT_KEYWORDS = { + "product", + "produkt", + "pdp", +} +_NAVIGATION_KEYWORDS = { + "search", + "filter", + "tag", + "cat", + "section", + "listing", +} + + +def _is_product_sitemap(url: str) -> bool: + return ( + not is_homepage(url) + and any(keyword in url for keyword in _PRODUCT_KEYWORDS) + and not any(keyword in url for keyword in _NAVIGATION_KEYWORDS) + ) + + +_is_product_url = _is_product_sitemap + + ItemTV = TypeVar("ItemTV", bound=Item) @@ -290,23 +335,25 @@ def _init_extract_from(self): priority=ARG_SETTING_PRIORITY, ) - def get_start_request(self, url): - callback = ( - self.parse_product - if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item - and self.args.extract == EcommerceExtract.product - else self.parse_navigation + def get_start_request(self, url: str) -> scrapy.Request: + targets_product = self.args.extract == EcommerceExtract.product and ( + self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item + or ( + self.args.crawl_strategy == EcommerceCrawlStrategy.automatic + and _is_product_url(url) + ) ) + callback = self.parse_product if targets_product else self.parse_navigation meta: Dict[str, Any] = { "crawling_logs": { "page_type": self.args.extract.value if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item + or targets_product else "productNavigation" }, } if ( - self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item - or self.args.extract == EcommerceExtract.productList + targets_product or self.args.extract == EcommerceExtract.productList ) and self._custom_attrs_dep: meta.setdefault("inject", []).append(self._custom_attrs_dep) if self.args.extract == EcommerceExtract.productList: @@ -347,10 +394,126 @@ def start_requests(self) -> Iterable[scrapy.Request]: callback=self.parse_search_request_template, meta=meta, ) - else: - for url in self.start_urls: + return + yield from self.robotstxt_requests() + for url in self.start_urls: + with self._log_request_exception: + yield self.get_start_request(url) + + def robotstxt_requests(self) -> Iterable[scrapy.Request]: + if self.args.crawl_strategy not in { + EcommerceCrawlStrategy.full, + EcommerceCrawlStrategy.automatic, + }: + return + domains: dict[str, str] = {} # domain: scheme, e.g. {"example.com": "https"} + for url in self.start_urls: + if ( + self.args.crawl_strategy == EcommerceCrawlStrategy.automatic + and not is_homepage(url) + ): + continue + parsed_url = urlparse(url) + domain = parsed_url.netloc + scheme = parsed_url.scheme + if domain not in domains: + domains[domain] = scheme + elif scheme == "https": + # If both http and https URLs are found for the same domain, + # prefer https. 
+ domains[domain] = scheme + for domain, scheme in domains.items(): + with self._log_request_exception: + yield scrapy.Request( + url=f"{scheme}://{domain}/robots.txt", + callback=self.parse_robotstxt, + meta={ + "crawling_logs": { + "page_type": "robots.txt", + }, + }, + ) + + def get_sitemap_request( + self, url: str, likely_leaf_type: str | None = None + ) -> scrapy.Request: + return scrapy.Request( + url=url, + callback=self.parse_sitemap, + cb_kwargs={ + "likely_leaf_type": likely_leaf_type, + }, + meta={ + "crawling_logs": { + "page_type": "sitemap", + }, + }, + ) + + def parse_robotstxt(self, response) -> Iterable[scrapy.Request]: + rp = Protego.parse(response.body.decode()) + for url in rp.sitemaps: + if _is_ecommerce_sitemap(url): with self._log_request_exception: - yield self.get_start_request(url) + yield self.get_sitemap_request(url) + + def parse_sitemap( + self, response, likely_leaf_type: str | None = None + ) -> Iterable[scrapy.Request]: + if not likely_leaf_type and _is_product_sitemap(response.url): + likely_leaf_type = "product" + sitemap = Sitemap(response.body) + if sitemap.type == "sitemapindex": + for entry in sitemap: + url = entry["loc"] + if _is_ecommerce_sitemap(url): + with self._log_request_exception: + yield self.get_sitemap_request(url, likely_leaf_type) + elif sitemap.type == "urlset": + for entry in sitemap: + url = entry["loc"] + if not _is_ecommerce_url(url): + continue + if likely_leaf_type == "product" or _is_product_url(url): + if self.args.extract != EcommerceExtract.product: + continue + with self._log_request_exception: + yield self.get_sitemap_product_request(url) + else: + with self._log_request_exception: + yield self.get_sitemap_navigation_request(url) + + def get_sitemap_product_request(self, url: str) -> scrapy.Request: + meta: dict[str, Any] = { + "crawling_logs": { + "page_type": "product", + }, + "page_params": {"full_domain": get_domain(url)}, + } + if self._custom_attrs_dep: + meta.setdefault("inject", []).append(self._custom_attrs_dep) + return scrapy.Request( + url=url, + callback=self.parse_product, + meta=meta, + ) + + def get_sitemap_navigation_request(self, url: str) -> scrapy.Request: + meta: Dict[str, Any] = { + "crawling_logs": { + "page_type": "productNavigation", + }, + "page_params": {"full_domain": get_domain(url)}, + } + if self.args.extract == EcommerceExtract.productList: + meta.setdefault("inject", []).append(ProductList) + if self._custom_attrs_dep: + meta["inject"].append(self._custom_attrs_dep) + return scrapy.Request( + url=url, + callback=self.parse_navigation, + meta=meta, + ) def parse_search_request_template( self, From e39828ef087b899d3e743dcc03e29de0157b6317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 Jan 2025 10:42:22 +0100 Subject: [PATCH 2/2] Fix Python 9 support --- zyte_spider_templates/spiders/ecommerce.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index 739516d..7038ca9 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -435,7 +435,7 @@ def robotstxt_requests(self) -> Iterable[scrapy.Request]: ) def get_sitemap_request( - self, url: str, likely_leaf_type: str | None = None + self, url: str, likely_leaf_type: Optional[str] = None ) -> scrapy.Request: return scrapy.Request( url=url, @@ -458,7 +458,7 @@ def parse_robotstxt(self, response) -> Iterable[scrapy.Request]: yield self.get_sitemap_request(url) def 
parse_sitemap( - self, response, likely_leaf_type: str | None = None + self, response, likely_leaf_type: Optional[str] = None ) -> Iterable[scrapy.Request]: if not likely_leaf_type and _is_product_sitemap(response.url): likely_leaf_type = "product"
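
A minimal, self-contained sketch of how the sitemap URL heuristics introduced
above classify URLs. The keyword sets and the two predicates mirror the ones
added in zyte_spider_templates/spiders/ecommerce.py; the example URLs are made
up, and is_homepage() is a simplified stand-in for the helper the spider calls.

```
from urllib.parse import urlparse

_SKIP_KEYWORDS = {"blog", "news", "magazine", "image", "media"}
_PRODUCT_KEYWORDS = {"product", "produkt", "pdp"}
_NAVIGATION_KEYWORDS = {"search", "filter", "tag", "cat", "section", "listing"}


def is_homepage(url: str) -> bool:
    # Simplified stand-in: a URL with an empty or "/" path is a homepage.
    return urlparse(url).path in ("", "/")


def _is_ecommerce_sitemap(url: str) -> bool:
    # Skip sitemaps whose URL suggests non-product content (blogs, news, media).
    return all(keyword not in url for keyword in _SKIP_KEYWORDS)


def _is_product_sitemap(url: str) -> bool:
    # A non-homepage URL that contains a product keyword and no navigation
    # keyword is assumed to list product pages directly.
    return (
        not is_homepage(url)
        and any(keyword in url for keyword in _PRODUCT_KEYWORDS)
        and not any(keyword in url for keyword in _NAVIGATION_KEYWORDS)
    )


assert not _is_ecommerce_sitemap("https://shop.example/blog-sitemap.xml")
assert _is_ecommerce_sitemap("https://shop.example/sitemap-products.xml")
assert _is_product_sitemap("https://shop.example/sitemap-products.xml")
# Keyword matching is plain substring matching on the whole URL, so "cat"
# inside "category" counts as a navigation keyword and blocks the product
# classification.
assert not _is_product_sitemap("https://shop.example/product-category.xml")
```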