Merge pull request #52 from zytedata/combined-strategy
Combining `full` and `navigation` strategies
kmike authored Aug 19, 2024
2 parents 61d85b2 + 89fad94 commit ca9f214
Showing 5 changed files with 423 additions and 18 deletions.
81 changes: 72 additions & 9 deletions tests/test_ecommerce.py
@@ -32,9 +32,9 @@ def test_parameters():

EcommerceSpider(url="https://example.com")
EcommerceSpider(
url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.full
url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.automatic
)
EcommerceSpider(url="https://example.com", crawl_strategy="full")
EcommerceSpider(url="https://example.com", crawl_strategy="automatic")

with pytest.raises(ValidationError):
EcommerceSpider(url="https://example.com", crawl_strategy="unknown")
@@ -465,19 +465,30 @@ def test_metadata():
"enum": ["httpResponseBody", "browserHtml"],
},
"crawl_strategy": {
"default": "full",
"default": "automatic",
"description": "Determines how the start URL and follow-up URLs are crawled.",
"enumMeta": {
"automatic": {
"description": (
"Automatically use the best crawl strategy based on the given "
"URL inputs. If given a homepage URL, it would attempt to crawl "
"as many products it can discover. Otherwise, it attempt to "
"crawl the products on a given page category."
),
"title": "Automatic",
},
"full": {
"description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
"description": (
"Follow most links within the domain of URL in an attempt "
"to discover and extract as many products as possible."
),
"title": "Full",
},
"navigation": {
"description": (
"Follow pagination, subcategories, and "
"product detail pages. Pagination Only is a "
"better choice if the target URL does not "
"have subcategories, or if Zyte API is "
"Follow pagination, subcategories, and product detail "
"pages. Pagination Only is a better choice if the target "
"URL does not have subcategories, or if Zyte API is "
"misidentifying some URLs as subcategories."
),
"title": "Navigation",
@@ -490,7 +501,7 @@
},
},
"title": "Crawl strategy",
"enum": ["full", "navigation", "pagination_only"],
"enum": ["automatic", "full", "navigation", "pagination_only"],
"type": "string",
},
},
@@ -727,3 +738,55 @@ def test_urls_file():
assert start_requests[0].url == "https://a.example"
assert start_requests[1].url == "https://b.example"
assert start_requests[2].url == "https://c.example"


@pytest.mark.parametrize(
"url,has_full_domain",
(
("https://example.com", (True, True, False, False)),
("https://example.com/", (True, True, False, False)),
("https://example.com/index.htm", (True, True, False, False)),
("https://example.com/index.html", (True, True, False, False)),
("https://example.com/index.php", (True, True, False, False)),
("https://example.com/home", (True, True, False, False)),
("https://example.com/some/category", (False, True, False, False)),
("https://example.com/some/category?pid=123", (False, True, False, False)),
),
)
def test_get_start_request_default_strategy(url, has_full_domain):
def assert_meta(has_page_params):
meta = {"crawling_logs": {"page_type": "productNavigation"}}
if has_page_params:
meta["page_params"] = {"full_domain": "example.com"}
assert result.meta == meta

for i, crawl_strategy in enumerate(
["automatic", "full", "navigation", "pagination_only"]
):
spider = EcommerceSpider.from_crawler(
get_crawler(), url=url, crawl_strategy=crawl_strategy
)
result = spider.get_start_request(url)
assert result.url == url
assert result.callback == spider.parse_navigation
assert_meta(has_full_domain[i])


@pytest.mark.parametrize(
"crawl_strategy,expected_page_params",
(
("automatic", {}),
("full", {"full_domain": "example.com"}),
("navigation", {}),
("pagination_only", {}),
),
)
def test_modify_page_params_for_heuristics(crawl_strategy, expected_page_params):
url = "https://example.com"
page_params = {"full_domain": "example.com"}

spider = EcommerceSpider.from_crawler(
get_crawler(), url=url, crawl_strategy=crawl_strategy
)
page_params = spider._modify_page_params_for_heuristics(page_params)
assert page_params == expected_page_params
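
The two new tests above pin down the behaviour of the combined strategy: `automatic` turns full-domain crawling on only when the start URL looks like a homepage, `full` always turns it on, and the other strategies never do; on follow-up requests, only `full` keeps the `full_domain` page param. A minimal sketch of that decision logic, assuming the helper names `start_page_params` and `keep_page_params_for_heuristics` (illustrative only, not the spider's actual methods) and using the real `is_homepage` heuristic added in this PR:

```python
from urllib.parse import urlparse

from zyte_spider_templates.heuristics import is_homepage


def start_page_params(url: str, crawl_strategy: str) -> dict:
    # Start request: "full" always crawls the whole domain; "automatic"
    # does so only when the start URL looks like a homepage.
    if crawl_strategy == "full" or (
        crawl_strategy == "automatic" and is_homepage(url)
    ):
        return {"full_domain": urlparse(url).netloc}
    return {}


def keep_page_params_for_heuristics(page_params: dict, crawl_strategy: str) -> dict:
    # Follow-up requests: only "full" keeps the full-domain heuristic;
    # with "automatic" it applies to the homepage start request only.
    if crawl_strategy != "full":
        page_params.pop("full_domain", None)
    return page_params
```
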
63 changes: 62 additions & 1 deletion tests/test_heuristics.py
@@ -1,6 +1,6 @@
import pytest

from zyte_spider_templates.heuristics import might_be_category
from zyte_spider_templates.heuristics import is_homepage, might_be_category


@pytest.mark.parametrize(
@@ -50,3 +50,64 @@
)
def test_might_be_category(test_input, expected):
assert might_be_category(test_input) == expected


LOCALES = (
"/us/en",
"/en/us",
"/us-en",
"/us_en",
"/AT_en",
"/pt-br",
"/PT-br",
"/en-us",
"/en-AT",
"/en",
"/uk",
)


@pytest.mark.parametrize(
"url_path,expected",
(
("", True),
("/", True),
("/index", True),
("/index.htm", True),
("/index.html", True),
("/index.php", True),
("/home", True),
("/home/", True),
("?ref=abc", False),
("/some/category", False),
("/some/category?query=2123", False),
),
)
@pytest.mark.parametrize("locale", LOCALES)
def test_is_homepage(locale, url_path, expected):
assert is_homepage("https://example.com" + url_path) == expected
assert is_homepage("https://example.com" + locale + url_path) == expected


@pytest.mark.parametrize(
"url",
(
"https://example.com/zz/dd",
"https://example.com/dd/zz",
"https://example.com/dd-zz",
"https://example.com/dd_zz",
"https://example.com/DD_zz",
"https://example.com/bb-DD",
"https://example.com/DD-BB",
"https://example.com/dd-zz",
"https://example.com/dd-ZZ",
"https://example.com/dd",
"https://example.com/zz",
),
)
def test_is_homepage_localization_bad(url):
"""If the url locale pattern doesn't match the country and language codes,
then it should not be identified as homepage.
"""
assert not is_homepage(url)
assert not is_homepage(url + "/")
188 changes: 188 additions & 0 deletions zyte_spider_templates/_lang_codes.py
@@ -0,0 +1,188 @@
# ISO 639-1 language codes
# Taken from https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes

LANG_CODES = [
"ab",
"aa",
"af",
"ak",
"sq",
"am",
"ar",
"an",
"hy",
"as",
"av",
"ae",
"ay",
"az",
"bm",
"ba",
"eu",
"be",
"bn",
"bi",
"bs",
"br",
"bg",
"my",
"ca",
"ch",
"ce",
"ny",
"zh",
"cu",
"cv",
"kw",
"co",
"cr",
"hr",
"cs",
"da",
"dv",
"nl",
"dz",
"en",
"eo",
"et",
"ee",
"fo",
"fj",
"fi",
"fr",
"fy",
"ff",
"gd",
"gl",
"lg",
"ka",
"de",
"el",
"kl",
"gn",
"gu",
"ht",
"ha",
"he",
"hz",
"hi",
"ho",
"hu",
"is",
"io",
"ig",
"id",
"ia",
"ie",
"iu",
"ik",
"ga",
"it",
"ja",
"jv",
"kn",
"kr",
"ks",
"kk",
"km",
"ki",
"rw",
"ky",
"kv",
"kg",
"ko",
"kj",
"ku",
"lo",
"la",
"lv",
"li",
"ln",
"lt",
"lu",
"lb",
"mk",
"mg",
"ms",
"ml",
"mt",
"gv",
"mi",
"mr",
"mh",
"mn",
"na",
"nv",
"nd",
"nr",
"ng",
"ne",
"no",
"nb",
"nn",
"ii",
"oc",
"oj",
"or",
"om",
"os",
"pi",
"ps",
"fa",
"pl",
"pt",
"pa",
"qu",
"ro",
"rm",
"rn",
"ru",
"se",
"sm",
"sg",
"sa",
"sc",
"sr",
"sn",
"sd",
"si",
"sk",
"sl",
"so",
"st",
"es",
"su",
"sw",
"ss",
"sv",
"tl",
"ty",
"tg",
"ta",
"tt",
"te",
"th",
"bo",
"ti",
"to",
"ts",
"tn",
"tr",
"tk",
"tw",
"ug",
"uk",
"ur",
"uz",
"ve",
"vi",
"vo",
"wa",
"cy",
"wo",
"xh",
"yi",
"yo",
"za",
"zu",
]
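
`_lang_codes.py` is plain data; consumers such as the heuristics module only need fast membership tests. A tiny usage sketch, assuming nothing beyond this module itself:

```python
from zyte_spider_templates._lang_codes import LANG_CODES

# Convert once to a set for O(1) membership checks.
LANG_CODE_SET = frozenset(LANG_CODES)

assert "en" in LANG_CODE_SET
assert "pt" in LANG_CODE_SET
assert "zz" not in LANG_CODE_SET  # not an ISO 639-1 code
```
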