Merge pull request #52 from zytedata/combined-strategy
Combining `full` and `navigation` strategies
kmike authored Aug 19, 2024
2 parents 61d85b2 + 89fad94 commit ca9f214
Showing 5 changed files with 423 additions and 18 deletions.
81 changes: 72 additions & 9 deletions tests/test_ecommerce.py
@@ -32,9 +32,9 @@ def test_parameters():

EcommerceSpider(url="https://example.com")
EcommerceSpider(
url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.full
url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.automatic
)
EcommerceSpider(url="https://example.com", crawl_strategy="full")
EcommerceSpider(url="https://example.com", crawl_strategy="automatic")

with pytest.raises(ValidationError):
EcommerceSpider(url="https://example.com", crawl_strategy="unknown")
@@ -465,19 +465,30 @@ def test_metadata():
"enum": ["httpResponseBody", "browserHtml"],
},
"crawl_strategy": {
"default": "full",
"default": "automatic",
"description": "Determines how the start URL and follow-up URLs are crawled.",
"enumMeta": {
"automatic": {
"description": (
"Automatically use the best crawl strategy based on the given "
"URL inputs. If given a homepage URL, it would attempt to crawl "
"as many products it can discover. Otherwise, it attempt to "
"crawl the products on a given page category."
),
"title": "Automatic",
},
"full": {
"description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
"description": (
"Follow most links within the domain of URL in an attempt "
"to discover and extract as many products as possible."
),
"title": "Full",
},
"navigation": {
"description": (
"Follow pagination, subcategories, and "
"product detail pages. Pagination Only is a "
"better choice if the target URL does not "
"have subcategories, or if Zyte API is "
"Follow pagination, subcategories, and product detail "
"pages. Pagination Only is a better choice if the target "
"URL does not have subcategories, or if Zyte API is "
"misidentifying some URLs as subcategories."
),
"title": "Navigation",
@@ -490,7 +501,7 @@
},
},
"title": "Crawl strategy",
"enum": ["full", "navigation", "pagination_only"],
"enum": ["automatic", "full", "navigation", "pagination_only"],
"type": "string",
},
},
@@ -727,3 +738,55 @@ def test_urls_file():
assert start_requests[0].url == "https://a.example"
assert start_requests[1].url == "https://b.example"
assert start_requests[2].url == "https://c.example"


@pytest.mark.parametrize(
"url,has_full_domain",
(
("https://example.com", (True, True, False, False)),
("https://example.com/", (True, True, False, False)),
("https://example.com/index.htm", (True, True, False, False)),
("https://example.com/index.html", (True, True, False, False)),
("https://example.com/index.php", (True, True, False, False)),
("https://example.com/home", (True, True, False, False)),
("https://example.com/some/category", (False, True, False, False)),
("https://example.com/some/category?pid=123", (False, True, False, False)),
),
)
def test_get_start_request_default_strategy(url, has_full_domain):
def assert_meta(has_page_params):
meta = {"crawling_logs": {"page_type": "productNavigation"}}
if has_page_params:
meta["page_params"] = {"full_domain": "example.com"}
assert result.meta == meta

for i, crawl_strategy in enumerate(
["automatic", "full", "navigation", "pagination_only"]
):
spider = EcommerceSpider.from_crawler(
get_crawler(), url=url, crawl_strategy=crawl_strategy
)
result = spider.get_start_request(url)
assert result.url == url
assert result.callback == spider.parse_navigation
assert_meta(has_full_domain[i])


@pytest.mark.parametrize(
"crawl_strategy,expected_page_params",
(
("automatic", {}),
("full", {"full_domain": "example.com"}),
("navigation", {}),
("pagination_only", {}),
),
)
def test_modify_page_params_for_heuristics(crawl_strategy, expected_page_params):
url = "https://example.com"
page_params = {"full_domain": "example.com"}

spider = EcommerceSpider.from_crawler(
get_crawler(), url=url, crawl_strategy=crawl_strategy
)
page_params = spider._modify_page_params_for_heuristics(page_params)
assert page_params == expected_page_params
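
The two new tests above pin down the behaviour of the combined strategy: `automatic` turns full-domain crawling on only when the start URL looks like a homepage, `full` always turns it on, and the other strategies never do; on follow-up requests, only `full` keeps the `full_domain` page param. A minimal sketch of that decision logic, assuming the helper names `start_page_params` and `keep_page_params_for_heuristics` (illustrative only, not the spider's actual methods) and using the real `is_homepage` heuristic added in this PR:

```python
from urllib.parse import urlparse

from zyte_spider_templates.heuristics import is_homepage


def start_page_params(url: str, crawl_strategy: str) -> dict:
    # Start request: "full" always crawls the whole domain; "automatic"
    # does so only when the start URL looks like a homepage.
    if crawl_strategy == "full" or (
        crawl_strategy == "automatic" and is_homepage(url)
    ):
        return {"full_domain": urlparse(url).netloc}
    return {}


def keep_page_params_for_heuristics(page_params: dict, crawl_strategy: str) -> dict:
    # Follow-up requests: only "full" keeps the full-domain heuristic;
    # with "automatic" it applies to the homepage start request only.
    if crawl_strategy != "full":
        page_params.pop("full_domain", None)
    return page_params
```
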
63 changes: 62 additions & 1 deletion tests/test_heuristics.py
@@ -1,6 +1,6 @@
import pytest

from zyte_spider_templates.heuristics import might_be_category
from zyte_spider_templates.heuristics import is_homepage, might_be_category


@pytest.mark.parametrize(
@@ -50,3 +50,64 @@
)
def test_might_be_category(test_input, expected):
assert might_be_category(test_input) == expected


LOCALES = (
"/us/en",
"/en/us",
"/us-en",
"/us_en",
"/AT_en",
"/pt-br",
"/PT-br",
"/en-us",
"/en-AT",
"/en",
"/uk",
)


@pytest.mark.parametrize(
"url_path,expected",
(
("", True),
("/", True),
("/index", True),
("/index.htm", True),
("/index.html", True),
("/index.php", True),
("/home", True),
("/home/", True),
("?ref=abc", False),
("/some/category", False),
("/some/category?query=2123", False),
),
)
@pytest.mark.parametrize("locale", LOCALES)
def test_is_homepage(locale, url_path, expected):
assert is_homepage("https://example.com" + url_path) == expected
assert is_homepage("https://example.com" + locale + url_path) == expected


@pytest.mark.parametrize(
"url",
(
"https://example.com/zz/dd",
"https://example.com/dd/zz",
"https://example.com/dd-zz",
"https://example.com/dd_zz",
"https://example.com/DD_zz",
"https://example.com/bb-DD",
"https://example.com/DD-BB",
"https://example.com/dd-zz",
"https://example.com/dd-ZZ",
"https://example.com/dd",
"https://example.com/zz",
),
)
def test_is_homepage_localization_bad(url):
"""If the url locale pattern doesn't match the country and language codes,
then it should not be identified as homepage.
"""
assert not is_homepage(url)
assert not is_homepage(url + "/")
188 changes: 188 additions & 0 deletions zyte_spider_templates/_lang_codes.py
@@ -0,0 +1,188 @@
# ISO 639-1 language codes
# Taken from https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes

LANG_CODES = [
"ab",
"aa",
"af",
"ak",
"sq",
"am",
"ar",
"an",
"hy",
"as",
"av",
"ae",
"ay",
"az",
"bm",
"ba",
"eu",
"be",
"bn",
"bi",
"bs",
"br",
"bg",
"my",
"ca",
"ch",
"ce",
"ny",
"zh",
"cu",
"cv",
"kw",
"co",
"cr",
"hr",
"cs",
"da",
"dv",
"nl",
"dz",
"en",
"eo",
"et",
"ee",
"fo",
"fj",
"fi",
"fr",
"fy",
"ff",
"gd",
"gl",
"lg",
"ka",
"de",
"el",
"kl",
"gn",
"gu",
"ht",
"ha",
"he",
"hz",
"hi",
"ho",
"hu",
"is",
"io",
"ig",
"id",
"ia",
"ie",
"iu",
"ik",
"ga",
"it",
"ja",
"jv",
"kn",
"kr",
"ks",
"kk",
"km",
"ki",
"rw",
"ky",
"kv",
"kg",
"ko",
"kj",
"ku",
"lo",
"la",
"lv",
"li",
"ln",
"lt",
"lu",
"lb",
"mk",
"mg",
"ms",
"ml",
"mt",
"gv",
"mi",
"mr",
"mh",
"mn",
"na",
"nv",
"nd",
"nr",
"ng",
"ne",
"no",
"nb",
"nn",
"ii",
"oc",
"oj",
"or",
"om",
"os",
"pi",
"ps",
"fa",
"pl",
"pt",
"pa",
"qu",
"ro",
"rm",
"rn",
"ru",
"se",
"sm",
"sg",
"sa",
"sc",
"sr",
"sn",
"sd",
"si",
"sk",
"sl",
"so",
"st",
"es",
"su",
"sw",
"ss",
"sv",
"tl",
"ty",
"tg",
"ta",
"tt",
"te",
"th",
"bo",
"ti",
"to",
"ts",
"tn",
"tr",
"tk",
"tw",
"ug",
"uk",
"ur",
"uz",
"ve",
"vi",
"vo",
"wa",
"cy",
"wo",
"xh",
"yi",
"yo",
"za",
"zu",
]
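
`_lang_codes.py` is plain data; consumers such as the heuristics module only need fast membership tests. A tiny usage sketch, assuming nothing beyond this module itself:

```python
from zyte_spider_templates._lang_codes import LANG_CODES

# Convert once to a set for O(1) membership checks.
LANG_CODE_SET = frozenset(LANG_CODES)

assert "en" in LANG_CODE_SET
assert "pt" in LANG_CODE_SET
assert "zz" not in LANG_CODE_SET  # not an ISO 639-1 code
```
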