Skip to content

Commit

Permalink
zyte-common-items ≥ 0.25.0
Browse files (browse the repository at this point in the history)
  • Loading branch information
Gallaecio committed Nov 11, 2024
1 parent dd18ec4 commit 4b771fb
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 35 deletions.
2 changes: 1 addition & 1 deletion docs/customization/pages.rst
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,4 @@ For example:
class ExampleComSearchRequestTemplatePage(BaseSearchRequestTemplatePage):
@field
def url(self):
return "https://example.com/search?q={{ keyword|quote_plus }}"
return "https://example.com/search?q={{ query|quote_plus }}"
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"scrapy-spider-metadata>=0.2.0",
"scrapy-zyte-api[provider]>=0.23.0",
"web-poet>=0.17.1",
"zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@search-request-template-ancestry",
"zyte-common-items>=0.25.0",
],
classifiers=[
"Development Status :: 3 - Alpha",
Expand Down
44 changes: 22 additions & 22 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://query.example.com/search?q={{ keyword|quote_plus }}",
"url": "https://query.example.com/search?q={{ query|quote_plus }}",
},
),
# Microdata example from Google
Expand All @@ -57,7 +57,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://query.example.com/search?q={{ keyword|quote_plus }}",
"url": "https://query.example.com/search?q={{ query|quote_plus }}",
},
),
# Non-compliant JSON-LD that uses a JSON array for potentialAction
Expand Down Expand Up @@ -89,7 +89,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://query.example.com/search?q={{ keyword|quote_plus }}",
"url": "https://query.example.com/search?q={{ query|quote_plus }}",
},
),
# Non-default placeholder, JSON-LD
Expand Down Expand Up @@ -119,7 +119,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://query.example.com/search?q={{ keyword|quote_plus }}&dont_replace={search_term_string}",
"url": "https://query.example.com/search?q={{ query|quote_plus }}&dont_replace={search_term_string}",
},
),
# Non-default placeholder, Microdata
Expand All @@ -136,7 +136,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://query.example.com/search?q={{ keyword|quote_plus }}&dont_replace={search_term_string}",
"url": "https://query.example.com/search?q={{ query|quote_plus }}&dont_replace={search_term_string}",
},
),
# JSON-LD, WebSite isPartOf WebPage
Expand Down Expand Up @@ -170,7 +170,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://query.example.com/search?q={{ keyword|quote_plus }}",
"url": "https://query.example.com/search?q={{ query|quote_plus }}",
},
),
# Relative URL, JSON-LD
Expand Down Expand Up @@ -200,7 +200,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://example.com/search?q={{ keyword|quote_plus }}",
"url": "https://example.com/search?q={{ query|quote_plus }}",
},
),
# Relative URL, Microdata
Expand All @@ -217,7 +217,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://example.com/search?q={{ keyword|quote_plus }}",
"url": "https://example.com/search?q={{ query|quote_plus }}",
},
),
# Wrong escaping in JSON-LD
Expand Down Expand Up @@ -245,7 +245,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://example.com/search?a=b&q={{ keyword|quote_plus }}",
"url": "https://example.com/search?a=b&q={{ query|quote_plus }}",
},
),
# Query in path, JSON-LD
Expand Down Expand Up @@ -275,7 +275,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://example.com/s/{{ keyword|urlencode }}",
"url": "https://example.com/s/{{ query|urlencode }}",
},
),
# Query in path, Microdata
Expand All @@ -292,7 +292,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://example.com/s/{{ keyword|urlencode }}",
"url": "https://example.com/s/{{ query|urlencode }}",
},
),
# No potentialAction, JSON-LD
Expand Down Expand Up @@ -415,7 +415,7 @@
{"search_request_builders": ["extruct"]},
{"error": "Try enabling browser rendering"},
),
# No keyword name, JSON-LD
# No query variable name, JSON-LD
(
b"""
<html>
Expand All @@ -442,10 +442,10 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://query.example.com/search?q={{ keyword|quote_plus }}",
"url": "https://query.example.com/search?q={{ query|quote_plus }}",
},
),
# No keyword name, Microdata
# No query variable name, Microdata
(
b"""
<div itemscope itemtype="https://schema.org/WebSite">
Expand All @@ -459,7 +459,7 @@
""",
{"search_request_builders": ["extruct"]},
{
"url": "https://query.example.com/search?q={{ keyword|quote_plus }}",
"url": "https://query.example.com/search?q={{ query|quote_plus }}",
},
),
# Formasaurus and form heuristics #-----------------------------------#
Expand All @@ -476,7 +476,7 @@
</form>
""",
{
"url": "https://example.com?q={{ keyword|quote_plus }}",
"url": "https://example.com?q={{ query|quote_plus }}",
},
),
# No form
Expand Down Expand Up @@ -517,7 +517,7 @@
f"""<a href="https://example.com/search?{prefix}{q}=example{suffix}""".encode(),
{"search_request_builders": ["link_heuristics"]},
{
"url": f"https://example.com/search?{prefix}{q}={{{{ keyword|quote_plus }}}}{suffix}"
"url": f"https://example.com/search?{prefix}{q}={{{{ query|quote_plus }}}}{suffix}"
},
)
for q in (
Expand Down Expand Up @@ -577,14 +577,14 @@
# priority, if both the Formasaurus builder and the form
# heuristics builder output the same URL, that one is used
# instead.
({}, {"url": "https://example.com/form?q={{ keyword|quote_plus }}"}),
({}, {"url": "https://example.com/form?q={{ query|quote_plus }}"}),
(
{"search_request_builder_strategy": "popular"},
{"url": "https://example.com/form?q={{ keyword|quote_plus }}"},
{"url": "https://example.com/form?q={{ query|quote_plus }}"},
),
(
{"search_request_builder_strategy": "first"},
{"url": "https://example.com/metadata?q={{ keyword|quote_plus }}"},
{"url": "https://example.com/metadata?q={{ query|quote_plus }}"},
),
# Strategies only take into account the specified builders, and
# in the supplied order.
Expand All @@ -593,7 +593,7 @@
"search_request_builder_strategy": "first",
"search_request_builders": ["formasaurus", "extruct"],
},
{"url": "https://example.com/form?q={{ keyword|quote_plus }}"},
{"url": "https://example.com/form?q={{ query|quote_plus }}"},
),
(
{
Expand All @@ -604,7 +604,7 @@
"link_heuristics",
],
},
{"url": "https://example.com/metadata?q={{ keyword|quote_plus }}"},
{"url": "https://example.com/metadata?q={{ query|quote_plus }}"},
),
# Unsupported strategies trigger a ValueError
(
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ deps =
scrapy-spider-metadata==0.2.0
scrapy-zyte-api[provider]==0.23.0
web-poet==0.17.1
; zyte-common-items
zyte-common-items==0.25.0

[testenv:mypy]
deps =
Expand Down
16 changes: 7 additions & 9 deletions zyte_spider_templates/pages/search_request_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,11 @@ def _item_from_form_heuristics(self):
except NotImplementedError:
raise ValueError("form2request does not support the target search form")
return SearchRequestTemplate(
url=request_data.url.replace(_PLACEHOLDER, "{{ keyword|quote_plus }}"),
url=request_data.url.replace(_PLACEHOLDER, "{{ query|quote_plus }}"),
method=request_data.method,
headers=request_data.headers,
body=request_data.body.decode().replace(
_PLACEHOLDER, "{{ keyword|quote_plus }}"
_PLACEHOLDER, "{{ query|quote_plus }}"
),
)

Expand Down Expand Up @@ -166,11 +166,9 @@ def _item_from_extruct(self):
"Could not find HTML metadata to compose a search request template."
)
parts = url_template.split("?", maxsplit=1)
parts[0] = parts[0].replace(f"{{{query_field}}}", "{{ keyword|urlencode }}")
parts[0] = parts[0].replace(f"{{{query_field}}}", "{{ query|urlencode }}")
if len(parts) > 1:
parts[1] = parts[1].replace(
f"{{{query_field}}}", "{{ keyword|quote_plus }}"
)
parts[1] = parts[1].replace(f"{{{query_field}}}", "{{ query|quote_plus }}")
url = "?".join(parts)
url = str(self.response.urljoin(url))
url = html.unescape(url)
Expand Down Expand Up @@ -218,7 +216,7 @@ def _item_from_link_heuristics(self):
url = add_or_replace_parameters(
search_link.url, {k: _PLACEHOLDER for k in search_params}
)
url = url.replace(_PLACEHOLDER, "{{ keyword|quote_plus }}")
url = url.replace(_PLACEHOLDER, "{{ query|quote_plus }}")
return SearchRequestTemplate(
url=url,
method="GET",
Expand Down Expand Up @@ -250,11 +248,11 @@ def _item_from_formasaurus(self):
except NotImplementedError:
raise ValueError("form2request does not support the target search form")
return SearchRequestTemplate(
url=request_data.url.replace(_PLACEHOLDER, "{{ keyword|quote_plus }}"),
url=request_data.url.replace(_PLACEHOLDER, "{{ query|quote_plus }}"),
method=request_data.method,
headers=request_data.headers,
body=request_data.body.decode().replace(
_PLACEHOLDER, "{{ keyword|quote_plus }}"
_PLACEHOLDER, "{{ query|quote_plus }}"
),
)

Expand Down
2 changes: 1 addition & 1 deletion zyte_spider_templates/spiders/ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def parse_search_request_template(
if probability is None or probability <= 0:
return
for query in self.args.search_queries:
yield search_request_template.request(keyword=query).to_scrapy(
yield search_request_template.request(query=query).to_scrapy(
callback=self.parse_navigation,
meta={
"crawling_logs": {"page_type": "productNavigation"},
Expand Down

0 comments on commit 4b771fb

Please sign in to comment.