From 536b9f26137bd6a3e3dc5093444ea51840930320 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Mon, 30 Dec 2024 15:30:47 +0500
Subject: [PATCH 1/4] Add search query support to the job posting spider.
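
With search_queries set, the spider now requests a search request
template for each input URL and submits one search request per query
through it, instead of following the navigation pages directly.
Queries can be passed as a list or as a newline-separated string, and
with extract_from=browserHtml the template is extracted from browser
HTML.

A minimal sketch of running the spider with this argument, assuming a
Scrapy setup already configured for scrapy-poet and Zyte API (the
jobs.example URL is a placeholder):

    from scrapy.crawler import CrawlerProcess

    from zyte_spider_templates.spiders.job_posting import JobPostingSpider

    # Settings enabling scrapy-poet and Zyte API are assumed to be
    # configured elsewhere; they are omitted here.
    process = CrawlerProcess()
    process.crawl(
        JobPostingSpider,
        url="https://jobs.example",
        # One query per line; a list of strings also works.
        search_queries="software engineer\ndata analyst",
    )
    process.start()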
---
 tests/test_job_posting.py                    | 101 ++++++++++++++++++-
 zyte_spider_templates/spiders/job_posting.py |  70 ++++++++++++-
 2 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/tests/test_job_posting.py b/tests/test_job_posting.py
index af1af86..112d3a3 100644
--- a/tests/test_job_posting.py
+++ b/tests/test_job_posting.py
@@ -9,7 +9,14 @@
 from scrapy import signals
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import get_spider_metadata
-from zyte_common_items import JobPosting, JobPostingNavigation, ProbabilityRequest
+from web_poet import BrowserResponse
+from zyte_common_items import (
+    JobPosting,
+    JobPostingNavigation,
+    ProbabilityRequest,
+    SearchRequestTemplate,
+    SearchRequestTemplateMetadata,
+)
 
 from zyte_spider_templates._geolocations import (
     GEOLOCATION_OPTIONS,
@@ -322,6 +329,19 @@ def test_metadata():
            "title": "URLs file",
            "type": "string",
        },
+        "search_queries": {
+            "default": [],
+            "description": (
+                "A list of search queries, one per line, to submit "
+                "using the search form found on each input URL. Only "
+                "works for input URLs that support search. May not "
+                "work on every website."
+            ),
+            "items": {"type": "string"},
+            "title": "Search Queries",
+            "type": "array",
+            "widget": "textarea",
+        },
        "crawl_strategy": {
            "default": "navigation",
            "description": (
@@ -628,3 +648,82 @@ def track_item(item, response, spider):
         "https://jobs.offsite.example/jobs/1",
         "https://jobs.offsite.example/jobs/2",
     }
+
+
+def test_search_queries():
+    crawler = get_crawler()
+    url = "https://example.com"
+
+    spider = JobPostingSpider.from_crawler(crawler, url=url, search_queries="foo bar")
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert start_requests[0].url == url
+    assert start_requests[0].callback == spider.parse_search_request_template
+    assert spider.args.search_queries == ["foo bar"]
+
+    spider = JobPostingSpider.from_crawler(crawler, url=url, search_queries="foo\nbar")
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert start_requests[0].url == url
+    assert start_requests[0].callback == spider.parse_search_request_template
+    assert spider.args.search_queries == ["foo", "bar"]
+
+    spider = JobPostingSpider.from_crawler(
+        crawler, url=url, search_queries=["foo", "bar"]
+    )
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert start_requests[0].url == url
+    assert start_requests[0].callback == spider.parse_search_request_template
+    assert spider.args.search_queries == ["foo", "bar"]
+
+
+def test_search_queries_extract_from():
+    crawler = get_crawler()
+    url = "https://example.com"
+
+    spider = JobPostingSpider.from_crawler(crawler, url=url, search_queries="foo")
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert "inject" not in start_requests[0].meta
+
+    spider = JobPostingSpider.from_crawler(
+        crawler, url=url, search_queries="foo", extract_from="httpResponseBody"
+    )
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert "inject" not in start_requests[0].meta
+
+    spider = JobPostingSpider.from_crawler(
+        crawler, url=url, search_queries="foo", extract_from="browserHtml"
+    )
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert start_requests[0].meta["inject"] == [BrowserResponse]
+
+
+@pytest.mark.parametrize(
+    ("probability", "yields_items"),
+    (
+        (None, True),  # Default
+        (-1.0, False),
+        (0.0, False),  # page.no_item_found()
+        (1.0, True),
+    ),
+)
+def test_parse_search_request_template_probability(probability, yields_items):
+    crawler = get_crawler()
+    spider = JobPostingSpider.from_crawler(
+        crawler, url="https://example.com", search_queries="foo"
+    )
+    search_request_template = SearchRequestTemplate(url="https://example.com")
+    if probability is not None:
+        search_request_template.metadata = SearchRequestTemplateMetadata(
+            probability=probability
+        )
+    items = list(
+        spider.parse_search_request_template(
+            DummyResponse("https://example.com"), search_request_template, DynamicDeps()
+        )
+    )
+    assert items if yields_items else not items
diff --git a/zyte_spider_templates/spiders/job_posting.py b/zyte_spider_templates/spiders/job_posting.py
index c8e01f4..b646fa7 100644
--- a/zyte_spider_templates/spiders/job_posting.py
+++ b/zyte_spider_templates/spiders/job_posting.py
@@ -1,7 +1,17 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, Union, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Union,
+    cast,
+)
 
 import requests
 import scrapy
@@ -9,11 +19,13 @@
 from scrapy.crawler import Crawler
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
+from web_poet import BrowserResponse
 from zyte_common_items import (
     CustomAttributes,
     JobPosting,
     JobPostingNavigation,
     ProbabilityRequest,
+    SearchRequestTemplate,
 )
 
 from zyte_spider_templates.spiders.base import (
@@ -27,9 +39,11 @@
 from ..params import (
     CustomAttrsInputParam,
     CustomAttrsMethodParam,
+    ExtractFrom,
     ExtractFromParam,
     GeolocationParam,
     MaxRequestsParam,
+    SearchQueriesParam,
     UrlParam,
     UrlsFileParam,
     UrlsParam,
@@ -74,6 +88,22 @@ class JobPostingCrawlStrategyParam(BaseModel):
     )
 
 
+class JobPostingSearchQueriesParam(SearchQueriesParam):
+    search_queries: List[str] = Field(
+        title="Search Queries",
+        description=(
+            "A list of search queries, one per line, to submit using the "
+            "search form found on each input URL. Only works for input URLs "
+            "that support search. May not work on every website."
+        ),
+        default_factory=list,
+        json_schema_extra={
+            "default": [],
+            "widget": "textarea",
+        },
+    )
+
+
 class JobPostingSpiderParams(
     CustomAttrsMethodParam,
     CustomAttrsInputParam,
@@ -81,6 +111,7 @@ class JobPostingSpiderParams(
     MaxRequestsParam,
     GeolocationParam,
     JobPostingCrawlStrategyParam,
+    JobPostingSearchQueriesParam,
     UrlsFileParam,
     UrlsParam,
     UrlParam,
@@ -173,9 +204,42 @@ def get_start_request(self, url):
         )
 
     def start_requests(self) -> Iterable[scrapy.Request]:
-        for url in self.start_urls:
+        if self.args.search_queries:
+            for url in self.start_urls:
+                meta: Dict[str, Any] = {
+                    "crawling_logs": {"page_type": "searchRequestTemplate"},
+                }
+                if self.args.extract_from == ExtractFrom.browserHtml:
+                    meta["inject"] = [BrowserResponse]
+                with self._log_request_exception:
+                    yield scrapy.Request(
+                        url=url,
+                        callback=self.parse_search_request_template,
+                        meta=meta,
+                    )
+        else:
+            for url in self.start_urls:
+                with self._log_request_exception:
+                    yield self.get_start_request(url)
+
+    def parse_search_request_template(
+        self,
+        response: DummyResponse,
+        search_request_template: SearchRequestTemplate,
+        dynamic: DynamicDeps,
+    ) -> Iterable[scrapy.Request]:
+        probability = search_request_template.get_probability()
+        if probability is not None and probability <= 0:
+            return
+        for query in self.args.search_queries:
+            meta: Dict[str, Any] = {
+                "crawling_logs": {"page_type": "jobPostingNavigation"},
+            }
             with self._log_request_exception:
-                yield self.get_start_request(url)
+                yield search_request_template.request(query=query).to_scrapy(
+                    callback=self.parse_navigation,
+                    meta=meta,
+                )
 
     def parse_navigation(
         self, response: DummyResponse, navigation: JobPostingNavigation

From 277d1b6704c05506fab44d3c25578864384688e4 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Mon, 10 Mar 2025 11:37:15 +0500
Subject: [PATCH 2/4] Add job posting search tests with fake-zyte-api.
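
These tests exercise the search flow end to end against a local fake
Zyte API server and the jobs site from zyte-test-websites, so no real
API calls are made. With the tox deps installed in the current
environment, something like the following should run only the new
tests (the -k expression is just a local convenience, not part of the
suite):

    py.test tests/test_job_posting.py -k extract_search

The mixed-case "dEsIgn" query matches 437 postings on the test site,
which should also exercise case-insensitive search handling.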
---
 tests/test_job_posting.py | 24 ++++++++++++++++++++++++
 tox.ini                   |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/tests/test_job_posting.py b/tests/test_job_posting.py
index 6dde24c..8419fb0 100644
--- a/tests/test_job_posting.py
+++ b/tests/test_job_posting.py
@@ -824,3 +824,27 @@ async def test_extract_jobs_404(zyte_api_server, jobs_website):
         {"url": str(jobs_website.make_url("/jobs/foo"))},
     )
     assert not items
+
+
+@deferred_f_from_coro_f
+async def test_extract_search(zyte_api_server, jobs_website):
+    items = await crawl_fake_zyte_api(
+        zyte_api_server,
+        JobPostingSpider,
+        {
+            "url": str(jobs_website.make_url("/")),
+            "search_queries": "dEsIgn",
+            "max_requests": 10000,
+        },
+    )
+    assert len(items) == 437
+
+
+@deferred_f_from_coro_f
+async def test_extract_search_empty(zyte_api_server, jobs_website):
+    items = await crawl_fake_zyte_api(
+        zyte_api_server,
+        JobPostingSpider,
+        {"url": str(jobs_website.make_url("/")), "search_queries": "does-not-exist"},
+    )
+    assert not items
diff --git a/tox.ini b/tox.ini
index 3bbec04..e4574f0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -7,7 +7,7 @@ deps =
     pytest-cov
     pytest-twisted
     freezegun
-    zyte-test-websites @ git+https://github.com/zytedata/zyte-test-websites@1172d5d
+    zyte-test-websites @ git+https://github.com/zytedata/zyte-test-websites@d9cb2f8
     fake-zyte-api @ git+https://github.com/zytedata/fake-zyte-api@5ba47df
 commands =
     py.test \

From 6bdc8be059d01eb8e6bab9d23db3cf87bde600a6 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Mon, 10 Mar 2025 14:11:45 +0500
Subject: [PATCH 3/4] Update the fake-zyte-api dep version.
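
No functional changes; this only points the pinned fake-zyte-api
dependency in tox.ini at a newer commit.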
---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index e4574f0..1a095b0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -8,7 +8,7 @@ deps =
     pytest-twisted
     freezegun
     zyte-test-websites @ git+https://github.com/zytedata/zyte-test-websites@d9cb2f8
-    fake-zyte-api @ git+https://github.com/zytedata/fake-zyte-api@5ba47df
+    fake-zyte-api @ git+https://github.com/zytedata/fake-zyte-api@f598933
 commands =
     py.test \
         --cov-report=html:coverage-html \

From 06866b36dbd6c7e6a47286e73f3375dc41c4f816 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Mon, 10 Mar 2025 14:20:41 +0500
Subject: [PATCH 4/4] Add a search test for a page without a search form.

---
 tests/test_job_posting.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/test_job_posting.py b/tests/test_job_posting.py
index 8419fb0..0a0a5d3 100644
--- a/tests/test_job_posting.py
+++ b/tests/test_job_posting.py
@@ -848,3 +848,14 @@ async def test_extract_search_empty(zyte_api_server, jobs_website):
         {"url": str(jobs_website.make_url("/")), "search_queries": "does-not-exist"},
     )
     assert not items
+
+
+@deferred_f_from_coro_f
+async def test_extract_search_no_form(zyte_api_server, jobs_website, caplog):
+    items = await crawl_fake_zyte_api(
+        zyte_api_server,
+        JobPostingSpider,
+        {"url": str(jobs_website.make_url("/jobs/1")), "search_queries": "foo"},
+    )
+    assert not items
+    assert "Cannot build a search request template" in caplog.text