diff --git a/tests/test_job_posting.py b/tests/test_job_posting.py
index 15210e3..0a0a5d3 100644
--- a/tests/test_job_posting.py
+++ b/tests/test_job_posting.py
@@ -14,7 +14,14 @@
 from scrapy.utils.defer import deferred_f_from_coro_f
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import get_spider_metadata
-from zyte_common_items import JobPosting, JobPostingNavigation, ProbabilityRequest
+from web_poet import BrowserResponse
+from zyte_common_items import (
+    JobPosting,
+    JobPostingNavigation,
+    ProbabilityRequest,
+    SearchRequestTemplate,
+    SearchRequestTemplateMetadata,
+)
 
 from zyte_spider_templates._geolocations import (
     GEOLOCATION_OPTIONS,
@@ -338,6 +345,19 @@ def test_metadata():
                     "title": "URLs file",
                     "type": "string",
                 },
+                "search_queries": {
+                    "default": [],
+                    "description": (
+                        "A list of search queries, one per line, to submit "
+                        "using the search form found on each input URL. Only "
+                        "works for input URLs that support search. May not "
+                        "work on every website."
+                    ),
+                    "items": {"type": "string"},
+                    "title": "Search Queries",
+                    "type": "array",
+                    "widget": "textarea",
+                },
                 "crawl_strategy": {
                     "default": "navigation",
                     "description": (
@@ -643,6 +663,85 @@ def track_item(item, response, spider):
     }
 
 
+def test_search_queries():
+    crawler = get_crawler()
+    url = "https://example.com"
+
+    spider = JobPostingSpider.from_crawler(crawler, url=url, search_queries="foo bar")
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert start_requests[0].url == url
+    assert start_requests[0].callback == spider.parse_search_request_template
+    assert spider.args.search_queries == ["foo bar"]
+
+    spider = JobPostingSpider.from_crawler(crawler, url=url, search_queries="foo\nbar")
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert start_requests[0].url == url
+    assert start_requests[0].callback == spider.parse_search_request_template
+    assert spider.args.search_queries == ["foo", "bar"]
+
+    spider = JobPostingSpider.from_crawler(
+        crawler, url=url, search_queries=["foo", "bar"]
+    )
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert start_requests[0].url == url
+    assert start_requests[0].callback == spider.parse_search_request_template
+    assert spider.args.search_queries == ["foo", "bar"]
+
+
+def test_search_queries_extract_from():
+    crawler = get_crawler()
+    url = "https://example.com"
+
+    spider = JobPostingSpider.from_crawler(crawler, url=url, search_queries="foo")
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert "inject" not in start_requests[0].meta
+
+    spider = JobPostingSpider.from_crawler(
+        crawler, url=url, search_queries="foo", extract_from="httpResponseBody"
+    )
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert "inject" not in start_requests[0].meta
+
+    spider = JobPostingSpider.from_crawler(
+        crawler, url=url, search_queries="foo", extract_from="browserHtml"
+    )
+    start_requests = list(spider.start_requests())
+    assert len(start_requests) == 1
+    assert start_requests[0].meta["inject"] == [BrowserResponse]
+
+
+@pytest.mark.parametrize(
+    ("probability", "yields_items"),
+    (
+        (None, True),  # Default
+        (-1.0, False),
+        (0.0, False),  # page.no_item_found()
+        (1.0, True),
+    ),
+)
+def test_parse_search_request_template_probability(probability, yields_items):
+    crawler = get_crawler()
+    spider = JobPostingSpider.from_crawler(
+        crawler, url="https://example.com", search_queries="foo"
+    )
+    search_request_template = SearchRequestTemplate(url="https://example.com")
+    if probability is not None:
+        search_request_template.metadata = SearchRequestTemplateMetadata(
+            probability=probability
+        )
+    items = list(
+        spider.parse_search_request_template(
+            DummyResponse("https://example.com"), search_request_template, DynamicDeps()
+        )
+    )
+    assert items if yields_items else not items
+
+
 @deferred_f_from_coro_f
 async def test_extract_jobs(zyte_api_server, jobs_website):
     items = await crawl_fake_zyte_api(
@@ -725,3 +824,38 @@ async def test_extract_jobs_404(zyte_api_server, jobs_website):
         {"url": str(jobs_website.make_url("/jobs/foo"))},
     )
     assert not items
+
+
+@deferred_f_from_coro_f
+async def test_extract_search(zyte_api_server, jobs_website):
+    items = await crawl_fake_zyte_api(
+        zyte_api_server,
+        JobPostingSpider,
+        {
+            "url": str(jobs_website.make_url("/")),
+            "search_queries": "dEsIgn",
+            "max_requests": 10000,
+        },
+    )
+    assert len(items) == 437
+
+
+@deferred_f_from_coro_f
+async def test_extract_search_empty(zyte_api_server, jobs_website):
+    items = await crawl_fake_zyte_api(
+        zyte_api_server,
+        JobPostingSpider,
+        {"url": str(jobs_website.make_url("/")), "search_queries": "does-not-exist"},
+    )
+    assert not items
+
+
+@deferred_f_from_coro_f
+async def test_extract_search_no_form(zyte_api_server, jobs_website, caplog):
+    items = await crawl_fake_zyte_api(
+        zyte_api_server,
+        JobPostingSpider,
+        {"url": str(jobs_website.make_url("/jobs/1")), "search_queries": "foo"},
+    )
+    assert not items
+    assert "Cannot build a search request template" in caplog.text
diff --git a/tox.ini b/tox.ini
index 3bbec04..1a095b0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -7,8 +7,8 @@ deps =
     pytest-cov
     pytest-twisted
    freezegun
-    zyte-test-websites @ git+https://github.com/zytedata/zyte-test-websites@1172d5d
-    fake-zyte-api @ git+https://github.com/zytedata/fake-zyte-api@5ba47df
+    zyte-test-websites @ git+https://github.com/zytedata/zyte-test-websites@d9cb2f8
+    fake-zyte-api @ git+https://github.com/zytedata/fake-zyte-api@f598933
 commands =
     py.test \
     --cov-report=html:coverage-html \
diff --git a/zyte_spider_templates/spiders/job_posting.py b/zyte_spider_templates/spiders/job_posting.py
index 9e4d84a..e270025 100644
--- a/zyte_spider_templates/spiders/job_posting.py
+++ b/zyte_spider_templates/spiders/job_posting.py
@@ -1,18 +1,30 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, Union, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Union,
+    cast,
+)
 
 import scrapy
 from pydantic import BaseModel, ConfigDict, Field
 from scrapy.crawler import Crawler
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
+from web_poet import BrowserResponse
 from zyte_common_items import (
     CustomAttributes,
     JobPosting,
     JobPostingNavigation,
     ProbabilityRequest,
+    SearchRequestTemplate,
 )
 
 from zyte_spider_templates.spiders.base import (
@@ -25,9 +37,11 @@
 from ..params import (
     CustomAttrsInputParam,
     CustomAttrsMethodParam,
+    ExtractFrom,
     ExtractFromParam,
     GeolocationParam,
     MaxRequestsParam,
+    SearchQueriesParam,
     UrlParam,
     UrlsFileParam,
     UrlsParam,
@@ -72,6 +86,22 @@ class JobPostingCrawlStrategyParam(BaseModel):
     )
 
 
+class JobPostingSearchQueriesParam(SearchQueriesParam):
+    search_queries: List[str] = Field(
+        title="Search Queries",
+        description=(
+            "A list of search queries, one per line, to submit using the "
+            "search form found on each input URL. Only works for input URLs "
+            "that support search. May not work on every website."
+        ),
+        default_factory=list,
+        json_schema_extra={
+            "default": [],
+            "widget": "textarea",
+        },
+    )
+
+
 class JobPostingSpiderParams(
     CustomAttrsMethodParam,
     CustomAttrsInputParam,
@@ -79,6 +109,7 @@ class JobPostingSpiderParams(
     MaxRequestsParam,
     GeolocationParam,
     JobPostingCrawlStrategyParam,
+    JobPostingSearchQueriesParam,
     UrlsFileParam,
     UrlsParam,
     UrlParam,
@@ -158,9 +189,42 @@ def get_start_request(self, url):
         )
 
     def start_requests(self) -> Iterable[scrapy.Request]:
-        for url in self.start_urls:
+        if self.args.search_queries:
+            for url in self.start_urls:
+                meta: Dict[str, Any] = {
+                    "crawling_logs": {"page_type": "searchRequestTemplate"},
+                }
+                if self.args.extract_from == ExtractFrom.browserHtml:
+                    meta["inject"] = [BrowserResponse]
+                with self._log_request_exception:
+                    yield scrapy.Request(
+                        url=url,
+                        callback=self.parse_search_request_template,
+                        meta=meta,
+                    )
+        else:
+            for url in self.start_urls:
+                with self._log_request_exception:
+                    yield self.get_start_request(url)
+
+    def parse_search_request_template(
+        self,
+        response: DummyResponse,
+        search_request_template: SearchRequestTemplate,
+        dynamic: DynamicDeps,
+    ) -> Iterable[scrapy.Request]:
+        probability = search_request_template.get_probability()
+        if probability is not None and probability <= 0:
+            return
+        for query in self.args.search_queries:
+            meta: Dict[str, Any] = {
+                "crawling_logs": {"page_type": "jobPostingNavigation"},
+            }
             with self._log_request_exception:
-                yield self.get_start_request(url)
+                yield search_request_template.request(query=query).to_scrapy(
+                    callback=self.parse_navigation,
+                    meta=meta,
+                )
 
     def parse_navigation(
         self, response: DummyResponse, navigation: JobPostingNavigation