Add search query support to the job posting spider. #115

Open · wants to merge 6 commits into base: main

Changes from all commits
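
As a rough sketch of the behavior this pull request adds, inferred from the tests and spider changes below (the import path comes from the diff; the use of scrapy.utils.test.get_crawler() in place of the test suite's own helper is an assumption of this sketch):

# Illustrative sketch only, inferred from the tests in this PR.
from scrapy.utils.test import get_crawler

from zyte_spider_templates.spiders.job_posting import JobPostingSpider

crawler = get_crawler()
# A newline-separated string is split into one search query per line;
# a list of strings is also accepted.
spider = JobPostingSpider.from_crawler(
    crawler,
    url="https://example.com",
    search_queries="design\nengineering",
)
assert spider.args.search_queries == ["design", "engineering"]

# With search queries set, the first request targets the input URL's search
# form (via a search request template) instead of its navigation pages.
request = next(iter(spider.start_requests()))
assert request.callback == spider.parse_search_request_template
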
136 changes: 135 additions & 1 deletion tests/test_job_posting.py
@@ -14,7 +14,14 @@
from scrapy.utils.defer import deferred_f_from_coro_f
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import get_spider_metadata
from web_poet import BrowserResponse
from zyte_common_items import (
JobPosting,
JobPostingNavigation,
ProbabilityRequest,
SearchRequestTemplate,
SearchRequestTemplateMetadata,
)

from zyte_spider_templates._geolocations import (
GEOLOCATION_OPTIONS,
@@ -338,6 +345,19 @@ def test_metadata():
"title": "URLs file",
"type": "string",
},
"search_queries": {
"default": [],
"description": (
"A list of search queries, one per line, to submit "
"using the search form found on each input URL. Only "
"works for input URLs that support search. May not "
"work on every website."
),
"items": {"type": "string"},
"title": "Search Queries",
"type": "array",
"widget": "textarea",
},
"crawl_strategy": {
"default": "navigation",
"description": (
@@ -643,6 +663,85 @@ def track_item(item, response, spider):
}


def test_search_queries():
crawler = get_crawler()
url = "https://example.com"

spider = JobPostingSpider.from_crawler(crawler, url=url, search_queries="foo bar")
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert start_requests[0].url == url
assert start_requests[0].callback == spider.parse_search_request_template
assert spider.args.search_queries == ["foo bar"]

spider = JobPostingSpider.from_crawler(crawler, url=url, search_queries="foo\nbar")
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert start_requests[0].url == url
assert start_requests[0].callback == spider.parse_search_request_template
assert spider.args.search_queries == ["foo", "bar"]

spider = JobPostingSpider.from_crawler(
crawler, url=url, search_queries=["foo", "bar"]
)
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert start_requests[0].url == url
assert start_requests[0].callback == spider.parse_search_request_template
assert spider.args.search_queries == ["foo", "bar"]


def test_search_queries_extract_from():
crawler = get_crawler()
url = "https://example.com"

spider = JobPostingSpider.from_crawler(crawler, url=url, search_queries="foo")
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert "inject" not in start_requests[0].meta

spider = JobPostingSpider.from_crawler(
crawler, url=url, search_queries="foo", extract_from="httpResponseBody"
)
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert "inject" not in start_requests[0].meta

spider = JobPostingSpider.from_crawler(
crawler, url=url, search_queries="foo", extract_from="browserHtml"
)
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert start_requests[0].meta["inject"] == [BrowserResponse]


@pytest.mark.parametrize(
("probability", "yields_items"),
(
(None, True), # Default
(-1.0, False),
(0.0, False), # page.no_item_found()
(1.0, True),
),
)
def test_parse_search_request_template_probability(probability, yields_items):
crawler = get_crawler()
spider = JobPostingSpider.from_crawler(
crawler, url="https://example.com", search_queries="foo"
)
search_request_template = SearchRequestTemplate(url="https://example.com")
if probability is not None:
search_request_template.metadata = SearchRequestTemplateMetadata(
probability=probability
)
items = list(
spider.parse_search_request_template(
DummyResponse("https://example.com"), search_request_template, DynamicDeps()
)
)
assert items if yields_items else not items


@deferred_f_from_coro_f
async def test_extract_jobs(zyte_api_server, jobs_website):
items = await crawl_fake_zyte_api(
@@ -725,3 +824,38 @@ async def test_extract_jobs_404(zyte_api_server, jobs_website):
{"url": str(jobs_website.make_url("/jobs/foo"))},
)
assert not items


@deferred_f_from_coro_f
async def test_extract_search(zyte_api_server, jobs_website):
items = await crawl_fake_zyte_api(
zyte_api_server,
JobPostingSpider,
{
"url": str(jobs_website.make_url("/")),
"search_queries": "dEsIgn",
"max_requests": 10000,
},
)
assert len(items) == 437
Contributor: I'm curious - how long does it take to run this test?



@deferred_f_from_coro_f
async def test_extract_search_empty(zyte_api_server, jobs_website):
items = await crawl_fake_zyte_api(
zyte_api_server,
JobPostingSpider,
{"url": str(jobs_website.make_url("/")), "search_queries": "does-not-exist"},
)
assert not items


@deferred_f_from_coro_f
async def test_extract_search_no_form(zyte_api_server, jobs_website, caplog):
items = await crawl_fake_zyte_api(
zyte_api_server,
JobPostingSpider,
{"url": str(jobs_website.make_url("/jobs/1")), "search_queries": "foo"},
)
assert not items
assert "Cannot build a search request template" in caplog.text
4 changes: 2 additions & 2 deletions tox.ini
@@ -7,8 +7,8 @@ deps =
pytest-cov
pytest-twisted
freezegun
-zyte-test-websites @ git+https://github.com/zytedata/zyte-test-websites@1172d5d
-fake-zyte-api @ git+https://github.com/zytedata/fake-zyte-api@5ba47df
+zyte-test-websites @ git+https://github.com/zytedata/zyte-test-websites@d9cb2f8
+fake-zyte-api @ git+https://github.com/zytedata/fake-zyte-api@f598933
commands =
py.test \
--cov-report=html:coverage-html \
70 changes: 67 additions & 3 deletions zyte_spider_templates/spiders/job_posting.py
@@ -1,18 +1,30 @@
from __future__ import annotations

from enum import Enum
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Union,
cast,
)

import scrapy
from pydantic import BaseModel, ConfigDict, Field
from scrapy.crawler import Crawler
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import Args
from web_poet import BrowserResponse
from zyte_common_items import (
CustomAttributes,
JobPosting,
JobPostingNavigation,
ProbabilityRequest,
SearchRequestTemplate,
)

from zyte_spider_templates.spiders.base import (
@@ -25,9 +37,11 @@
from ..params import (
CustomAttrsInputParam,
CustomAttrsMethodParam,
ExtractFrom,
ExtractFromParam,
GeolocationParam,
MaxRequestsParam,
SearchQueriesParam,
UrlParam,
UrlsFileParam,
UrlsParam,
@@ -72,13 +86,30 @@ class JobPostingCrawlStrategyParam(BaseModel):
)


class JobPostingSearchQueriesParam(SearchQueriesParam):
search_queries: List[str] = Field(
title="Search Queries",
description=(
"A list of search queries, one per line, to submit using the "
"search form found on each input URL. Only works for input URLs "
"that support search. May not work on every website."
),
default_factory=list,
json_schema_extra={
"default": [],
"widget": "textarea",
},
)


class JobPostingSpiderParams(
CustomAttrsMethodParam,
CustomAttrsInputParam,
ExtractFromParam,
MaxRequestsParam,
GeolocationParam,
JobPostingCrawlStrategyParam,
JobPostingSearchQueriesParam,
UrlsFileParam,
UrlsParam,
UrlParam,
Expand Down Expand Up @@ -158,9 +189,42 @@ def get_start_request(self, url):
)

def start_requests(self) -> Iterable[scrapy.Request]:
if self.args.search_queries:
for url in self.start_urls:
meta: Dict[str, Any] = {
"crawling_logs": {"page_type": "searchRequestTemplate"},
}
if self.args.extract_from == ExtractFrom.browserHtml:
meta["inject"] = [BrowserResponse]
with self._log_request_exception:
yield scrapy.Request(
url=url,
callback=self.parse_search_request_template,
meta=meta,
)
else:
for url in self.start_urls:
with self._log_request_exception:
yield self.get_start_request(url)

def parse_search_request_template(
self,
response: DummyResponse,
search_request_template: SearchRequestTemplate,
dynamic: DynamicDeps,
) -> Iterable[scrapy.Request]:
probability = search_request_template.get_probability()
if probability is not None and probability <= 0:
return
for query in self.args.search_queries:
meta: Dict[str, Any] = {
"crawling_logs": {"page_type": "jobPostingNavigation"},
}
with self._log_request_exception:
yield search_request_template.request(query=query).to_scrapy(
callback=self.parse_navigation,
meta=meta,
)

def parse_navigation(
self, response: DummyResponse, navigation: JobPostingNavigation