Refactor new tests.
wRAR committed Mar 6, 2025
1 parent bc1e0b9 commit c6ef241
Showing 3 changed files with 59 additions and 78 deletions.
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -46,7 +46,7 @@ async def go(


 @pytest_twisted.async_fixture(scope="module")
-async def zyte_api_server(aiohttp_server):
+async def zyte_api_server(aiohttp_server) -> TestServer:
     from fake_zyte_api.main import make_app

     app = make_app()
102 changes: 25 additions & 77 deletions tests/test_job_posting.py
@@ -11,7 +11,7 @@
 from itemadapter import ItemAdapter
 from pydantic import ValidationError
 from scrapy import signals
-from scrapy.utils.defer import deferred_f_from_coro_f, deferred_to_future
+from scrapy.utils.defer import deferred_f_from_coro_f
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import get_spider_metadata
 from zyte_common_items import JobPosting, JobPostingNavigation, ProbabilityRequest
@@ -28,7 +28,7 @@

 from . import get_crawler
 from .test_utils import URL_TO_DOMAIN
-from .utils import assertEqualSpiderMetadata, get_addons
+from .utils import assertEqualSpiderMetadata, crawl_fake_zyte_api, get_addons

 if TYPE_CHECKING:
     from aiohttp.pytest_plugin import AiohttpServer
@@ -645,96 +645,52 @@ def track_item(item, response, spider):

 @deferred_f_from_coro_f
 async def test_extract_jobs(zyte_api_server, jobs_website):
-    settings = {
-        "ZYTE_API_URL": str(zyte_api_server.make_url("/")),
-        "ZYTE_API_KEY": "a",
-        "ADDONS": get_addons(),
-    }
-    crawler = get_crawler(settings=settings, spider_cls=JobPostingSpider)
-    items = []
-
-    def track_item(item, response, spider):
-        items.append(item)
-
-    crawler.signals.connect(track_item, signal=signals.item_scraped)
-    await deferred_to_future(
-        crawler.crawl(url=str(jobs_website.make_url("/jobs/4")), max_requests=1000)
+    items = await crawl_fake_zyte_api(
+        zyte_api_server,
+        JobPostingSpider,
+        {"url": str(jobs_website.make_url("/jobs/4")), "max_requests": 1000},
     )

     assert len(items) == 109
     assert len(set(item.url for item in items)) == len(items)
     assert len(set(item.jobPostingId for item in items)) == len(items)


 @deferred_f_from_coro_f
 async def test_extract_jobs_url_list(zyte_api_server, jobs_website):
-    settings = {
-        "ZYTE_API_URL": str(zyte_api_server.make_url("/")),
-        "ZYTE_API_KEY": "a",
-        "ADDONS": get_addons(),
-    }
-    crawler = get_crawler(settings=settings, spider_cls=JobPostingSpider)
-    items = []
-
-    def track_item(item, response, spider):
-        items.append(item)
-
-    crawler.signals.connect(track_item, signal=signals.item_scraped)
-    await deferred_to_future(
-        crawler.crawl(
-            urls="\n".join(
+    items = await crawl_fake_zyte_api(
+        zyte_api_server,
+        JobPostingSpider,
+        {
+            "urls": "\n".join(
                 [
                     str(jobs_website.make_url("/jobs/1")),
                     str(jobs_website.make_url("/jobs/4")),
                 ]
             ),
-            max_requests=1000,
-        )
+            "max_requests": 1000,
+        },
     )

     assert len(items) == 5 + 109
     assert len(set(item.url for item in items)) == len(items)
     assert len(set(item.jobPostingId for item in items)) == len(items)


 @deferred_f_from_coro_f
 async def test_extract_jobs_max_reqs(zyte_api_server, jobs_website):
-    settings = {
-        "ZYTE_API_URL": str(zyte_api_server.make_url("/")),
-        "ZYTE_API_KEY": "a",
-        "ADDONS": get_addons(),
-    }
-    crawler = get_crawler(settings=settings, spider_cls=JobPostingSpider)
-    items = []
-
-    def track_item(item, response, spider):
-        items.append(item)
-
-    crawler.signals.connect(track_item, signal=signals.item_scraped)
-    await deferred_to_future(
-        crawler.crawl(url=str(jobs_website.make_url("/jobs/4")), max_requests=20)
+    items = await crawl_fake_zyte_api(
+        zyte_api_server,
+        JobPostingSpider,
+        {"url": str(jobs_website.make_url("/jobs/4")), "max_requests": 20},
     )

     assert len(items) < 20


 @deferred_f_from_coro_f
 async def test_extract_direct_item(zyte_api_server, jobs_website):
-    settings = {
-        "ZYTE_API_URL": str(zyte_api_server.make_url("/")),
-        "ZYTE_API_KEY": "a",
-        "ADDONS": get_addons(),
-    }
-    crawler = get_crawler(settings=settings, spider_cls=JobPostingSpider)
-    items = []
-
-    def track_item(item, response, spider):
-        items.append(item)
-
-    crawler.signals.connect(track_item, signal=signals.item_scraped)
     url = str(jobs_website.make_url("/job/1888448280485890"))
-    await deferred_to_future(crawler.crawl(url=url, crawl_strategy="direct_item"))
-
+    items = await crawl_fake_zyte_api(
+        zyte_api_server, JobPostingSpider, {"url": url, "crawl_strategy": "direct_item"}
+    )
     assert len(items) == 1
     descr = (
         "Family Law Attorneys deal with legal matters related to family"
@@ -763,17 +719,9 @@ def track_item(item, response, spider):

 @deferred_f_from_coro_f
 async def test_extract_jobs_404(zyte_api_server, jobs_website):
-    settings = {
-        "ZYTE_API_URL": str(zyte_api_server.make_url("/")),
-        "ZYTE_API_KEY": "a",
-        "ADDONS": get_addons(),
-    }
-    crawler = get_crawler(settings=settings, spider_cls=JobPostingSpider)
-    items = []
-
-    def track_item(item, response, spider):
-        items.append(item)
-
-    crawler.signals.connect(track_item, signal=signals.item_scraped)
-    await deferred_to_future(crawler.crawl(url=str(jobs_website.make_url("/jobs/foo"))))
+    items = await crawl_fake_zyte_api(
+        zyte_api_server,
+        JobPostingSpider,
+        {"url": str(jobs_website.make_url("/jobs/foo"))},
+    )
     assert not items
33 changes: 33 additions & 0 deletions tests/utils.py
@@ -1,6 +1,13 @@
 from __future__ import annotations

 import json
+from typing import Any
+
+from aiohttp.test_utils import TestServer
+from scrapy import Spider, signals
+from scrapy.utils.defer import deferred_to_future
+
+from . import get_crawler


 def assertEqualSpiderMetadata(actual, expected):
@@ -33,3 +40,29 @@ def get_addons() -> dict[str | type, int]:
     else:
         addons[Addon] = 300
     return addons
+
+
+def get_zyte_api_settings(zyte_api_server) -> dict[str, Any]:
+    return {
+        "ZYTE_API_URL": str(zyte_api_server.make_url("/")),
+        "ZYTE_API_KEY": "a",
+        "ADDONS": get_addons(),
+    }
+
+
+async def crawl_fake_zyte_api(
+    zyte_api_server: TestServer,
+    spider_cls: type[Spider],
+    spider_kwargs: dict[str, Any],
+    settings: dict[str, Any] | None = None,
+):
+    settings = {**get_zyte_api_settings(zyte_api_server), **(settings or {})}
+    crawler = get_crawler(settings=settings, spider_cls=spider_cls)
+    items = []
+
+    def track_item(item, response, spider):
+        items.append(item)
+
+    crawler.signals.connect(track_item, signal=signals.item_scraped)
+    await deferred_to_future(crawler.crawl(**spider_kwargs))
+    return items
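
None of the refactored tests above exercise the helper's optional settings argument, which crawl_fake_zyte_api merges over the get_zyte_api_settings() defaults. A minimal usage sketch under stated assumptions: the JobPostingSpider import path is assumed (it is not shown in this diff), the zyte_api_server and jobs_website fixtures are reused from conftest, and the CONCURRENT_REQUESTS override is purely illustrative.

    from scrapy.utils.defer import deferred_f_from_coro_f

    from zyte_spider_templates import JobPostingSpider  # assumed import path

    from .utils import crawl_fake_zyte_api


    @deferred_f_from_coro_f
    async def test_extract_jobs_custom_settings(zyte_api_server, jobs_website):
        items = await crawl_fake_zyte_api(
            zyte_api_server,
            JobPostingSpider,
            {"url": str(jobs_website.make_url("/jobs/4")), "max_requests": 1000},
            # Per-test overrides are merged over the fake-Zyte-API defaults
            # supplied by get_zyte_api_settings().
            settings={"CONCURRENT_REQUESTS": 1},
        )
        assert items

Keeping the default settings inside the helper and exposing only an override dict is what lets each test shrink to a single call while still allowing per-test tuning.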
