
Commit

Merge remote-tracking branch 'origin/main' into article-custom-attrs
wRAR committed Jan 16, 2025
2 parents 19065d4 + 5a930c7 commit 43cfa38
Showing 15 changed files with 149 additions and 71 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.11.0
current_version = 0.11.2
commit = True
tag = True
tag_name = {new_version}
11 changes: 11 additions & 0 deletions CHANGES.rst
@@ -1,6 +1,17 @@
Changes
=======

0.11.2 (2024-12-30)
-------------------

* Do not log warning about disabled components.

0.11.1 (2024-12-26)
-------------------

* The :ref:`e-commerce <e-commerce>` and :ref:`job posting <job-posting>`
spider templates no longer ignore item requests for a different domain.

0.11.0 (2024-12-16)
-------------------

2 changes: 1 addition & 1 deletion docs/conf.py
@@ -4,7 +4,7 @@
project = "zyte-spider-templates"
copyright = "2023, Zyte Group Ltd"
author = "Zyte Group Ltd"
release = "0.11.0"
release = "0.11.2"

sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext
extensions = [
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

setup(
name="zyte-spider-templates",
version="0.11.0",
version="0.11.2",
description="Spider templates for automatic crawlers.",
long_description=open("README.rst").read(),
long_description_content_type="text/x-rst",
12 changes: 2 additions & 10 deletions tests/incremental/test_middleware.py
@@ -28,12 +28,8 @@ def test_middleware_init_not_configured():
crawler = crawler_for_incremental()
crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": False})

with pytest.raises(NotConfigured) as exc_info:
with pytest.raises(NotConfigured):
IncrementalCrawlMiddleware(crawler)
assert str(exc_info.value) == (
"IncrementalCrawlMiddleware is not enabled. Set the "
"INCREMENTAL_CRAWL_ENABLED setting to True to enable it."
)


@patch("scrapinghub.ScrapinghubClient")
@@ -59,12 +55,8 @@ def test_prepare_manager_with_collection_fp_failure(caplog):
crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True})

caplog.clear()
with pytest.raises(CloseSpider) as exc_info:
with pytest.raises(CloseSpider):
IncrementalCrawlMiddleware.prepare_incremental_manager(crawler)
assert exc_info.value.reason == "incremental_crawling_middleware_collection_issue"
assert caplog.messages[-1].startswith(
"IncrementalCrawlMiddleware is enabled, but something went wrong with Collections."
)


@patch("scrapinghub.ScrapinghubClient")
77 changes: 53 additions & 24 deletions tests/mockserver.py
@@ -26,30 +26,41 @@ class DefaultResource(Resource):
point the ZYTE_API_URL setting to the mock server. See
``tests/test_ecommerce.py::test_crawl_strategies`` for an example.
This mock server is designed to fake a website with the following pages:
```
https://example.com/
https://example.com/page/2
https://example.com/category/1
https://example.com/category/1/page/2
https://example.com/non-navigation
```
When browserHtml is requested (for any URL, listed above or not), it is
a minimal HTML with an anchor tag pointing to
https://example.com/non-navigation.
When productNavigation is requested, nextPage and subCategories are filled
accordingly. productNavigation.items always has 2 product URLs, which are
the result of appending ``/product/<n>`` to the request URL.
https://example.com/non-navigation is not reachable through
productNavigation.
When product or productList is requested, an item with the current URL is
always returned.
All output also includes unsupported links (mailto:…).
This mock server is designed to fake the following:
- An e-commerce website with the following pages:
```
https://example.com/
https://example.com/page/2
https://example.com/category/1
https://example.com/category/1/page/2
https://example.com/non-navigation
```
When browserHtml is requested (for any URL, listed above or not), it is
a minimal HTML with an anchor tag pointing to
https://example.com/non-navigation.
When productNavigation is requested, nextPage and subCategories are filled
accordingly. productNavigation.items always has 2 product URLs, which are
the result of appending ``/product/<n>`` to the request URL.
https://example.com/non-navigation is not reachable through
productNavigation.
When product or productList is requested, an item with the current URL is
always returned.
All output also includes unsupported links (mailto:…).
- Job-posting websites with the following endpoints:
- https://jobs.example (jobPostingNavigation pointing to the 2 items
below).
- https://jobs.offsite.example/jobs/1 (jobPosting)
- https://jobs.offsite.example/jobs/2 (jobPosting)
"""

def getChild(self, path, request):
@@ -70,6 +81,24 @@ def render_POST(self, request):

response_data["url"] = request_data["url"]

if request_data["url"] == "https://jobs.example":
assert request_data["jobPostingNavigation"] is True
response_data["jobPostingNavigation"] = {
"url": request_data["url"],
"items": [
{"url": "https://jobs.offsite.example/jobs/1"},
{"url": "https://jobs.offsite.example/jobs/2"},
],
}
return json.dumps(response_data).encode()

if request_data["url"].startswith("https://jobs.offsite.example/"):
assert request_data["jobPosting"] is True
response_data["jobPosting"] = {
"url": request_data["url"],
}
return json.dumps(response_data).encode()

non_navigation_url = "https://example.com/non-navigation"
html = f"""<html><body><a href="{non_navigation_url}"></a><a href="mailto:[email protected]"></a></body></html>"""
if request_data.get("browserHtml", False) is True:
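For reference, the new job-posting branch above answers a Zyte-API-style POST with a canned jobPostingNavigation payload. A minimal sketch of exercising it directly with requests (the base URL is a placeholder here; in the test suite it would come from the mockserver fixture's urljoin("/"), and the real payload sent by scrapy-zyte-api carries more fields):

```python
import json

import requests

# Placeholder base URL; the tests use mockserver.urljoin("/") instead.
api_url = "http://127.0.0.1:8080/"
payload = {"url": "https://jobs.example", "jobPostingNavigation": True}
response = requests.post(api_url, data=json.dumps(payload))
navigation = response.json()["jobPostingNavigation"]
# The mock always lists the two off-site job postings defined above.
assert [item["url"] for item in navigation["items"]] == [
    "https://jobs.offsite.example/jobs/1",
    "https://jobs.offsite.example/jobs/2",
]
```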
16 changes: 10 additions & 6 deletions tests/test_addon.py
@@ -19,10 +19,14 @@
BASELINE_SETTINGS = _crawler.settings.copy_to_dict()

try:
from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware
from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware # noqa: F401
except ImportError:
from scrapy.spidermiddlewares.offsite import ( # type: ignore[assignment]
OffsiteMiddleware,
BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH = (
"scrapy.spidermiddlewares.offsite.OffsiteMiddleware"
)
else:
BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH = (
"scrapy.downloadermiddlewares.offsite.OffsiteMiddleware"
)


@@ -67,8 +71,8 @@ def _test_setting_changes(initial_settings, expected_settings):
"CLOSESPIDER_TIMEOUT_NO_ITEM": 600,
"DOWNLOADER_MIDDLEWARES": {
MaxRequestsPerSeedDownloaderMiddleware: 100,
OffsiteMiddleware: None,
AllowOffsiteMiddleware: 500,
BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH: None,
AllowOffsiteMiddleware: 50,
},
"SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue",
"SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue",
@@ -128,7 +132,7 @@ def test_poet_setting_changes_since_scrapy_2_11_2(initial_settings, expected_set
OffsiteRequestsPerSeedMiddleware: 49,
OnlyFeedsMiddleware: 108,
TrackNavigationDepthSpiderMiddleware: 110,
OffsiteMiddleware: None,
BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH: None,
AllowOffsiteMiddleware: 500,
TrackSeedsSpiderMiddleware: 550,
CrawlingLogsMiddleware: 1000,
26 changes: 26 additions & 0 deletions tests/test_job_posting.py
@@ -5,6 +5,8 @@
import requests
import scrapy
from pydantic import ValidationError
from pytest_twisted import ensureDeferred
from scrapy import signals
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import get_spider_metadata
from zyte_common_items import JobPosting, JobPostingNavigation, ProbabilityRequest
@@ -602,3 +604,27 @@ def test_urls_file():
assert start_requests[0].url == "https://a.example"
assert start_requests[1].url == "https://b.example"
assert start_requests[2].url == "https://c.example"


@ensureDeferred
async def test_offsite(mockserver):
settings = {
"ZYTE_API_URL": mockserver.urljoin("/"),
"ZYTE_API_KEY": "a",
"ADDONS": {
"scrapy_zyte_api.Addon": 500,
"zyte_spider_templates.Addon": 1000,
},
}
crawler = get_crawler(settings=settings, spider_cls=JobPostingSpider)
actual_output = set()

def track_item(item, response, spider):
actual_output.add(item.url)

crawler.signals.connect(track_item, signal=signals.item_scraped)
await crawler.crawl(url="https://jobs.example")
assert actual_output == {
"https://jobs.offsite.example/jobs/1",
"https://jobs.offsite.example/jobs/2",
}
28 changes: 28 additions & 0 deletions tests/test_serp.py
@@ -1,6 +1,7 @@
from urllib.parse import quote_plus

import pytest
from pydantic import ValidationError
from scrapy import Request
from scrapy_spider_metadata import get_spider_metadata
from scrapy_zyte_api.responses import ZyteAPITextResponse
@@ -288,6 +289,7 @@ def test_metadata():
{"type": "null"},
],
"description": "Input 1 search query per line (e.g. foo bar).",
"pattern": r"(.|\r?\n)*\S+(.|\r?\n)*",
"title": "Search Queries",
"widget": "textarea",
},
@@ -764,3 +766,29 @@ def test_item_type_mappings():

# Also ensure that no dict value is repeated.
assert len(actual_keys) == len(set(ITEM_TYPE_CLASSES.values()))


@pytest.mark.parametrize(
"input_data,raises",
[
({"search_queries": "foo"}, False),
({"search_queries": "foo "}, False),
({"search_queries": " foo "}, False),
({"search_queries": " fo o "}, False),
({"search_queries": "fo o"}, False),
({"search_queries": "fo\n o "}, False),
({"search_queries": ["fo", " o "]}, False),
({"search_queries": ["fo", " "]}, False),
({"search_queries": " "}, True),
({"search_queries": ""}, True),
({"search_queries": " "}, True),
({"search_queries": " \n "}, True),
({"search_queries": [" ", " "]}, True),
],
)
def test_query_validation(input_data, raises):
if raises:
with pytest.raises(ValidationError):
GoogleSearchSpider(**input_data)
else:
GoogleSearchSpider(**input_data)
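The pattern added to the search-queries field metadata above effectively requires at least one non-whitespace character, which is what these parametrized cases exercise through the spider. A quick standalone check of just the raw regex (independent of how pydantic applies it):

```python
import re

SEARCH_QUERIES_PATTERN = r"(.|\r?\n)*\S+(.|\r?\n)*"

# Inputs with at least one non-whitespace character match...
assert re.fullmatch(SEARCH_QUERIES_PATTERN, "foo bar")
assert re.fullmatch(SEARCH_QUERIES_PATTERN, "fo\n o ")
# ...while empty or whitespace-only inputs do not.
assert not re.fullmatch(SEARCH_QUERIES_PATTERN, "")
assert not re.fullmatch(SEARCH_QUERIES_PATTERN, "   ")
assert not re.fullmatch(SEARCH_QUERIES_PATTERN, " \n ")
```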
2 changes: 1 addition & 1 deletion utils/google-gl-updater/requirements.txt
@@ -12,7 +12,7 @@ cssselect==1.2.0
# via parsel
idna==3.10
# via requests
jinja2==3.1.4
jinja2==3.1.5
# via -r requirements.in
jmespath==1.0.1
# via parsel
2 changes: 1 addition & 1 deletion utils/google-hl-updater/requirements.txt
@@ -12,7 +12,7 @@ cssselect==1.2.0
# via parsel
idna==3.10
# via requests
jinja2==3.1.4
jinja2==3.1.5
# via -r requirements.in
jmespath==1.0.1
# via parsel
6 changes: 2 additions & 4 deletions zyte_spider_templates/_addon.py
@@ -52,11 +52,11 @@ def _replace_builtin(

builtin_entry: Optional[Any] = None
for _setting_value in (setting_value, settings[f"{setting}_BASE"]):
if builtin_cls in setting_value:
if builtin_cls in _setting_value:
builtin_entry = builtin_cls
pos = _setting_value[builtin_entry]
break
for cls_or_path in setting_value:
for cls_or_path in _setting_value:
if isinstance(cls_or_path, str):
_cls = load_object(cls_or_path)
if _cls == builtin_cls:
@@ -151,15 +151,13 @@ def update_settings(self, settings: BaseSettings) -> None:
OffsiteMiddleware,
)

_setdefault(settings, "SPIDER_MIDDLEWARES", OffsiteMiddleware, 500)
_replace_builtin(
settings,
"SPIDER_MIDDLEWARES",
OffsiteMiddleware,
AllowOffsiteMiddleware,
)
else:
_setdefault(settings, "DOWNLOADER_MIDDLEWARES", OffsiteMiddleware, 500)
_replace_builtin(
settings,
"DOWNLOADER_MIDDLEWARES",
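The _replace_builtin change above fixes a loop-variable bug: the membership check used setting_value (always the first candidate mapping) instead of _setting_value (the mapping currently being iterated), so a builtin middleware registered only in the *_BASE defaults was never found. A minimal before/after sketch with plain dicts standing in for Scrapy settings (illustrative names, not the addon's API):

```python
class BuiltinOffsite:
    """Stand-in for Scrapy's builtin OffsiteMiddleware."""


def find_priority_buggy(project_setting, base_setting, builtin_cls):
    for mapping in (project_setting, base_setting):
        if builtin_cls in project_setting:  # bug: ignores the loop variable
            return mapping[builtin_cls]
    return None


def find_priority_fixed(project_setting, base_setting, builtin_cls):
    for mapping in (project_setting, base_setting):
        if builtin_cls in mapping:  # fix: check the mapping being iterated
            return mapping[builtin_cls]
    return None


# The builtin is typically registered only in the *_BASE defaults:
assert find_priority_buggy({}, {BuiltinOffsite: 500}, BuiltinOffsite) is None
assert find_priority_fixed({}, {BuiltinOffsite: 500}, BuiltinOffsite) == 500
```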
5 changes: 1 addition & 4 deletions zyte_spider_templates/_incremental/middleware.py
@@ -30,10 +30,7 @@ class IncrementalCrawlMiddleware:
def __init__(self, crawler: Crawler):
assert crawler.spider
if not crawler.spider.settings.getbool("INCREMENTAL_CRAWL_ENABLED", False):
raise NotConfigured(
"IncrementalCrawlMiddleware is not enabled. Set the "
"INCREMENTAL_CRAWL_ENABLED setting to True to enable it."
)
raise NotConfigured
self.inc_manager: IncrementalCrawlingManager = self.prepare_incremental_manager(
crawler
)