Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into param-descrs
Browse files Browse the repository at this point in the history
  • Loading branch information
wRAR committed Oct 11, 2024
2 parents e4fed50 + d01ac8d commit 89c5cf8
Show file tree
Hide file tree
Showing 26 changed files with 1,123 additions and 224 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.8.0
current_version = 0.9.0
commit = True
tag = True
tag_name = {new_version}
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@ jobs:
fail-fast: false
matrix:
include:
- python-version: '3.8'
- python-version: '3.9'
toxenv: min
- python-version: '3.8'
- python-version: '3.9'
- python-version: '3.10'
- python-version: '3.11'
Expand Down
25 changes: 25 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
Changes
=======

0.9.0 (2024-09-17)
------------------

* Now requires ``zyte-common-items >= 0.22.0``.

* New :ref:`Google Search spider template <google-search>`, built on top of
Zyte API’s :http:`request:serp`.

* The heuristics of the :ref:`e-commerce spider template <e-commerce>` to
ignore certain URLs when following category links now also handle
subdomains. For example, https://example.com/blog was already ignored; now
https://blog.example.com is ignored as well.

* In the :ref:`spider parameters JSON schema <params-schema>`, the
:class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.crawl_strategy`
parameter of the :ref:`e-commerce spider template <e-commerce>` switches
position, from being the last parameter to being between
:class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file`
and
:class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.geolocation`.

* Removed the ``valid_page_types`` attribute of
:class:`zyte_spider_templates.middlewares.CrawlingLogsMiddleware`.


0.8.0 (2024-08-21)
------------------

Expand Down
41 changes: 41 additions & 0 deletions docs/_ext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,45 @@
import re

from docutils import nodes
from docutils.parsers.rst.roles import set_classes


def http_api_reference_role(
    name, rawtext, text, lineno, inliner, options=None, content=None
):
    """Build a link into the Zyte API HTTP reference for the ``:http:`` role.

    ``text`` is either a bare reference such as ``request:serp`` or custom
    display text followed by the reference in angle brackets, such as
    ``the SERP field <request:serp>``. The reference must start with
    ``request:`` or ``response:``; the remainder is the field name used both
    in the URL fragment and, when no display text was given, as the link
    text.

    :param name: role name as used in the document (unused here).
    :param rawtext: full raw markup of the role, stored on the node.
    :param text: interpreted text content of the role.
    :param lineno: line number of the role in the input (unused here).
    :param inliner: inliner object that invoked the role (unused here).
    :param options: optional mapping of role options; it is copied, so the
        caller's mapping is never modified.
    :param content: optional directive content (unused here).
    :returns: a two-item tuple: a list with the single reference node, and an
        empty list of system messages.
    :raises ValueError: if the reference does not start with ``request:`` or
        ``response:``.
    """
    # Work on a private copy: set_classes() rewrites the mapping in place, and
    # a shared mutable default (or the caller's dict) must not be altered.
    options = dict(options) if options else {}

    match = re.search(
        r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text
    )
    if match:
        display_text = match[1]
        reference = match[2]
    else:
        display_text = None
        reference = text

    if reference.startswith("request:"):
        request_or_response = "request"
    elif reference.startswith("response:"):
        # Response fields are anchored under the 200 response in the
        # reference page.
        request_or_response = "response/200"
    else:
        raise ValueError(
            ":http: role reference must start with request: or "
            f"response:, got {reference} from {text!r}."
        )

    field = reference.split(":", maxsplit=1)[1]
    if not display_text:
        display_text = field
    refuri = (
        "https://docs.zyte.com/zyte-api/usage/reference.html"
        f"#operation/extract/{request_or_response}/{field}"
    )
    set_classes(options)
    node = nodes.reference(rawtext, display_text, refuri=refuri, **options)
    return [node], []


def setup(app):
app.add_role("http", http_api_reference_role)
# https://stackoverflow.com/a/13663325
#
# Scrapy’s
Expand Down
9 changes: 8 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
project = "zyte-spider-templates"
copyright = "2023, Zyte Group Ltd"
author = "Zyte Group Ltd"
release = "0.8.0"
release = "0.9.0"

sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext
extensions = [
Expand Down Expand Up @@ -34,6 +34,10 @@
"https://scrapy-poet.readthedocs.io/en/stable",
None,
),
"scrapy-spider-metadata": (
"https://scrapy-spider-metadata.readthedocs.io/en/latest",
None,
),
"scrapy-zyte-api": (
"https://scrapy-zyte-api.readthedocs.io/en/stable",
None,
Expand All @@ -48,8 +52,11 @@
),
}

autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_show_validator_summary = False

# sphinx-reredirects
redirects = {
Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ zyte-spider-templates documentation

templates/index
E-commerce <templates/e-commerce>
Google search <templates/google-search>

.. toctree::
:caption: Customization
Expand Down
5 changes: 5 additions & 0 deletions docs/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Spiders

.. autoclass:: zyte_spider_templates.EcommerceSpider

.. autoclass:: zyte_spider_templates.GoogleSearchSpider


Pages
=====
Expand Down Expand Up @@ -41,3 +43,6 @@ Parameter mixins
:exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
:exclude-members: model_computed_fields
2 changes: 1 addition & 1 deletion docs/setup.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ configured on an existing Scrapy_ project.
Requirements
============

- Python 3.8+
- Python 3.9+

- Scrapy 2.11+

Expand Down
19 changes: 19 additions & 0 deletions docs/templates/google-search.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
.. _google-search:

=================================================
Google search spider template (``google_search``)
=================================================

Basic use
=========

.. code-block:: shell

    scrapy crawl google_search -a search_queries="foo bar"

Parameters
==========

.. autopydantic_model:: zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams
:inherited-members: BaseModel
:exclude-members: model_computed_fields
3 changes: 3 additions & 0 deletions docs/templates/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@ Spider template list

:ref:`E-commerce <e-commerce>`
Get products from an e-commerce website.

:ref:`Google Search <google-search>`
Get Google search results.
13 changes: 6 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="zyte-spider-templates",
version="0.8.0",
version="0.9.0",
description="Spider templates for automatic crawlers.",
long_description=open("README.rst").read(),
long_description_content_type="text/x-rst",
Expand All @@ -12,21 +12,20 @@
packages=find_packages(),
include_package_data=True,
install_requires=[
"pydantic>=2",
"pydantic>=2.1",
"requests>=0.10.1",
"scrapy>=2.11.0",
"scrapy-poet>=0.21.0",
"scrapy-spider-metadata>=0.1.2",
"scrapy-zyte-api[provider]>=0.16.0",
"zyte-common-items>=0.13.0",
"scrapy-poet>=0.24.0",
"scrapy-spider-metadata>=0.2.0",
"scrapy-zyte-api[provider]>=0.23.0",
"zyte-common-items>=0.23.0",
],
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
Expand Down
4 changes: 4 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from typing import Any, Dict, Optional

import pytest
from scrapy.utils.test import TestSpider

# https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting
pytest.register_assert_rewrite("tests.utils")


# scrapy.utils.test.get_crawler alternative that does not freeze settings.
def get_crawler(*, settings: Optional[Dict[str, Any]] = None):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@

def test_deprecation():
with pytest.deprecated_call(match="^BaseSpiderParams is deprecated.*"):
BaseSpiderParams(url="https://example.com")
BaseSpiderParams(url="https://example.com") # type: ignore[call-arg]
Loading

0 comments on commit 89c5cf8

Please sign in to comment.