diff --git a/setup.py b/setup.py
index 64532cb..1fbe099 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,7 @@
         "scrapy-poet>=0.24.0",
         "scrapy-spider-metadata>=0.2.0",
         "scrapy-zyte-api[provider]>=0.24.0",
+        "text-unidecode>=1.3.0",
         "web-poet>=0.17.1",
         "xtractmime>=0.2.1",
         "zyte-common-items>=0.26.2",
diff --git a/tests/__init__.py b/tests/__init__.py
index df7b124..e57b600 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,3 +1,5 @@
+import contextlib
+import os
 from typing import Any, Dict, Optional, Type
 
 import pytest
@@ -20,3 +22,15 @@ def get_crawler(
     runner = CrawlerRunner(settings)
     crawler = runner.create_crawler(spider_cls)
     return crawler
+
+
+# https://stackoverflow.com/a/34333710
+@contextlib.contextmanager
+def set_env(**environ):
+    old_environ = dict(os.environ)
+    os.environ.update(environ)
+    try:
+        yield
+    finally:
+        os.environ.clear()
+        os.environ.update(old_environ)
diff --git a/tests/incremental/__init__.py b/tests/incremental/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/incremental/test_collection_fp_manager.py b/tests/incremental/test_collection_fp_manager.py
index c6cdcf2..8823f00 100644
--- a/tests/incremental/test_collection_fp_manager.py
+++ b/tests/incremental/test_collection_fp_manager.py
@@ -1,15 +1,22 @@
 from asyncio import ensure_future
+from logging import WARNING
 from unittest.mock import MagicMock, patch
 
 import pytest
+from scrapy import Spider
 from scrapy.statscollectors import StatsCollector
 from scrapy.utils.request import RequestFingerprinter
+from scrapy.utils.test import get_crawler as _get_crawler
 from twisted.internet.defer import Deferred, inlineCallbacks
 
-from tests import get_crawler
-from zyte_spider_templates._incremental.manager import CollectionsFingerprintsManager
+from zyte_spider_templates._incremental.manager import (
+    CollectionsFingerprintsManager,
+    _get_collection_name,
+)
 from zyte_spider_templates.spiders.article import ArticleSpider
 
+from .. import get_crawler, set_env
+
 
 @pytest.fixture
 def mock_crawler():
@@ -207,3 +214,67 @@ def test_spider_closed(mock_scrapinghub_client):
     fp_manager.save_batch = MagicMock(side_effect=fp_manager.save_batch)  # type: ignore
     fp_manager.spider_closed()
     fp_manager.save_batch.assert_called_once()
+
+
+@pytest.mark.parametrize(
+    ("env_vars", "settings", "spider_name", "collection_name", "warnings"),
+    (
+        # INCREMENTAL_CRAWL_COLLECTION_NAME > SHUB_VIRTUAL_SPIDER > Spider.name
+        # INCREMENTAL_CRAWL_COLLECTION_NAME is used as is; the others are
+        # slugified, length-limited, and given an "_incremental" suffix.
+        (
+            {},
+            {},
+            "a A-1.α" + "a" * 2048,
+            "a_A_1_a" + "a" * (2048 - len("a_A_1_a_incremental")) + "_incremental",
+            ["without defining a specific collection name"],
+        ),
+        (
+            {"SHUB_VIRTUAL_SPIDER": "a A-1.α" + "a" * 2048},
+            {},
+            "foo",
+            "a_A_1_a" + "a" * (2048 - len("a_A_1_a_incremental")) + "_incremental",
+            ["without defining a specific collection name"],
+        ),
+        (
+            {"SHUB_VIRTUAL_SPIDER": "bar"},
+            {"INCREMENTAL_CRAWL_COLLECTION_NAME": "a A-1.α" + "a" * 2048},
+            "foo",
+            "a A-1.α" + "a" * 2048,
+            [],
+        ),
+        (
+            {},
+            {},
+            "a_A_1_a" + "a" * (2048 - len("a_A_1_a_incremental")),
+            "a_A_1_a" + "a" * (2048 - len("a_A_1_a_incremental")) + "_incremental",
+            [],
+        ),
+        (
+            {},
+            {},
+            "a_A_1_a" + "a" * (2048 - len("a_A_1_a_incremental") + 1),
+            "a_A_1_a" + "a" * (2048 - len("a_A_1_a_incremental")) + "_incremental",
+            ["without defining a specific collection name"],
+        ),
+    ),
+)
+def test_collection_name(
+    env_vars, settings, spider_name, collection_name, warnings, caplog
+):
+    class TestSpider(Spider):
+        name = spider_name
+
+    crawler = _get_crawler(settings_dict=settings, spidercls=TestSpider)
+    crawler.spider = TestSpider()
+
+    caplog.clear()
+    with caplog.at_level(WARNING):
+        with set_env(**env_vars):
+            assert _get_collection_name(crawler) == collection_name
+
+    if warnings:
+        for warning in warnings:
+            assert warning in caplog.text
+    else:
+        assert not caplog.records
diff --git a/tox.ini b/tox.ini
index 2a65da7..523441f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -34,6 +34,7 @@ deps =
     scrapy-poet==0.24.0
     scrapy-spider-metadata==0.2.0
     scrapy-zyte-api[provider]==0.24.0
+    text-unidecode==1.3.0
     web-poet==0.17.1
     xtractmime==0.2.1
     zyte-common-items==0.26.2
diff --git a/zyte_spider_templates/_incremental/manager.py b/zyte_spider_templates/_incremental/manager.py
index 6e4ff28..49ca505 100644
--- a/zyte_spider_templates/_incremental/manager.py
+++ b/zyte_spider_templates/_incremental/manager.py
@@ -1,5 +1,6 @@
 import asyncio
 import logging
+import re
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, List, Optional, Set, Tuple, Union
@@ -10,6 +11,7 @@
 from scrapy import signals
 from scrapy.crawler import Crawler
 from scrapy.http.request import Request
+from text_unidecode import unidecode
 from zyte_common_items import Item
 
 from zyte_spider_templates.utils import (
@@ -22,11 +24,35 @@
 logger = logging.getLogger(__name__)
 
 INCREMENTAL_SUFFIX = "_incremental"
+_MAX_LENGTH = 2048 - len(INCREMENTAL_SUFFIX)
 COLLECTION_API_URL = "https://storage.scrapinghub.com/collections"
 
 THREAD_POOL_EXECUTOR = ThreadPoolExecutor(max_workers=10)
 
 
+def _get_collection_name(crawler: Crawler) -> str:
+    if name := crawler.settings.get("INCREMENTAL_CRAWL_COLLECTION_NAME"):
+        return name
+    original_spider_name = get_spider_name(crawler)
+    mangled_spider_name = unidecode(original_spider_name)
+    mangled_spider_name = re.sub(r"[^a-zA-Z0-9_]", "_", mangled_spider_name)
+    mangled_spider_name = mangled_spider_name.rstrip("_")
+    mangled_spider_name = mangled_spider_name[:_MAX_LENGTH]
+    name = mangled_spider_name + INCREMENTAL_SUFFIX
+    if mangled_spider_name != original_spider_name:
+        logger.warning(
+            f"You enabled incremental crawling without defining a specific "
+            f"collection name. As a result, the spider name "
+            f"({original_spider_name!r}) was mangled as "
+            f"{mangled_spider_name!r} to generate a name for the collection "
+            f"to use: {name!r}. Consider setting that or a different "
+            f"collection name explicitly, either through spider parameters "
+            f"(if supported) or through the INCREMENTAL_CRAWL_COLLECTION_NAME "
+            f"setting."
+        )
+    return name
+
+
 class CollectionsFingerprintsManager:
     def __init__(self, crawler: Crawler) -> None:
         self.writer = None
@@ -37,7 +63,7 @@ def __init__(self, crawler: Crawler) -> None:
         self.batch_size = crawler.settings.getint("INCREMENTAL_CRAWL_BATCH_SIZE", 50)
 
         project_id = get_project_id(crawler)
-        collection_name = self.get_collection_name(crawler)
+        collection_name = _get_collection_name(crawler)
         self.init_collection(project_id, collection_name)
         self.api_url = f"{COLLECTION_API_URL}/{project_id}/s/{collection_name}"
 
@@ -51,12 +77,6 @@ def __init__(self, crawler: Crawler) -> None:
 
         crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
 
-    def get_collection_name(self, crawler):
-        return (
-            crawler.settings.get("INCREMENTAL_CRAWL_COLLECTION_NAME")
-            or f"{get_spider_name(crawler)}{INCREMENTAL_SUFFIX}"
-        )
-
     def init_collection(self, project_id, collection_name) -> None:
         client = get_client()
         collection = client.get_project(project_id).collections.get_store(
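
# Illustration, not part of the diff above: a minimal standalone sketch of
# the name mangling that the new _get_collection_name() applies, mirroring
# the first parametrized test case. mangle_spider_name() is a hypothetical
# helper used only for this example; it assumes just that the text-unidecode
# package is installed.
import re

from text_unidecode import unidecode

INCREMENTAL_SUFFIX = "_incremental"
_MAX_LENGTH = 2048 - len(INCREMENTAL_SUFFIX)


def mangle_spider_name(spider_name: str) -> str:
    # Transliterate non-ASCII characters to ASCII, e.g. "α" -> "a".
    name = unidecode(spider_name)
    # Replace every character outside [a-zA-Z0-9_] with "_".
    name = re.sub(r"[^a-zA-Z0-9_]", "_", name)
    # Drop trailing underscores, then truncate so that the final name,
    # suffix included, fits within 2048 characters.
    name = name.rstrip("_")[:_MAX_LENGTH]
    return name + INCREMENTAL_SUFFIX


assert mangle_spider_name("a A-1.α") == "a_A_1_a_incremental"
assert len(mangle_spider_name("a" * 5000)) == 2048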