From ffbe2b72e851a927f93266418709634627d5d568 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 26 Sep 2023 13:43:40 +0400 Subject: [PATCH 1/7] scrapy-spider-metadata support. --- setup.py | 1 + sh_scrapy/commands/shub_image_info.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f84b754..f79a705 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ install_requires=[ 'Scrapy>=1.6', 'scrapinghub>=2.1.0', + 'scrapy-spider-metadata; python_version >= "3.8"' ], entry_points={ 'console_scripts': [ diff --git a/sh_scrapy/commands/shub_image_info.py b/sh_scrapy/commands/shub_image_info.py index 844cfb2..de0160e 100644 --- a/sh_scrapy/commands/shub_image_info.py +++ b/sh_scrapy/commands/shub_image_info.py @@ -2,6 +2,7 @@ from __future__ import print_function import json import subprocess +import sys from scrapy.commands import ScrapyCommand @@ -35,8 +36,21 @@ def add_options(self, parser): def run(self, args, opts): result = { 'project_type': 'scrapy', - 'spiders': sorted(self.crawler_process.spider_loader.list()) + 'spiders': sorted(self.crawler_process.spider_loader.list()), + 'metadata': {}, } + if sys.version_info >= (3, 8): + # scrapy-spider-metadata requires Python 3.8+ + from scrapy_spider_metadata import get_metadata_for_spider + for spider_name in result['spiders']: + spider_cls = self.crawler_process.spider_loader.load(spider_name) + metadata_dict = get_metadata_for_spider(spider_cls) + try: + # make sure it's serializable + json.dumps(metadata_dict) + except (TypeError, ValueError): + continue + result['metadata'] = metadata_dict if opts.debug: output = subprocess.check_output( ['bash', '-c', self.IMAGE_INFO_CMD], From 07df6eb7ef510619b1662276e9cc540dd721e080 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 26 Sep 2023 13:51:57 +0400 Subject: [PATCH 2/7] Fixes. 
--- sh_scrapy/commands/shub_image_info.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sh_scrapy/commands/shub_image_info.py b/sh_scrapy/commands/shub_image_info.py index de0160e..f79d394 100644 --- a/sh_scrapy/commands/shub_image_info.py +++ b/sh_scrapy/commands/shub_image_info.py @@ -37,11 +37,10 @@ def run(self, args, opts): result = { 'project_type': 'scrapy', 'spiders': sorted(self.crawler_process.spider_loader.list()), - 'metadata': {}, } - if sys.version_info >= (3, 8): - # scrapy-spider-metadata requires Python 3.8+ + try: from scrapy_spider_metadata import get_metadata_for_spider + result['metadata'] = {} for spider_name in result['spiders']: spider_cls = self.crawler_process.spider_loader.load(spider_name) metadata_dict = get_metadata_for_spider(spider_cls) @@ -50,7 +49,9 @@ def run(self, args, opts): json.dumps(metadata_dict) except (TypeError, ValueError): continue - result['metadata'] = metadata_dict + result['metadata'][spider_name] = metadata_dict + except ImportError: + pass if opts.debug: output = subprocess.check_output( ['bash', '-c', self.IMAGE_INFO_CMD], From 1c1608eadd2042113038af1c6bfd8f75c8d1642b Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 26 Sep 2023 16:57:29 +0400 Subject: [PATCH 3/7] get_metadata_for_spider was renamed. 
--- sh_scrapy/commands/shub_image_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sh_scrapy/commands/shub_image_info.py b/sh_scrapy/commands/shub_image_info.py index f79d394..4ab1856 100644 --- a/sh_scrapy/commands/shub_image_info.py +++ b/sh_scrapy/commands/shub_image_info.py @@ -39,11 +39,11 @@ def run(self, args, opts): 'spiders': sorted(self.crawler_process.spider_loader.list()), } try: - from scrapy_spider_metadata import get_metadata_for_spider + from scrapy_spider_metadata import get_spider_metadata result['metadata'] = {} for spider_name in result['spiders']: spider_cls = self.crawler_process.spider_loader.load(spider_name) - metadata_dict = get_metadata_for_spider(spider_cls) + metadata_dict = get_spider_metadata(spider_cls) try: # make sure it's serializable json.dumps(metadata_dict) From 7e3da4fc1c64560ea2b022052d5779c2151fc3d1 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 26 Sep 2023 17:38:13 +0400 Subject: [PATCH 4/7] Add tests for shub-image-info. 
--- tests/test_crawl.py | 50 +++++++++++++++++++++++++++++++++++++++++++-- tests/utils.py | 36 ++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 tests/utils.py diff --git a/tests/test_crawl.py b/tests/test_crawl.py index e382b52..6492c36 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -3,9 +3,7 @@ import json import mock import pytest -import warnings from scrapy.settings import Settings -from scrapy.exceptions import ScrapyDeprecationWarning import sh_scrapy.crawl from sh_scrapy.crawl import _fatalerror @@ -18,6 +16,7 @@ from sh_scrapy.crawl import list_spiders from sh_scrapy.crawl import main from sh_scrapy.log import HubstorageLogHandler +from tests.utils import create_project, call_command @mock.patch.dict(os.environ, {'HWORKER_SENTRY_DSN': 'hw-sentry-dsn', @@ -281,3 +280,50 @@ def test_main(mocked_launch, pipe_writer): # This ensures that pipe is writable even if main program is fininshed - # e.g. for threads that are not closed yet. 
assert not pipe_writer.close.called + + +def test_image_info(tmp_path): + project_dir = create_project(tmp_path) + out, err = call_command(project_dir, "shub-image-info") + # can't be asserted as it contains a SHScrapyDeprecationWarning + # assert err == "" + data = json.loads(out) + assert data == { + "project_type": "scrapy", + "spiders": ["myspider"], + "metadata": {"myspider": {}}, + } + + +def test_image_info_metadata(tmp_path): + project_dir = create_project(tmp_path, spider_text=""" +from scrapy import Spider + +class MySpider(Spider): + name = "myspider" + metadata = {"foo": 42} +""") + out, _ = call_command(project_dir, "shub-image-info") + data = json.loads(out) + assert data == { + "project_type": "scrapy", + "spiders": ["myspider"], + "metadata": {"myspider": {"foo": 42}}, + } + + +def test_image_info_metadata_skip_broken(tmp_path): + project_dir = create_project(tmp_path, spider_text=""" +from scrapy import Spider + +class MySpider(Spider): + name = "myspider" + metadata = {"foo": Spider} +""") + out, _ = call_command(project_dir, "shub-image-info") + data = json.loads(out) + assert data == { + "project_type": "scrapy", + "spiders": ["myspider"], + "metadata": {}, + } diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..8e320dd --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,36 @@ +import os +import subprocess +import sys +from pathlib import Path +from typing import Tuple, Optional, Union + + +def call_command(cwd: Union[str, os.PathLike], *args: str) -> Tuple[str, str]: + result = subprocess.run( + args, + cwd=str(cwd), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) + assert result.returncode == 0, result.stderr + return result.stdout, result.stderr + + +def call_scrapy_command(cwd: Union[str, os.PathLike], *args: str) -> Tuple[str, str]: + args = (sys.executable, "-m", "scrapy.cmdline") + args + return call_command(cwd, *args) + + +def create_project(topdir: Path, spider_text: 
Optional[str] = None) -> Path: + project_name = "foo" + cwd = topdir + call_scrapy_command(str(cwd), "startproject", project_name) + cwd /= project_name + (cwd / project_name / "spiders" / "spider.py").write_text(spider_text or """ +from scrapy import Spider + +class MySpider(Spider): + name = "myspider" +""") + return cwd From 0a86dc208a751578c0094d6aca8b74a0db8511e7 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 26 Sep 2023 18:02:21 +0400 Subject: [PATCH 5/7] Fix tests without scrapy-spider-metadata. --- tests/test_crawl.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 6492c36..10f8efe 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -19,6 +19,13 @@ from tests.utils import create_project, call_command +try: + from scrapy_spider_metadata import get_spider_metadata + SPIDER_METADATA_AVAILABLE = True +except: + SPIDER_METADATA_AVAILABLE = False + + @mock.patch.dict(os.environ, {'HWORKER_SENTRY_DSN': 'hw-sentry-dsn', 'SENTRY_DSN': 'sentry-dsn'}) def test_init_module(): @@ -288,13 +295,17 @@ def test_image_info(tmp_path): # can't be asserted as it contains a SHScrapyDeprecationWarning # assert err == "" data = json.loads(out) - assert data == { + expected = { "project_type": "scrapy", "spiders": ["myspider"], "metadata": {"myspider": {}}, } + if not SPIDER_METADATA_AVAILABLE: + del expected["metadata"] + assert data == expected +@pytest.mark.skipif(not SPIDER_METADATA_AVAILABLE, reason="scrapy-spider-metadata is not installed") def test_image_info_metadata(tmp_path): project_dir = create_project(tmp_path, spider_text=""" from scrapy import Spider @@ -305,13 +316,15 @@ class MySpider(Spider): """) out, _ = call_command(project_dir, "shub-image-info") data = json.loads(out) - assert data == { + expected = { "project_type": "scrapy", "spiders": ["myspider"], "metadata": {"myspider": {"foo": 42}}, } + assert data == expected +@pytest.mark.skipif(not 
SPIDER_METADATA_AVAILABLE, reason="scrapy-spider-metadata is not installed") def test_image_info_metadata_skip_broken(tmp_path): project_dir = create_project(tmp_path, spider_text=""" from scrapy import Spider @@ -322,8 +335,9 @@ class MySpider(Spider): """) out, _ = call_command(project_dir, "shub-image-info") data = json.loads(out) - assert data == { + expected = { "project_type": "scrapy", "spiders": ["myspider"], "metadata": {}, } + assert data == expected From 71f2b8c0ed447e8985ea8dc0fda3e70e715a0ebd Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Wed, 27 Sep 2023 16:02:43 +0400 Subject: [PATCH 6/7] Remove scrapy-spider-metadata from requirements. --- setup.py | 1 - tests/test_crawl.py | 6 ++++-- tox.ini | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index f79a705..f84b754 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,6 @@ install_requires=[ 'Scrapy>=1.6', 'scrapinghub>=2.1.0', - 'scrapy-spider-metadata; python_version >= "3.8"' ], entry_points={ 'console_scripts': [ diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 10f8efe..329e4c6 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -305,7 +305,6 @@ def test_image_info(tmp_path): assert data == expected -@pytest.mark.skipif(not SPIDER_METADATA_AVAILABLE, reason="scrapy-spider-metadata is not installed") def test_image_info_metadata(tmp_path): project_dir = create_project(tmp_path, spider_text=""" from scrapy import Spider @@ -321,10 +320,11 @@ class MySpider(Spider): "spiders": ["myspider"], "metadata": {"myspider": {"foo": 42}}, } + if not SPIDER_METADATA_AVAILABLE: + del expected["metadata"] assert data == expected -@pytest.mark.skipif(not SPIDER_METADATA_AVAILABLE, reason="scrapy-spider-metadata is not installed") def test_image_info_metadata_skip_broken(tmp_path): project_dir = create_project(tmp_path, spider_text=""" from scrapy import Spider @@ -340,4 +340,6 @@ class MySpider(Spider): "spiders": ["myspider"], "metadata": {}, } + 
if not SPIDER_METADATA_AVAILABLE: + del expected["metadata"] assert data == expected diff --git a/tox.ini b/tox.ini index 49682e8..c1b6641 100644 --- a/tox.ini +++ b/tox.ini @@ -10,5 +10,7 @@ deps = hubstorage packaging py36-scrapy16: Scrapy==1.6 + scrapy-spider-metadata; python_version >= "3.8" + commands = pytest --verbose --cov=sh_scrapy --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: sh_scrapy tests} From 4d4c5ac8818d592b25fc05d9ea003b20829fad22 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Wed, 27 Sep 2023 16:27:04 +0400 Subject: [PATCH 7/7] Cleanup. --- sh_scrapy/commands/shub_image_info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sh_scrapy/commands/shub_image_info.py b/sh_scrapy/commands/shub_image_info.py index 4ab1856..70bd3db 100644 --- a/sh_scrapy/commands/shub_image_info.py +++ b/sh_scrapy/commands/shub_image_info.py @@ -2,7 +2,6 @@ from __future__ import print_function import json import subprocess -import sys from scrapy.commands import ScrapyCommand @@ -40,6 +39,9 @@ def run(self, args, opts): } try: from scrapy_spider_metadata import get_spider_metadata + except ImportError: + pass + else: result['metadata'] = {} for spider_name in result['spiders']: spider_cls = self.crawler_process.spider_loader.load(spider_name) @@ -50,8 +52,6 @@ def run(self, args, opts): except (TypeError, ValueError): continue result['metadata'][spider_name] = metadata_dict - except ImportError: - pass if opts.debug: output = subprocess.check_output( ['bash', '-c', self.IMAGE_INFO_CMD],