Rely on centralized request fingerprints on Scrapy 2.7 and higher
Gallaecio committed Feb 19, 2024
1 parent b27e8e9 commit 780ff8c
Showing 3 changed files with 65 additions and 6 deletions.
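Since Scrapy 2.7, request fingerprints come from a centralized crawler component, crawler.request_fingerprinter, configurable through the REQUEST_FINGERPRINTER_CLASS setting; the older scrapy.utils.request.request_fingerprint function is deprecated. This commit makes HubstorageMiddleware use the centralized component when the running crawler provides one and fall back to the legacy function otherwise. A minimal sketch of the Scrapy 2.7+ API this relies on (not part of the commit; the example_request name is illustrative):

# Sketch, assuming Scrapy >= 2.7 and a get_crawler() that initializes the
# fingerprinter (true for recent Scrapy releases).
from scrapy import Request
from scrapy.utils.test import get_crawler

crawler = get_crawler()
example_request = Request("https://example.com")  # illustrative request

# fingerprint() returns bytes; this middleware stores their hex form.
print(crawler.request_fingerprinter.fingerprint(example_request).hex())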
36 changes: 33 additions & 3 deletions sh_scrapy/extension.py
@@ -1,5 +1,6 @@
 import logging
 from contextlib import suppress
+from warnings import warn
 from weakref import WeakKeyDictionary
 
 import scrapy
@@ -9,7 +10,6 @@
 from scrapy.exporters import PythonItemExporter
 from scrapy.http import Request
 from scrapy.utils.deprecate import create_deprecated_class
-from scrapy.utils.request import request_fingerprint
 
 from sh_scrapy import hsref
 from sh_scrapy.crawl import ignore_warnings
@@ -79,7 +79,37 @@ def spider_closed(self, spider, reason):
         """
 
 
-class HubstorageMiddleware(object):
+class HubstorageMiddleware:
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        try:
+            result = cls(crawler)
+        except TypeError:
+            warn(
+                (
+                    "Subclasses of HubstorageMiddleware must now accept a "
+                    "crawler parameter in their __init__ method. This will "
+                    "become an error in the future."
+                ),
+                DeprecationWarning,
+            )
+            result = cls()
+            result._crawler = crawler
+            result._load_fingerprinter()
+        return result
+
+    def __init__(self, crawler=None):
+        self._crawler = crawler
+        if crawler:
+            self._load_fingerprinter()
+
+    def _load_fingerprinter(self):
+        if hasattr(self._crawler, "request_fingerprinter"):
+            self._fingerprint = lambda request: self._crawler.request_fingerprinter.fingerprint(request).hex()
+        else:
+            from scrapy.utils.request import request_fingerprint
+            self._fingerprint = request_fingerprint
 
     def __init__(self):
         self._seen = WeakKeyDictionary()
@@ -95,7 +125,7 @@ def process_spider_input(self, response, spider):
             rs=len(response.body),
             duration=response.meta.get('download_latency', 0) * 1000,
             parent=response.meta.get(HS_PARENT_ID_KEY),
-            fp=request_fingerprint(response.request),
+            fp=self._fingerprint(response.request),
         )
         self._seen[response] = next(self.request_id_sequence)
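Note the compatibility shim above: from_crawler() first tries to pass the crawler to __init__ and, if that raises TypeError, falls back to the old zero-argument signature while emitting a DeprecationWarning. A minimal sketch of a migrated subclass (MyMiddleware is hypothetical, not part of this commit):

from sh_scrapy.extension import HubstorageMiddleware

class MyMiddleware(HubstorageMiddleware):
    def __init__(self, crawler=None):
        # Accept and forward `crawler` so from_crawler() takes the
        # non-deprecated path and the request fingerprinter is loaded.
        super().__init__(crawler)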
32 changes: 29 additions & 3 deletions tests/test_extension.py
@@ -3,11 +3,12 @@
 
 import mock
 import pytest
+import scrapy
+from packaging import version
 from scrapy import Spider
 from scrapy.exporters import PythonItemExporter
 from scrapy.http import Request, Response
 from scrapy.item import Item
-from scrapy.utils.request import request_fingerprint
 from scrapy.utils.test import get_crawler
 
 from sh_scrapy.extension import HubstorageExtension, HubstorageMiddleware
@@ -92,7 +93,8 @@ def test_hs_ext_spider_closed(hs_ext):
 @pytest.fixture
 def hs_mware(monkeypatch):
     monkeypatch.setattr('sh_scrapy.extension.pipe_writer', mock.Mock())
-    return HubstorageMiddleware()
+    crawler = get_crawler()
+    return HubstorageMiddleware.from_crawler(crawler)
 
 
 def test_hs_mware_init(hs_mware):
@@ -106,9 +108,13 @@ def test_hs_mware_process_spider_input(hs_mware):
     hs_mware.process_spider_input(response, Spider('test'))
     assert hs_mware.pipe_writer.write_request.call_count == 1
     args = hs_mware.pipe_writer.write_request.call_args[1]
+    if hasattr(hs_mware._crawler, "request_fingerprinter"):
+        fp = "1c735665b072000e11b0169081bce5bbaeac09a7"
+    else:
+        fp = "a001a1eb4537acdc8525edf1250065cab2657152"
     assert args == {
         'duration': 0,
-        'fp': request_fingerprint(response.request),
+        'fp': fp,
         'method': 'GET',
         'parent': None,
         'rs': 0,
@@ -138,3 +144,23 @@ def test_hs_mware_process_spider_output_filter_request(hs_mware):
     # make sure that we update hsparent meta only for requests
     assert result[0].meta.get(HS_PARENT_ID_KEY) is None
     assert result[1].meta[HS_PARENT_ID_KEY] == 'riq'
+
+
+@pytest.mark.skipif(
+    version.parse(scrapy.__version__) < version.parse("2.7"),
+    reason="Only Scrapy 2.7 and higher support centralized request fingerprints."
+)
+def test_custom_fingerprinter(monkeypatch):
+    monkeypatch.setattr('sh_scrapy.extension.pipe_writer', mock.Mock())
+
+    class CustomFingerprinter:
+        def fingerprint(self, request):
+            return b"foo"
+
+    crawler = get_crawler(settings_dict={"REQUEST_FINGERPRINTER_CLASS": CustomFingerprinter})
+    mw = HubstorageMiddleware.from_crawler(crawler)
+
+    response = Response('http://resp-url')
+    response.request = Request('http://req-url')
+    mw.process_spider_input(response, Spider('test'))
+    assert mw.pipe_writer.write_request.call_args[1]["fp"] == b"foo".hex()
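Outside of tests, the REQUEST_FINGERPRINTER_CLASS override that test_custom_fingerprinter passes to get_crawler() would live in the project settings instead. A sketch under that assumption (the myproject paths are placeholders):

# Hypothetical myproject/fingerprinters.py
class CustomFingerprinter:
    def fingerprint(self, request):
        # Must return bytes; HubstorageMiddleware stores their hex form.
        return request.url.encode()

# Hypothetical myproject/settings.py (Scrapy >= 2.7)
REQUEST_FINGERPRINTER_CLASS = "myproject.fingerprinters.CustomFingerprinter"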
3 changes: 3 additions & 0 deletions tox.ini
@@ -1,6 +1,9 @@
 # tox.ini
 [tox]
 envlist = py36-scrapy16, py
+requires =
+    # https://github.com/pypa/virtualenv/issues/2550
+    virtualenv<=20.21.1
 
 [testenv]
 deps =
