Skip to content

Commit

Permalink
tmp: trace
Browse files Browse the repository at this point in the history
  • Loading branch information
BurnzZ committed Jan 22, 2024
1 parent b27e8e9 commit 517b811
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions sh_scrapy/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,38 @@

class HubstorageSpiderMiddleware(object):
    """Spider middleware that links spider-generated requests to their parent.

    For each response handled by the spider, the value stored for
    ``response.request`` in the shared ``seen_requests`` mapping is looked up
    and attached, under ``HS_PARENT_ID_KEY``, to every ``Request`` the spider
    yields for that response.
    """

    def __init__(self):
        # Keep a handle on the module-level mapping of already seen requests.
        self._seen_requests = seen_requests

    def process_spider_output(self, response, result, spider):
        """Yield every element of ``result``, tagging requests with a parent id.

        The entry for ``response.request`` is removed from the seen-requests
        mapping (``None`` is used when there is no entry). Items that are not
        ``Request`` instances are passed through untouched.
        """
        parent_id = self._seen_requests.pop(response.request, None)
        print(f"[SpiderMw] parent={parent_id}")
        for entry in result:
            print(f"[SpiderMw] result={entry}")
            if not isinstance(entry, Request):
                yield entry
                continue
            meta = entry.meta
            meta[HS_PARENT_ID_KEY] = parent_id
            # Drop any request id the spider may have set on its own request.
            meta.pop(HS_REQUEST_ID_KEY, None)
            print(f"[SpiderMw] x.meta={entry.meta}")
            yield entry


class HubstorageDownloaderMiddleware(object):
"""Hubstorage downloader middleware.
What it does:
- Generates request ids for all downloaded requests.
- Sets parent request ids for requests generated in downloader middlewares.
- Stores all downloaded requests into Hubstorage.
"""

def __init__(self):
Expand All @@ -60,9 +63,12 @@ def process_request(self, request, spider):
request.meta[HS_PARENT_ID_KEY] = request_id

def process_response(self, request, response, spider):
print(f"[DownloaderMw] request={request} parent={request.meta.setdefault(HS_PARENT_ID_KEY)}")

# This class of response check is intended to fix the bug described here
# https://github.com/scrapy-plugins/scrapy-zyte-api/issues/112
if type(response).__name__ == "DummyResponse" and type(response).__module__.startswith("scrapy_poet"):
print(f"[DownloaderMw] skip")
return response

self.pipe_writer.write_request(
Expand Down

0 comments on commit 517b811

Please sign in to comment.