Merge pull request #26 from zytedata/refactor-base
moved eCommerce-specific code from BaseSpider
BurnzZ authored Jan 11, 2024
2 parents ddbb7a0 + 4a6c214 · commit 015c590
Showing 5 changed files with 211 additions and 221 deletions.
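
For context, the request-builder helpers moved by this PR are now exercised through EcommerceSpider in the updated tests below. A minimal usage sketch (the import path for EcommerceSpider is an assumption based on this repository's layout; the priority and meta values follow the tests in this diff):

    from zyte_common_items import ProbabilityRequest

    # Assumed import path, inferred from the repository layout.
    from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

    spider = EcommerceSpider(url="https://example.com")
    request = ProbabilityRequest.from_dict(
        {
            "url": "https://example.com/category/shoes",
            "name": "Shoes",
            "metadata": {"probability": 0.98},
        }
    )

    # The helper now lives on EcommerceSpider rather than BaseSpider.
    # Priority scales with probability: int(100 * 0.98) == 98.
    scrapy_request = spider.get_subcategory_request(request)
    assert scrapy_request.priority == 98
    assert scrapy_request.meta["crawling_logs"]["page_type"] == "subCategories"
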
2 changes: 1 addition & 1 deletion setup.cfg
@@ -37,4 +37,4 @@ per-file-ignores =

# E731: Ignore "do not assign a lambda expression, use a def" since
# we're using quick shortcuts for the tests
-tests/test_base.py:E731
+tests/test_ecommerce.py:E731
122 changes: 0 additions & 122 deletions tests/test_base.py

This file was deleted.

115 changes: 114 additions & 1 deletion tests/test_ecommerce.py
@@ -7,7 +7,7 @@
from pydantic import ValidationError
from scrapy_poet import DummyResponse
from scrapy_spider_metadata import get_spider_metadata
-from zyte_common_items import ProbabilityRequest, Product, ProductNavigation
+from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request

from zyte_spider_templates import BaseSpiderParams
from zyte_spider_templates._geolocations import (
@@ -474,3 +474,116 @@ def test_get_parse_product_request():
request = ProbabilityRequest(url="https://example.com")
scrapy_request = spider.get_parse_product_request(request)
assert scrapy_request.meta.get("allow_offsite") is True


def test_get_subcategory_request():
url = "https://example.com"

# Normal request but with mostly empty values
request = Request(url)
spider = EcommerceSpider(url="https://example.com")
parse_navigation = lambda _: None
spider.parse_navigation = parse_navigation # type: ignore

scrapy_request = spider.get_subcategory_request(request)
assert isinstance(scrapy_request, scrapy.Request)
assert scrapy_request.callback == parse_navigation
assert scrapy_request.priority == 0
assert scrapy_request.meta == {
"page_params": {},
"crawling_logs": {
"name": "",
"probability": None,
"page_type": "subCategories",
},
}

# Non-Heuristics request
request = ProbabilityRequest.from_dict(
{"url": url, "name": "Some request", "metadata": {"probability": 0.98}}
)
spider = EcommerceSpider(url="https://example.com")
parse_navigation = lambda _: None
spider.parse_navigation = parse_navigation # type: ignore
page_params = {"full_domain": "example.com"}

scrapy_request = spider.get_subcategory_request(request, page_params=page_params)
assert isinstance(scrapy_request, scrapy.Request)
assert scrapy_request.callback == parse_navigation
assert scrapy_request.priority == 98
assert scrapy_request.meta == {
"page_params": {},
"crawling_logs": {
"name": "Some request",
"probability": 0.98,
"page_type": "subCategories",
},
}

# Heuristics request
request = ProbabilityRequest.from_dict(
{
"url": url,
"name": "[heuristics] Some request",
"metadata": {"probability": 0.1},
}
)
spider = EcommerceSpider(url="https://example.com")
parse_navigation = lambda _: None
spider.parse_navigation = parse_navigation # type: ignore
page_params = {"full_domain": "example.com"}

scrapy_request = spider.get_subcategory_request(request, page_params=page_params)
assert isinstance(scrapy_request, scrapy.Request)
assert scrapy_request.callback == parse_navigation
assert scrapy_request.priority == 10
assert scrapy_request.meta == {
"page_params": page_params,
"crawling_logs": {
"name": "Some request",
"probability": 0.1,
"page_type": "productNavigation-heuristics",
},
}


def test_get_nextpage_request():
url = "https://example.com"

# Minimal Args
request = Request(url)
spider = EcommerceSpider(url="https://example.com")
parse_navigation = lambda _: None
spider.parse_navigation = parse_navigation # type: ignore

scrapy_request = spider.get_nextpage_request(request)
assert isinstance(scrapy_request, scrapy.Request)
assert scrapy_request.callback == parse_navigation
assert scrapy_request.priority == 100
assert scrapy_request.meta == {
"page_params": {},
"crawling_logs": {"name": "", "probability": None, "page_type": "nextPage"},
}


def test_get_parse_navigation_request():
url = "https://example.com"

# Minimal args
request = Request(url)
spider = EcommerceSpider(url="https://example.com")
parse_navigation = lambda _: None
spider.parse_navigation = parse_navigation # type: ignore

scrapy_request = spider.get_parse_navigation_request(request)
assert isinstance(scrapy_request, scrapy.Request)
assert scrapy_request.callback == parse_navigation
assert scrapy_request.priority == 0
assert scrapy_request.meta == {
"page_params": {},
"crawling_logs": {
"name": "",
"probability": None,
"page_type": "productNavigation",
},
}
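
The three tests above pin down the priority scheme: navigation requests are prioritized by int(100 * probability), falling back to 0 when no probability is available, while next-page requests get a fixed priority of 100. Combined with the product-request logic in the removed base.py code below, the arithmetic works out as in this sketch (the _NEXT_PAGE_PRIORITY value of 100 is inferred from the next-page test above):

    NEXT_PAGE_PRIORITY = 100  # inferred: test_get_nextpage_request expects 100

    def navigation_priority(probability):
        # Subcategory/navigation requests scale probability into [0, 100].
        return 0 if probability is None else int(100 * probability)

    def product_priority(probability):
        # Product requests outrank navigation and next-page requests:
        # probability score plus the next-page constant.
        return int(100 * (probability or 0)) + NEXT_PAGE_PRIORITY

    assert navigation_priority(0.98) == 98  # matches the non-heuristics test
    assert navigation_priority(None) == 0   # matches the empty-request test
    assert product_priority(0.5) == 150     # products always beat next pages
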
96 changes: 1 addition & 95 deletions zyte_spider_templates/spiders/base.py
@@ -1,11 +1,10 @@
from importlib.metadata import version
-from typing import Any, Callable, Dict, Optional, Union
+from typing import Any, Dict, Optional

import scrapy
from pydantic import BaseModel, Field
from scrapy.crawler import Crawler
from scrapy.utils.url import parse_url
-from zyte_common_items import ProbabilityRequest, Request

from zyte_spider_templates._geolocations import (
GEOLOCATION_OPTIONS_WITH_CODE,
@@ -90,96 +89,3 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
priority=ARG_SETTING_PRIORITY,
)
return spider

@staticmethod
def get_parse_navigation_request_priority(
request: Union[ProbabilityRequest, Request]
) -> int:
if (
not hasattr(request, "metadata")
or not request.metadata
or request.metadata.probability is None
):
return 0
return int(100 * request.metadata.probability)

def get_parse_navigation_request(
self,
request: Union[ProbabilityRequest, Request],
callback: Optional[Callable] = None,
page_params: Optional[Dict[str, Any]] = None,
priority: Optional[int] = None,
page_type: str = "productNavigation",
) -> scrapy.Request:
callback = callback or self.parse_navigation

return request.to_scrapy(
callback=callback,
priority=priority or self.get_parse_navigation_request_priority(request),
meta={
"page_params": page_params or {},
"crawling_logs": {
"name": request.name or "",
"probability": request.get_probability(),
"page_type": page_type,
},
},
)

def get_subcategory_request(
self,
request: Union[ProbabilityRequest, Request],
callback: Optional[Callable] = None,
page_params: Optional[Dict[str, Any]] = None,
priority: Optional[int] = None,
) -> scrapy.Request:
page_type = "subCategories"
request_name = request.name or ""
if "[heuristics]" not in request_name:
page_params = None
else:
page_type = "productNavigation-heuristics"
request.name = request_name.replace("[heuristics]", "").strip()
return self.get_parse_navigation_request(
request,
callback,
page_params,
priority,
page_type,
)

def get_nextpage_request(
self,
request: Union[ProbabilityRequest, Request],
callback: Optional[Callable] = None,
page_params: Optional[Dict[str, Any]] = None,
):
return self.get_parse_navigation_request(
request, callback, page_params, self._NEXT_PAGE_PRIORITY, "nextPage"
)

def get_parse_product_request_priority(self, request: ProbabilityRequest) -> int:
probability = request.get_probability() or 0
return int(100 * probability) + self._NEXT_PAGE_PRIORITY

def get_parse_product_request(
self, request: ProbabilityRequest, callback: Optional[Callable] = None
) -> scrapy.Request:
callback = callback or self.parse_product
priority = self.get_parse_product_request_priority(request)

probability = request.get_probability()

scrapy_request = request.to_scrapy(
callback=callback,
priority=priority,
meta={
"crawling_logs": {
"name": request.name,
"probability": probability,
"page_type": "product",
}
},
)
scrapy_request.meta["allow_offsite"] = True
return scrapy_request
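
Of the methods removed above, get_subcategory_request carries the only non-trivial branching: requests whose name carries a "[heuristics]" marker keep their page_params and are logged under a distinct page type, with the marker stripped from the name. A standalone restatement of that branch (a sketch, not the moved implementation itself), matching the tests earlier in this diff:

    def classify_subcategory(name, page_params):
        """Mirror of the heuristics branch in the removed get_subcategory_request."""
        name = name or ""
        if "[heuristics]" not in name:
            # Non-heuristic links drop page_params entirely.
            return "subCategories", None, name
        # Heuristic links keep page_params and get a distinct page_type;
        # the marker is stripped from the name used in crawling_logs.
        return (
            "productNavigation-heuristics",
            page_params,
            name.replace("[heuristics]", "").strip(),
        )

    assert classify_subcategory("Some request", {"full_domain": "example.com"}) == (
        "subCategories",
        None,
        "Some request",
    )
    assert classify_subcategory(
        "[heuristics] Some request", {"full_domain": "example.com"}
    ) == (
        "productNavigation-heuristics",
        {"full_domain": "example.com"},
        "Some request",
    )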