Skip to content

Commit

Permalink
Remove custom priorities from the job posting spider.
Browse files Browse the repository at this point in the history
  • Loading branch information
wRAR committed Nov 18, 2024
1 parent 40fd2aa commit 2acf2a8
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 27 deletions.
4 changes: 0 additions & 4 deletions tests/test_job_posting.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ def test_crawl():
assert requests[1].callback == spider.parse_job_posting
assert requests[2].url == nextpage_url
assert requests[2].callback == spider.parse_navigation
assert [request.priority for request in requests] == [199, 183, 100]

# nextpage
navigation = JobPostingNavigation.from_dict(
Expand All @@ -118,7 +117,6 @@ def test_crawl():
assert requests[0].callback == spider.parse_job_posting
assert requests[1].url == item_urls[1]
assert requests[1].callback == spider.parse_job_posting
assert [request.priority for request in requests] == [199, 183]


def test_crawl_strategy_direct_item():
Expand Down Expand Up @@ -482,7 +480,6 @@ def test_get_nextpage_request():
scrapy_request = spider.get_nextpage_request(request)
assert isinstance(scrapy_request, scrapy.Request)
assert scrapy_request.callback == parse_navigation
assert scrapy_request.priority == 100
assert scrapy_request.meta == {
"page_params": {},
"crawling_logs": {"name": "", "probability": None, "page_type": "nextPage"},
Expand All @@ -501,7 +498,6 @@ def test_get_parse_navigation_request():
scrapy_request = spider.get_parse_navigation_request(request)
assert isinstance(scrapy_request, scrapy.Request)
assert scrapy_request.callback == parse_navigation
assert scrapy_request.priority == 0
assert scrapy_request.meta == {
"page_params": {},
"crawling_logs": {
Expand Down
24 changes: 1 addition & 23 deletions zyte_spider_templates/spiders/job_posting.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,31 +195,17 @@ def parse_job_posting(
f"less than threshold of 0.1:\n{job_posting}"
)

@staticmethod
def get_parse_navigation_request_priority(
request: Union[ProbabilityRequest, Request]
) -> int:
if (
not hasattr(request, "metadata")
or not request.metadata
or request.metadata.probability is None
):
return 0
return int(100 * request.metadata.probability)

def get_parse_navigation_request(
self,
request: Union[ProbabilityRequest, Request],
callback: Optional[Callable] = None,
page_params: Optional[Dict[str, Any]] = None,
priority: Optional[int] = None,
page_type: str = "jobPostingNavigation",
) -> Request:
callback = callback or self.parse_navigation

return request.to_scrapy(
callback=callback,
priority=priority or self.get_parse_navigation_request_priority(request),
meta={
"page_params": page_params or {},
"crawling_logs": {
Expand All @@ -237,26 +223,18 @@ def get_nextpage_request(
page_params: Optional[Dict[str, Any]] = None,
):
return self.get_parse_navigation_request(
request, callback, page_params, self._NEXT_PAGE_PRIORITY, "nextPage"
request, callback, page_params, "nextPage"
)

def get_parse_job_posting_request_priority(
    self, request: ProbabilityRequest
) -> int:
    """Priority for a job-posting item request.

    Scales the request's probability (treated as 0 when missing) into
    the 0-100 range and offsets it by ``self._NEXT_PAGE_PRIORITY`` so
    item requests outrank next-page requests.
    """
    scaled = int((request.get_probability() or 0) * 100)
    return scaled + self._NEXT_PAGE_PRIORITY

def get_parse_job_posting_request(
self, request: ProbabilityRequest, callback: Optional[Callable] = None
) -> Request:
callback = callback or self.parse_job_posting
priority = self.get_parse_job_posting_request_priority(request)

probability = request.get_probability()

scrapy_request = request.to_scrapy(
callback=callback,
priority=priority,
meta={
"crawling_logs": {
"name": request.name,
Expand Down

0 comments on commit 2acf2a8

Please sign in to comment.