Commit c6ecb99
check for more reasons why the scraper failed
honzajavorek committed Jan 19, 2024
1 parent 6824018 commit c6ecb99
Showing 4 changed files with 36 additions and 25 deletions.
23 changes: 2 additions & 21 deletions juniorguru_plucker/actors.py
@@ -4,12 +4,12 @@
 import nest_asyncio
 from apify import Actor
 from scrapy import Item, Spider
-from scrapy.crawler import CrawlerProcess
 from scrapy.settings import BaseSettings, Settings
 from scrapy.spiderloader import SpiderLoader as BaseSpiderLoader
-from scrapy.statscollectors import StatsCollector
 from scrapy.utils.reactor import install_reactor
+
+from juniorguru_plucker.spiders import run_spider
 
 
 async def run_actor(settings: Settings, spider_class: Type[Spider]) -> None:
     async with Actor:
@@ -20,25 +20,6 @@ async def run_actor(settings: Settings, spider_class: Type[Spider]) -> None:
         run_spider(settings, spider_class)
 
 
-def run_spider(settings: Settings, spider_class: Type[Spider]):
-    crawler_process = CrawlerProcess(settings, install_root_handler=False)
-    crawler_process.crawl(spider_class)
-    stats_collector = get_stats_collector(crawler_process)
-    crawler_process.start()
-
-    if exc_count := stats_collector.get_value("spider_exceptions"):
-        raise RuntimeError(f"Scraping failed with {exc_count} exceptions raised")
-    if error_count := stats_collector.get_value("log_count/ERROR"):
-        raise RuntimeError(f"Scraping failed with {error_count} errors logged")
-
-
-def get_stats_collector(crawler_process: CrawlerProcess) -> StatsCollector:
-    if len(crawler_process.crawlers) != 1:
-        raise RuntimeError("Exactly one crawler expected")
-    crawler = crawler_process.crawlers.pop()
-    return crawler.stats
-
-
 def apply_apify_settings(
     settings: Settings, proxy_config: dict | None = None
 ) -> Settings:
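Note: after this change, actors.py keeps only the Apify-facing entry point and delegates crawling to the relocated run_spider. A minimal sketch of driving run_actor from a script follows; ExampleSpider, its URL, and the empty Settings are hypothetical stand-ins, not part of this commit, and the Actor context manager expects an Apify environment when run locally.

import asyncio

from scrapy import Spider
from scrapy.settings import Settings

from juniorguru_plucker.actors import run_actor


class ExampleSpider(Spider):
    # hypothetical spider standing in for a real job-board spider
    name = "example"
    start_urls = ["https://example.com"]


# run_actor is a coroutine, so it needs an event loop; running it
# outside the Apify platform requires Apify env vars or `apify run`
asyncio.run(run_actor(Settings(), ExampleSpider))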
2 changes: 1 addition & 1 deletion juniorguru_plucker/cli.py
@@ -25,8 +25,8 @@
     get_spider_module_name,
     iter_actor_paths,
     run_actor,
-    run_spider,
 )
+from juniorguru_plucker.spiders import run_spider
 
 
 logger = logging.getLogger("juniorguru_plucker")
2 changes: 1 addition & 1 deletion juniorguru_plucker/settings.py
@@ -23,4 +23,4 @@
 
 CLOSESPIDER_ERRORCOUNT = 1
 
-CLOSESPIDER_TIMEOUT_NO_ITEM = 30  # seconds
+CLOSESPIDER_TIMEOUT_NO_ITEM = 60  # seconds
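Note: doubling the no-item timeout gives slow job boards more breathing room before Scrapy's CloseSpider extension shuts the crawl down. A hedged sketch of how these two settings interact with the new checks in run_spider; the dict values mirror settings.py, everything else is illustrative.

from scrapy.settings import Settings

settings = Settings({
    # stop the crawl early once errors start being logged
    "CLOSESPIDER_ERRORCOUNT": 1,
    # stop the crawl if no item has been scraped for 60 seconds
    "CLOSESPIDER_TIMEOUT_NO_ITEM": 60,
})

# If either condition trips, Scrapy records a finish_reason such as
# "closespider_errorcount" or "closespider_timeout_no_item" instead of
# "finished", which the finish_reason check in run_spider turns into
# a RuntimeError rather than a silent pass.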
34 changes: 32 additions & 2 deletions juniorguru_plucker/spiders.py
@@ -1,8 +1,38 @@
-from scrapy import Spider as BaseSpider
+from scrapy import Spider
+from scrapy.crawler import CrawlerProcess
+from scrapy.settings import Settings
+from scrapy.statscollectors import StatsCollector
 
 
-class JobSpider(BaseSpider):
+def run_spider(settings: Settings, spider_class: type[Spider]):
+    crawler_process = CrawlerProcess(settings, install_root_handler=False)
+    crawler_process.crawl(spider_class)
+    stats_collector = get_stats_collector(crawler_process)
+    crawler_process.start()
+
+    if reason := stats_collector.get_value("finish_reason"):
+        if reason != "finished":
+            raise RuntimeError(f"Scraping didn't finish properly, reason: {reason}")
+    if item_count := stats_collector.get_value(
+        "item_dropped_reasons_count/MissingRequiredFields"
+    ):
+        raise RuntimeError(
+            f"Scraping failed with {item_count} items dropped because of missing required fields"
+        )
+    if exc_count := stats_collector.get_value("spider_exceptions"):
+        raise RuntimeError(f"Scraping failed with {exc_count} exceptions raised")
+    if error_count := stats_collector.get_value("log_count/ERROR"):
+        raise RuntimeError(f"Scraping failed with {error_count} errors logged")
+
+
+def get_stats_collector(crawler_process: CrawlerProcess) -> StatsCollector:
+    if len(crawler_process.crawlers) != 1:
+        raise RuntimeError("Exactly one crawler expected")
+    crawler = crawler_process.crawlers.pop()
+    return crawler.stats
+
+
+class JobSpider(Spider):
     extra_item_pipelines = {
         "juniorguru_plucker.pipelines.required_fields_filter.Pipeline": 50,
         "juniorguru_plucker.pipelines.short_description_filter.Pipeline": 100,
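Note: the relocated run_spider now inspects four failure signals in order: an abnormal finish_reason, items dropped for missing required fields, spider exceptions, and logged errors. A minimal sketch of how a failure surfaces follows; FailingSpider, its URL, and the empty Settings are hypothetical, and which message you get depends on which stat Scrapy populates in your version.

from scrapy import Spider
from scrapy.settings import Settings

from juniorguru_plucker.spiders import run_spider


class FailingSpider(Spider):
    # hypothetical spider whose parse callback always blows up
    name = "failing"
    start_urls = ["https://example.com"]

    def parse(self, response):
        raise ValueError("unexpected page layout")


try:
    run_spider(Settings(), FailingSpider)
except RuntimeError as exc:
    # exceptions raised in callbacks are also logged as errors, so at
    # minimum the log_count/ERROR check fires even if the aggregate
    # spider_exceptions stat is absent
    print(exc)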
