Commit c6ecb99
check for more reasons why the scraper failed
honzajavorek committed Jan 19, 2024
1 parent 6824018 commit c6ecb99
Showing 4 changed files with 36 additions and 25 deletions.
23 changes: 2 additions & 21 deletions juniorguru_plucker/actors.py
@@ -4,12 +4,12 @@
 import nest_asyncio
 from apify import Actor
 from scrapy import Item, Spider
-from scrapy.crawler import CrawlerProcess
 from scrapy.settings import BaseSettings, Settings
 from scrapy.spiderloader import SpiderLoader as BaseSpiderLoader
-from scrapy.statscollectors import StatsCollector
 from scrapy.utils.reactor import install_reactor
+
+from juniorguru_plucker.spiders import run_spider
 
 
 async def run_actor(settings: Settings, spider_class: Type[Spider]) -> None:
     async with Actor:
@@ -20,25 +20,6 @@ async def run_actor(settings: Settings, spider_class: Type[Spider]) -> None:
         run_spider(settings, spider_class)
 
 
-def run_spider(settings: Settings, spider_class: Type[Spider]):
-    crawler_process = CrawlerProcess(settings, install_root_handler=False)
-    crawler_process.crawl(spider_class)
-    stats_collector = get_stats_collector(crawler_process)
-    crawler_process.start()
-
-    if exc_count := stats_collector.get_value("spider_exceptions"):
-        raise RuntimeError(f"Scraping failed with {exc_count} exceptions raised")
-    if error_count := stats_collector.get_value("log_count/ERROR"):
-        raise RuntimeError(f"Scraping failed with {error_count} errors logged")
-
-
-def get_stats_collector(crawler_process: CrawlerProcess) -> StatsCollector:
-    if len(crawler_process.crawlers) != 1:
-        raise RuntimeError("Exactly one crawler expected")
-    crawler = crawler_process.crawlers.pop()
-    return crawler.stats
-
-
 def apply_apify_settings(
     settings: Settings, proxy_config: dict | None = None
 ) -> Settings:
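Note: after this change, actors.py keeps only the Apify-facing entry point and delegates crawling to the relocated run_spider. A minimal sketch of driving run_actor from a script follows; ExampleSpider, its URL, and the empty Settings are hypothetical stand-ins, not part of this commit, and the Actor context manager expects an Apify environment when run locally.

import asyncio

from scrapy import Spider
from scrapy.settings import Settings

from juniorguru_plucker.actors import run_actor


class ExampleSpider(Spider):
    # hypothetical spider standing in for a real job-board spider
    name = "example"
    start_urls = ["https://example.com"]


# run_actor is a coroutine, so it needs an event loop; running it
# outside the Apify platform requires Apify env vars or `apify run`
asyncio.run(run_actor(Settings(), ExampleSpider))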
2 changes: 1 addition & 1 deletion juniorguru_plucker/cli.py
@@ -25,8 +25,8 @@
     get_spider_module_name,
     iter_actor_paths,
     run_actor,
-    run_spider,
 )
+from juniorguru_plucker.spiders import run_spider
 
 
 logger = logging.getLogger("juniorguru_plucker")
2 changes: 1 addition & 1 deletion juniorguru_plucker/settings.py
@@ -23,4 +23,4 @@
 
 CLOSESPIDER_ERRORCOUNT = 1
 
-CLOSESPIDER_TIMEOUT_NO_ITEM = 30  # seconds
+CLOSESPIDER_TIMEOUT_NO_ITEM = 60  # seconds
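Note: doubling the no-item timeout gives slow job boards more breathing room before Scrapy's CloseSpider extension shuts the crawl down. A hedged sketch of how these two settings interact with the new checks in run_spider; the dict values mirror settings.py, everything else is illustrative.

from scrapy.settings import Settings

settings = Settings({
    # stop the crawl early once errors start being logged
    "CLOSESPIDER_ERRORCOUNT": 1,
    # stop the crawl if no item has been scraped for 60 seconds
    "CLOSESPIDER_TIMEOUT_NO_ITEM": 60,
})

# If either condition trips, Scrapy records a finish_reason such as
# "closespider_errorcount" or "closespider_timeout_no_item" instead of
# "finished", which the finish_reason check in run_spider turns into
# a RuntimeError rather than a silent pass.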
34 changes: 32 additions & 2 deletions juniorguru_plucker/spiders.py
@@ -1,8 +1,38 @@
-from scrapy import Spider as BaseSpider
+from scrapy import Spider
+from scrapy.crawler import CrawlerProcess
+from scrapy.settings import Settings
+from scrapy.statscollectors import StatsCollector
 
 
-class JobSpider(BaseSpider):
+def run_spider(settings: Settings, spider_class: type[Spider]):
+    crawler_process = CrawlerProcess(settings, install_root_handler=False)
+    crawler_process.crawl(spider_class)
+    stats_collector = get_stats_collector(crawler_process)
+    crawler_process.start()
+
+    if reason := stats_collector.get_value("finish_reason"):
+        if reason != "finished":
+            raise RuntimeError(f"Scraping didn't finish properly, reason: {reason}")
+    if item_count := stats_collector.get_value(
+        "item_dropped_reasons_count/MissingRequiredFields"
+    ):
+        raise RuntimeError(
+            f"Scraping failed with {item_count} items dropped because of missing required fields"
+        )
+    if exc_count := stats_collector.get_value("spider_exceptions"):
+        raise RuntimeError(f"Scraping failed with {exc_count} exceptions raised")
+    if error_count := stats_collector.get_value("log_count/ERROR"):
+        raise RuntimeError(f"Scraping failed with {error_count} errors logged")
+
+
+def get_stats_collector(crawler_process: CrawlerProcess) -> StatsCollector:
+    if len(crawler_process.crawlers) != 1:
+        raise RuntimeError("Exactly one crawler expected")
+    crawler = crawler_process.crawlers.pop()
+    return crawler.stats
+
+
+class JobSpider(Spider):
     extra_item_pipelines = {
         "juniorguru_plucker.pipelines.required_fields_filter.Pipeline": 50,
         "juniorguru_plucker.pipelines.short_description_filter.Pipeline": 100,
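Note: the relocated run_spider now inspects four failure signals in order: an abnormal finish_reason, items dropped for missing required fields, spider exceptions, and logged errors. A minimal sketch of how a failure surfaces follows; FailingSpider, its URL, and the empty Settings are hypothetical, and which message you get depends on which stat Scrapy populates in your version.

from scrapy import Spider
from scrapy.settings import Settings

from juniorguru_plucker.spiders import run_spider


class FailingSpider(Spider):
    # hypothetical spider whose parse callback always blows up
    name = "failing"
    start_urls = ["https://example.com"]

    def parse(self, response):
        raise ValueError("unexpected page layout")


try:
    run_spider(Settings(), FailingSpider)
except RuntimeError as exc:
    # exceptions raised in callbacks are also logged as errors, so at
    # minimum the log_count/ERROR check fires even if the aggregate
    # spider_exceptions stat is absent
    print(exc)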
