From e90fcc9f5029c6087362f123d86dfce634778382 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Fri, 3 Jan 2025 15:26:57 -0300 Subject: [PATCH] feat: add kingfisher collect log warnings with scrapyloganalyzer --- data_registry/process_manager/task/collect.py | 8 +++++++ requirements.in | 1 + requirements.txt | 16 ++++++++++--- requirements_dev.txt | 24 ++++++++++++++----- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/data_registry/process_manager/task/collect.py b/data_registry/process_manager/task/collect.py index a738610..b2595c3 100644 --- a/data_registry/process_manager/task/collect.py +++ b/data_registry/process_manager/task/collect.py @@ -5,6 +5,7 @@ import requests from django.conf import settings +from scrapyloganalyzer import ScrapyLogFile from data_registry.exceptions import ConfigurationError, RecoverableError, UnexpectedError from data_registry.models import Task @@ -105,6 +106,13 @@ def get_status(self): if "process_id" not in self.job.context or "data_version" not in self.job.context: raise UnexpectedError("Unable to retrieve collection ID and data version from Scrapy log") + scrapy_log = ScrapyLogFile(scrapy_log_url) + for key in scrapy_log.logparser["log_categories"]: + if scrapy_log.logparser["log_categories"][key]["count"] > 0: + logger.warning("%s: %s", self, {scrapy_log.logparser["log_categories"][key]["details"]}) + if scrapy_log.error_rate: + logger.warning("%s: crawl error rate was %s", self, {scrapy_log.error_rate}) + return Task.Status.COMPLETED raise RecoverableError(f"Unable to find status of Scrapyd job {scrapyd_job_id}") diff --git a/requirements.in b/requirements.in index 6f3f73a..d08325d 100644 --- a/requirements.in +++ b/requirements.in @@ -6,5 +6,6 @@ gunicorn[setproctitle] markdown-it-py psycopg2 requests +scrapyloganalyzer sentry-sdk yapw[perf] diff --git a/requirements.txt b/requirements.txt index dbfbe14..5bb1d6c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,12 +21,14 @@ django-modeltranslation==0.18.10 # via -r requirements.in flatterer==0.20.1 # via -r requirements.in -gunicorn[setproctitle]==22.0.0 +gunicorn==22.0.0 # via -r requirements.in idna==3.7 # via requests ijson==3.1.4 # via flatterer +logparser==0.8.3 + # via scrapyloganalyzer markdown-it-py==2.2.0 # via -r requirements.in mdurl==0.1.2 @@ -41,22 +43,30 @@ packaging==24.0 # via gunicorn pandas==1.5.0 # via flatterer +pexpect==4.9.0 + # via logparser pika==1.3.2 # via yapw psycopg2==2.9.6 # via -r requirements.in +ptyprocess==0.7.0 + # via pexpect python-dateutil==2.8.2 # via pandas pytz==2021.1 # via pandas requests==2.32.3 # via -r requirements.in +scrapyloganalyzer==0.0.1 + # via -r requirements.in sentry-sdk==2.8.0 # via -r requirements.in setproctitle==1.2.2 # via gunicorn six==1.16.0 - # via python-dateutil + # via + # logparser + # python-dateutil sqlparse==0.5.0 # via django typing-extensions==4.7.1 @@ -67,5 +77,5 @@ urllib3==2.2.2 # via # requests # sentry-sdk -yapw[perf]==0.1.4 +yapw==0.1.4 # via -r requirements.in diff --git a/requirements_dev.txt b/requirements_dev.txt index f8cda12..5c5b793 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -30,7 +30,7 @@ django-modeltranslation==0.18.10 # via -r requirements.txt flatterer==0.20.1 # via -r requirements.txt -gunicorn[setproctitle]==22.0.0 +gunicorn==22.0.0 # via -r requirements.txt idna==3.7 # via @@ -40,6 +40,10 @@ ijson==3.1.4 # via # -r requirements.txt # flatterer +logparser==0.8.3 + # via + # -r requirements.txt + # scrapyloganalyzer markdown-it-py==2.2.0 # via -r requirements.txt mdurl==0.1.2 @@ -54,7 +58,6 @@ orjson==3.9.15 # via # -r requirements.txt # flatterer - # yapw packaging==24.0 # via # -r requirements.txt @@ -63,6 +66,10 @@ pandas==1.5.0 # via # -r requirements.txt # flatterer +pexpect==4.9.0 + # via + # -r requirements.txt + # logparser pika==1.3.2 # via # -r requirements.txt @@ -71,6 +78,10 @@ psycopg2==2.9.6 # via -r requirements.txt psycopg2-binary==2.9.2 # via -r requirements_dev.in +ptyprocess==0.7.0 + # via + # -r requirements.txt + # pexpect python-dateutil==2.8.2 # via # -r requirements.txt @@ -81,15 +92,16 @@ pytz==2021.1 # pandas requests==2.32.3 # via -r requirements.txt +scrapyloganalyzer==0.0.1 + # via -r requirements.txt sentry-sdk==2.8.0 # via -r requirements.txt setproctitle==1.2.2 - # via - # -r requirements.txt - # gunicorn + # via -r requirements.txt six==1.16.0 # via # -r requirements.txt + # logparser # python-dateutil sqlparse==0.5.0 # via @@ -105,5 +117,5 @@ urllib3==2.2.2 # -r requirements.txt # requests # sentry-sdk -yapw[perf]==0.1.4 +yapw==0.1.4 # via -r requirements.txt