Merge pull request #385 from open-contracting/29-kingfisher-collect-warnings

feat: add kingfisher collect log warnings with scrapyloganalyzer
jpmckinney authored Jan 23, 2025
2 parents b109a12 + 311b62b commit f8d7c03
Showing 6 changed files with 73 additions and 9 deletions.
21 changes: 21 additions & 0 deletions data_registry/migrations/0060_alter_job_context.py
@@ -0,0 +1,21 @@
+# Generated by Django 4.2.18 on 2025-01-20 13:58
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("data_registry", "0059_job_process_notes"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="job",
+            name="context",
+            field=models.JSONField(
+                blank=True,
+                default=dict,
+                help_text="<dl><dt><code>spider</code></dt><dd>The name of the spider in Kingfisher Collect</dd><dt><code>data_version</code></dt><dd>The data version of the crawl in Kingfisher Collect</dd><dt><code>job_id</code></dt><dd>The ID of the job in Scrapyd</dd><dt><code>scrapy_log</code></dt><dd>A local URL to the log file of the crawl in Scrapyd</dd><dt><code>item_dropped_count</code></dt><dd>The number of items dropped by the crawl</dd><dt><code>invalid_json_count</code></dt><dd>The number of invalid JSON items dropped by the crawl</dd><dt><code>process_id</code></dt><dd>The ID of the base collection in Kingfisher Process</dd><dt><code>process_id_pelican</code></dt><dd>The ID of the compiled collection in Kingfisher Process</dd><dt><code>pelican_id</code></dt><dd>The ID of the dataset in Pelican</dd><dt><code>pelican_dataset_name</code></dt><dd>The name of the dataset in Pelican</dd></dl>",
+            ),
+        ),
+    ]
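
Since only help_text changes, this migration is a no-op at the database level (help_text is not stored in the schema); it exists so Django's migration state matches models.py. For illustration, a Job's context blob with the two new keys populated might look like the following sketch (all values are made up):

    # Illustrative Job.context contents after a crawl, with keys as documented
    # in the help_text above (spider name, IDs, and URLs are hypothetical).
    context = {
        "spider": "example_spider",
        "data_version": "2025-01-20T13:58:00",
        "job_id": "0123456789abcdef0123456789abcdef",
        "scrapy_log": "http://localhost:6800/logs/data_registry/example_spider/0123.log",
        "item_dropped_count": 2,   # new key: items dropped by the crawl
        "invalid_json_count": 1,   # new key: invalid JSON items dropped
        "process_id": 123,
        "process_id_pelican": 124,
        "pelican_id": 45,
        "pelican_dataset_name": "example_spider_2025-01-20",
    }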
4 changes: 4 additions & 0 deletions data_registry/models.py
@@ -58,6 +58,10 @@ class Status(models.TextChoices):
"<dd>The ID of the job in Scrapyd</dd>"
"<dt><code>scrapy_log</code></dt>"
"<dd>A local URL to the log file of the crawl in Scrapyd</dd>"
"<dt><code>item_dropped_count</code></dt>"
"<dd>The number of items dropped by the crawl</dd>"
"<dt><code>invalid_json_count</code></dt>"
"<dd>The number of invalid JSON items dropped by the crawl</dd>"
"<dt><code>process_id</code></dt>"
"<dd>The ID of the base collection in Kingfisher Process</dd>"
"<dt><code>process_id_pelican</code></dt>"
16 changes: 16 additions & 0 deletions data_registry/process_manager/task/collect.py
@@ -5,6 +5,7 @@

 import requests
 from django.conf import settings
+from scrapyloganalyzer import ScrapyLogFile

 from data_registry.exceptions import ConfigurationError, RecoverableError, UnexpectedError
 from data_registry.models import Task
@@ -105,6 +106,21 @@ def get_status(self):
if "process_id" not in self.job.context or "data_version" not in self.job.context:
raise UnexpectedError("Unable to retrieve collection ID and data version from Scrapy log")

scrapy_log = ScrapyLogFile(scrapy_log_url)

if not scrapy_log.is_finished():
logger.warning("%s: crawl finish reason: %s", self, scrapy_log.logparser["finish_reason"])
if scrapy_log.error_rate:
logger.warning("%s: crawl error rate: %s", self, scrapy_log.error_rate)
for key in ("item_dropped_count", "invalid_json_count"):
if value := scrapy_log.logparser["crawler_stats"].get(key):
logger.warning("%s: crawl %s: %s", self, key, value)
self.job.context[key] = value
self.job.save(update_fields=["modified", "context"])
for key in scrapy_log.logparser["log_categories"]:
if scrapy_log.logparser["log_categories"][key]["count"]:
logger.warning("%s: %s", self, scrapy_log.logparser["log_categories"][key]["details"])

return Task.Status.COMPLETED

raise RecoverableError(f"Unable to find status of Scrapyd job {scrapyd_job_id}")
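The block added above is the substance of the change: once a Scrapyd job finishes, the task parses the crawl log and emits a warning for each anomaly. A minimal standalone sketch of the same checks, using only the scrapyloganalyzer API exercised in this diff (the log path is hypothetical; the task itself passes the Scrapyd log URL it already tracks):

    from scrapyloganalyzer import ScrapyLogFile

    # Hypothetical local log path; the task above passes scrapy_log_url instead.
    scrapy_log = ScrapyLogFile("scrapyd/logs/data_registry/example_spider/0123.log")

    # A crawl whose finish_reason is not the normal one is worth flagging.
    if not scrapy_log.is_finished():
        print("finish reason:", scrapy_log.logparser["finish_reason"])

    # Error rate of the crawl, as computed by scrapyloganalyzer.
    if scrapy_log.error_rate:
        print("error rate:", scrapy_log.error_rate)

    # Crawler stats that the task now copies onto Job.context.
    for key in ("item_dropped_count", "invalid_json_count"):
        if value := scrapy_log.logparser["crawler_stats"].get(key):
            print(key, value)

    # Each log category with a nonzero count carries a details string to surface.
    for key, category in scrapy_log.logparser["log_categories"].items():
        if category["count"]:
            print(key, category["details"])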
1 change: 1 addition & 0 deletions requirements.in
@@ -6,5 +6,6 @@ gunicorn[setproctitle]
 markdown-it-py
 psycopg2
 requests
+scrapyloganalyzer
 sentry-sdk
 yapw[perf]
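
The pinned files below carry pip-compile-style "via" annotations, so after adding scrapyloganalyzer to requirements.in the lock files were presumably regenerated with pip-tools along these lines (an assumption about the project's workflow, not shown in this commit):

    pip-compile requirements.in        # regenerates requirements.txt
    pip-compile requirements_dev.in    # regenerates requirements_dev.txt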
16 changes: 13 additions & 3 deletions requirements.txt
@@ -21,12 +21,14 @@ django-modeltranslation==0.18.10
     # via -r requirements.in
 flatterer==0.20.1
     # via -r requirements.in
-gunicorn[setproctitle]==22.0.0
+gunicorn==22.0.0
     # via -r requirements.in
 idna==3.7
     # via requests
 ijson==3.1.4
     # via flatterer
+logparser==0.8.3
+    # via scrapyloganalyzer
 markdown-it-py==2.2.0
     # via -r requirements.in
 mdurl==0.1.2
@@ -41,22 +43,30 @@ packaging==24.0
     # via gunicorn
 pandas==1.5.0
     # via flatterer
+pexpect==4.9.0
+    # via logparser
 pika==1.3.2
     # via yapw
 psycopg2==2.9.6
     # via -r requirements.in
+ptyprocess==0.7.0
+    # via pexpect
 python-dateutil==2.8.2
     # via pandas
 pytz==2021.1
     # via pandas
 requests==2.32.3
     # via -r requirements.in
+scrapyloganalyzer==0.0.1
+    # via -r requirements.in
 sentry-sdk==2.8.0
     # via -r requirements.in
 setproctitle==1.2.2
     # via gunicorn
 six==1.16.0
-    # via python-dateutil
+    # via
+    #   logparser
+    #   python-dateutil
 sqlparse==0.5.0
     # via django
 typing-extensions==4.7.1
@@ -67,5 +77,5 @@ urllib3==2.2.2
     # via
     #   requests
     #   sentry-sdk
-yapw[perf]==0.1.4
+yapw==0.1.4
     # via -r requirements.in
24 changes: 18 additions & 6 deletions requirements_dev.txt
@@ -30,7 +30,7 @@ django-modeltranslation==0.18.10
     # via -r requirements.txt
 flatterer==0.20.1
     # via -r requirements.txt
-gunicorn[setproctitle]==22.0.0
+gunicorn==22.0.0
     # via -r requirements.txt
 idna==3.7
     # via
@@ -40,6 +40,10 @@ ijson==3.1.4
     # via
     #   -r requirements.txt
     #   flatterer
+logparser==0.8.3
+    # via
+    #   -r requirements.txt
+    #   scrapyloganalyzer
 markdown-it-py==2.2.0
     # via -r requirements.txt
 mdurl==0.1.2
@@ -54,7 +58,6 @@ orjson==3.9.15
     # via
     #   -r requirements.txt
     #   flatterer
-    #   yapw
 packaging==24.0
     # via
     #   -r requirements.txt
@@ -63,6 +66,10 @@ pandas==1.5.0
     # via
     #   -r requirements.txt
     #   flatterer
+pexpect==4.9.0
+    # via
+    #   -r requirements.txt
+    #   logparser
 pika==1.3.2
     # via
     #   -r requirements.txt
@@ -71,6 +78,10 @@ psycopg2==2.9.6
     # via -r requirements.txt
 psycopg2-binary==2.9.2
     # via -r requirements_dev.in
+ptyprocess==0.7.0
+    # via
+    #   -r requirements.txt
+    #   pexpect
 python-dateutil==2.8.2
     # via
     #   -r requirements.txt
@@ -81,15 +92,16 @@ pytz==2021.1
     #   pandas
 requests==2.32.3
     # via -r requirements.txt
+scrapyloganalyzer==0.0.1
+    # via -r requirements.txt
 sentry-sdk==2.8.0
     # via -r requirements.txt
 setproctitle==1.2.2
-    # via
-    #   -r requirements.txt
-    #   gunicorn
+    # via -r requirements.txt
 six==1.16.0
     # via
     #   -r requirements.txt
+    #   logparser
     #   python-dateutil
 sqlparse==0.5.0
     # via
@@ -105,5 +117,5 @@ urllib3==2.2.2
     #   -r requirements.txt
     #   requests
     #   sentry-sdk
-yapw[perf]==0.1.4
+yapw==0.1.4
     # via -r requirements.txt
