From 762367c653453bdba2e281a5bc524e224ee5ca4d Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Mon, 4 Nov 2024 17:23:23 -0300 Subject: [PATCH 1/2] fix: use new endpoints for mexico_nuevo_leon_* --- .../spiders/mexico_nuevo_leon_records.py | 23 +++++++++++------- .../spiders/mexico_nuevo_leon_releases.py | 24 ++++++++++++------- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/kingfisher_scrapy/spiders/mexico_nuevo_leon_records.py b/kingfisher_scrapy/spiders/mexico_nuevo_leon_records.py index 33ed9306b..c29ce5e54 100644 --- a/kingfisher_scrapy/spiders/mexico_nuevo_leon_records.py +++ b/kingfisher_scrapy/spiders/mexico_nuevo_leon_records.py @@ -1,23 +1,28 @@ -import scrapy +from kingfisher_scrapy.base_spiders import PeriodicSpider +from kingfisher_scrapy.util import browser_user_agent, parameters -from kingfisher_scrapy.base_spiders import SimpleSpider - -class MexicoNuevoLeonRecords(SimpleSpider): +class MexicoNuevoLeonRecords(PeriodicSpider): """ Domain Secretaría de Movilidad y Planeación Urbana de Nuevo León + Spider arguments + from_date + Download only data from this year onward (YYYY format). Defaults to '2013'. + until_date + Download only data until this year (YYYY format). Defaults to the current year. Bulk download documentation https://smpu.nl.gob.mx/transparencia/publicaciones """ name = 'mexico_nuevo_leon_records' + user_agent = browser_user_agent # to avoid HTTP 403 # SimpleSpider data_type = 'record_package' - def start_requests(self): - yield scrapy.Request( - 'https://smpu.nl.gob.mx/siasi_ws/api/ocds/DescargarRecordPackage', - meta={'file_name': 'records.json'} - ) + # PeriodicSpider + date_format = 'year' + pattern = 'https://smpu.nl.gob.mx/siasi_ws/api/ocds/ListarProduccionXAnio?anio=%5B%7B"value":"{0}"%7D%5D' + formatter = staticmethod(parameters('anio')) + default_from_date = '2013' diff --git a/kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py b/kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py index 60418ad25..f15d686fb 100644 --- a/kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py +++ b/kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py @@ -1,12 +1,16 @@ -import scrapy +from kingfisher_scrapy.spiders.mexico_nuevo_leon_records import MexicoNuevoLeonRecords +from kingfisher_scrapy.util import components -from kingfisher_scrapy.base_spiders import CompressedFileSpider - -class MexicoNuevoLeonReleases(CompressedFileSpider): +class MexicoNuevoLeonReleases(MexicoNuevoLeonRecords): """ Domain Secretaría de Movilidad y Planeación Urbana de Nuevo León + Spider arguments + from_date + Download only data from this year onward (YYYY format). Defaults to '2013'. + until_date + Download only data until this year (YYYY format). Defaults to the current year. Bulk download documentation https://smpu.nl.gob.mx/transparencia/acerca-del-proyecto """ @@ -19,9 +23,11 @@ class MexicoNuevoLeonReleases(CompressedFileSpider): # SimpleSpider data_type = 'release_package' - # CompressedFileSpider - file_name_must_contain = 'ReleasePackage' + # PeriodicSpider + start_requests_callback = 'parse_list' - def start_requests(self): - url = 'https://smpu.nl.gob.mx/acceso/DatosAbiertos/JSONsInfraestructuraAbierta.rar' - yield scrapy.Request(url, meta={'file_name': 'all.rar'}) + def parse_list(self, response): + for record_package in response.json(): + for record in record_package['records']: + for release in record['releases']: + yield self.build_request(release['url'], formatter=components(-1)) From 760e8a00978ac6459f430003f1b3ea98c70f88e3 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Tue, 5 Nov 2024 14:42:23 -0300 Subject: [PATCH 2/2] feat: add mexico_nuevo_leon_base --- .../spiders/mexico_nuevo_leon_base.py | 14 ++++++++++++++ .../spiders/mexico_nuevo_leon_records.py | 12 ++---------- .../spiders/mexico_nuevo_leon_releases.py | 7 ++++--- 3 files changed, 20 insertions(+), 13 deletions(-) create mode 100644 kingfisher_scrapy/spiders/mexico_nuevo_leon_base.py diff --git a/kingfisher_scrapy/spiders/mexico_nuevo_leon_base.py b/kingfisher_scrapy/spiders/mexico_nuevo_leon_base.py new file mode 100644 index 000000000..72262c339 --- /dev/null +++ b/kingfisher_scrapy/spiders/mexico_nuevo_leon_base.py @@ -0,0 +1,14 @@ +from kingfisher_scrapy.base_spiders import PeriodicSpider +from kingfisher_scrapy.util import browser_user_agent, parameters + + +class MexicoNuevoLeonBase(PeriodicSpider): + user_agent = browser_user_agent # to avoid HTTP 403 + + # BaseSpider + date_format = 'year' + default_from_date = '2013' + + # PeriodicSpider + pattern = 'https://smpu.nl.gob.mx/siasi_ws/api/ocds/ListarProduccionXAnio?anio=%5B%7B"value":"{0}"%7D%5D' + formatter = staticmethod(parameters('anio')) diff --git a/kingfisher_scrapy/spiders/mexico_nuevo_leon_records.py b/kingfisher_scrapy/spiders/mexico_nuevo_leon_records.py index c29ce5e54..247e07f56 100644 --- a/kingfisher_scrapy/spiders/mexico_nuevo_leon_records.py +++ b/kingfisher_scrapy/spiders/mexico_nuevo_leon_records.py @@ -1,8 +1,7 @@ -from kingfisher_scrapy.base_spiders import PeriodicSpider -from kingfisher_scrapy.util import browser_user_agent, parameters +from kingfisher_scrapy.spiders.mexico_nuevo_leon_base import MexicoNuevoLeonBase -class MexicoNuevoLeonRecords(PeriodicSpider): +class MexicoNuevoLeonRecords(MexicoNuevoLeonBase): """ Domain Secretaría de Movilidad y Planeación Urbana de Nuevo León @@ -16,13 +15,6 @@ class MexicoNuevoLeonRecords(PeriodicSpider): """ name = 'mexico_nuevo_leon_records' - user_agent = browser_user_agent # to avoid HTTP 403 # SimpleSpider data_type = 'record_package' - - # PeriodicSpider - date_format = 'year' - pattern = 'https://smpu.nl.gob.mx/siasi_ws/api/ocds/ListarProduccionXAnio?anio=%5B%7B"value":"{0}"%7D%5D' - formatter = staticmethod(parameters('anio')) - default_from_date = '2013' diff --git a/kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py b/kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py index f15d686fb..27086f340 100644 --- a/kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py +++ b/kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py @@ -1,8 +1,8 @@ -from kingfisher_scrapy.spiders.mexico_nuevo_leon_records import MexicoNuevoLeonRecords -from kingfisher_scrapy.util import components +from kingfisher_scrapy.spiders.mexico_nuevo_leon_base import MexicoNuevoLeonBase +from kingfisher_scrapy.util import components, handle_http_error -class MexicoNuevoLeonReleases(MexicoNuevoLeonRecords): +class MexicoNuevoLeonReleases(MexicoNuevoLeonBase): """ Domain Secretaría de Movilidad y Planeación Urbana de Nuevo León @@ -26,6 +26,7 @@ class MexicoNuevoLeonReleases(MexicoNuevoLeonRecords): # PeriodicSpider start_requests_callback = 'parse_list' + @handle_http_error def parse_list(self, response): for record_package in response.json(): for record in record_package['records']: