From f7698873c59ce9cce67e77fd8939146b9efbf57c Mon Sep 17 00:00:00 2001 From: Nilay Kumar Date: Fri, 24 Jan 2025 02:02:42 -0500 Subject: [PATCH 1/6] Added EPA PCAP metadata and archiver --- src/pudl_archiver/archivers/epapcap.py | 74 ++++++++++++++++++++++++++ src/pudl_archiver/metadata/sources.py | 18 +++++++ 2 files changed, 92 insertions(+) create mode 100644 src/pudl_archiver/archivers/epapcap.py diff --git a/src/pudl_archiver/archivers/epapcap.py b/src/pudl_archiver/archivers/epapcap.py new file mode 100644 index 00000000..b0f5302b --- /dev/null +++ b/src/pudl_archiver/archivers/epapcap.py @@ -0,0 +1,74 @@ +"""Download EPA PCAP data.""" + +import re +from pathlib import Path + +from pudl_archiver.archivers.classes import ( + AbstractDatasetArchiver, + ArchiveAwaitable, + ResourceInfo, +) +from pudl_archiver.frictionless import ZipLayout + +BASE_URL = ( + "https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory" +) +DATA_TABLE_URLS = [ + "https://www.epa.gov/inflation-reduction-act/ghg-inventory-searchable-table", + "https://www.epa.gov/inflation-reduction-act/ghg-reduction-measures-searchable-table", + "https://www.epa.gov/inflation-reduction-act/co-pollutant-benefits-searchable-table", + "https://www.epa.gov/inflation-reduction-act/lidac-benefits-searchable-table", +] + + +class EpaPcapArchiver(AbstractDatasetArchiver): + """EPA PCAP archiver.""" + + name = "epapcap" + + async def get_resources(self) -> ArchiveAwaitable: + """Download EPA PCAP resources.""" + yield self.get_resource() + + async def get_resource(self) -> ResourceInfo: + """Download EPA PCAP resources.""" + zip_path = self.download_directory / "epapcap.zip" + data_paths_in_archive = set() + # Download the three Excel files first + excel_pattern = re.compile(r"priority.*\.xlsx") + for link in await self.get_hyperlinks(BASE_URL, excel_pattern): + await self.download_helper(link, zip_path, data_paths_in_archive) + + # Download all PDFs from each searchable table + pdf_pattern = re.compile(r".*\.pdf") + for data_table_url in DATA_TABLE_URLS: + for link in await self.get_hyperlinks(data_table_url, pdf_pattern): + # The second and third searchable tables links are relative + # to the TLD, so we convert them to absolute links + prefix = "https://www.epa.gov" + if not link.startswith("http"): + link = prefix + link + await self.download_helper(link, zip_path, data_paths_in_archive) + + return ResourceInfo( + local_path=zip_path, + partitions={}, + laybout=ZipLayout(file_paths=data_paths_in_archive), + ) + + async def download_helper(self, link, zip_path, data_paths_in_archive): + """Download file and add to archive.""" + filename = Path(link).name + # Do nothing if we're going to end up duplicating a file + # Many of the PDFs are shared between the multiple searchable tables + if filename in data_paths_in_archive: + return + download_path = self.download_directory / filename + await self.download_file(link, download_path) + self.add_to_archive( + zip_path=zip_path, + filename=filename, + blob=download_path.open("rb"), + ) + data_paths_in_archive.add(filename) + download_path.unlink() diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index 731c355b..0135fe35 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -398,4 +398,22 @@ "license_pudl": LICENSES["cc-by-4.0"], "contributors": [CONTRIBUTORS["catalyst-cooperative"]], }, + "epapcap": { + "title": "EPA -- Priority Climate Action Plan", + "path": "https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory", + "description": ( + "EPA’s Priority Climate Action Plan (PCAP) Directory organizes data collected from 211 " + "PCAPs submitted by states, Metropolitan Statistical Areas (MSAs), Tribes, and territories " + "under EPA’s Climate Pollution Reduction Grants (CPRG) program. PCAPs are a compilation " + "of each jurisdiction’s identified priority actions (or measures) to reduce greenhouse " + "gas (GHG) emissions. The directory presents information from more than 30 data categories " + "related to GHG inventories, GHG reduction measures, benefits for low-income and " + "disadvantaged communities (LIDACs), and other PCAP elements." + ), + "working_partitions": {}, + "keywords": sorted({}), + "license_raw": LICENSES["us-govt"], + "license_pudl": LICENSES["cc-by-4.0"], + "contributors": [CONTRIBUTORS["catalyst-cooperative"]], + }, } From 71ad55c1043cc1e06cf28c84d06e26cc7e01dcb7 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Fri, 24 Jan 2025 12:26:07 -0500 Subject: [PATCH 2/6] Add keywords --- src/pudl_archiver/metadata/sources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index 0135fe35..4bd34ceb 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -411,7 +411,7 @@ "disadvantaged communities (LIDACs), and other PCAP elements." ), "working_partitions": {}, - "keywords": sorted({}), + "keywords": sorted({"emissions", "ghg", "epa", "pcap", "cprg", "emissions"}), "license_raw": LICENSES["us-govt"], "license_pudl": LICENSES["cc-by-4.0"], "contributors": [CONTRIBUTORS["catalyst-cooperative"]], From 4822ea34a774068cb6e760e09a82bcc1a6a0c29a Mon Sep 17 00:00:00 2001 From: e-belfer Date: Fri, 24 Jan 2025 17:08:36 -0500 Subject: [PATCH 3/6] Fix PCAP metadata and make an EPA folder for EPA archivers --- src/pudl_archiver/archivers/{ => epa}/epacamd_eia.py | 0 src/pudl_archiver/archivers/{ => epa}/epacems.py | 0 src/pudl_archiver/archivers/{ => epa}/epapcap.py | 0 src/pudl_archiver/metadata/sources.py | 4 ++-- 4 files changed, 2 insertions(+), 2 deletions(-) rename src/pudl_archiver/archivers/{ => epa}/epacamd_eia.py (100%) rename src/pudl_archiver/archivers/{ => epa}/epacems.py (100%) rename src/pudl_archiver/archivers/{ => epa}/epapcap.py (100%) diff --git a/src/pudl_archiver/archivers/epacamd_eia.py b/src/pudl_archiver/archivers/epa/epacamd_eia.py similarity index 100% rename from src/pudl_archiver/archivers/epacamd_eia.py rename to src/pudl_archiver/archivers/epa/epacamd_eia.py diff --git a/src/pudl_archiver/archivers/epacems.py b/src/pudl_archiver/archivers/epa/epacems.py similarity index 100% rename from src/pudl_archiver/archivers/epacems.py rename to src/pudl_archiver/archivers/epa/epacems.py diff --git a/src/pudl_archiver/archivers/epapcap.py b/src/pudl_archiver/archivers/epa/epapcap.py similarity index 100% rename from src/pudl_archiver/archivers/epapcap.py rename to src/pudl_archiver/archivers/epa/epapcap.py diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index 4bd34ceb..411735df 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -399,7 +399,7 @@ "contributors": [CONTRIBUTORS["catalyst-cooperative"]], }, "epapcap": { - "title": "EPA -- Priority Climate Action Plan", + "title": "EPA PCAP -- Priority Climate Action Plan", "path": "https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory", "description": ( "EPA’s Priority Climate Action Plan (PCAP) Directory organizes data collected from 211 " @@ -411,7 +411,7 @@ "disadvantaged communities (LIDACs), and other PCAP elements." ), "working_partitions": {}, - "keywords": sorted({"emissions", "ghg", "epa", "pcap", "cprg", "emissions"}), + "keywords": sorted({"emissions", "ghg", "epa", "pcap", "cprg"}), "license_raw": LICENSES["us-govt"], "license_pudl": LICENSES["cc-by-4.0"], "contributors": [CONTRIBUTORS["catalyst-cooperative"]], From 535f8ab6ea3db5fc81f4c545639bcbaa002df084 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Fri, 24 Jan 2025 18:06:00 -0500 Subject: [PATCH 4/6] Fix indentation of download helper --- src/pudl_archiver/archivers/epa/__init__.py | 1 + src/pudl_archiver/archivers/epa/epapcap.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 src/pudl_archiver/archivers/epa/__init__.py diff --git a/src/pudl_archiver/archivers/epa/__init__.py b/src/pudl_archiver/archivers/epa/__init__.py new file mode 100644 index 00000000..30db5dbc --- /dev/null +++ b/src/pudl_archiver/archivers/epa/__init__.py @@ -0,0 +1 @@ +"""Module implements archivers for all EPA forms integrated into PUDL.""" diff --git a/src/pudl_archiver/archivers/epa/epapcap.py b/src/pudl_archiver/archivers/epa/epapcap.py index b0f5302b..8f0c9c76 100644 --- a/src/pudl_archiver/archivers/epa/epapcap.py +++ b/src/pudl_archiver/archivers/epa/epapcap.py @@ -48,7 +48,7 @@ async def get_resource(self) -> ResourceInfo: prefix = "https://www.epa.gov" if not link.startswith("http"): link = prefix + link - await self.download_helper(link, zip_path, data_paths_in_archive) + await self.download_helper(link, zip_path, data_paths_in_archive) return ResourceInfo( local_path=zip_path, From 4d9291855605c8c5a5d3efd03735adfa444311c5 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Tue, 28 Jan 2025 16:19:38 -0500 Subject: [PATCH 5/6] Add prod doi and epapcap to GHA --- .github/workflows/run-archiver.yml | 4 ++-- src/pudl_archiver/package_data/zenodo_doi.yaml | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-archiver.yml b/.github/workflows/run-archiver.yml index dd5dfb50..a201046a 100644 --- a/.github/workflows/run-archiver.yml +++ b/.github/workflows/run-archiver.yml @@ -6,7 +6,7 @@ on: inputs: datasets: description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").' - default: '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' + default: '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' required: true type: string create_github_issue: @@ -26,7 +26,7 @@ jobs: strategy: matrix: # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here. - dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }} + dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }} fail-fast: false runs-on: ubuntu-latest permissions: diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml index 4eb77c28..da448b26 100644 --- a/src/pudl_archiver/package_data/zenodo_doi.yaml +++ b/src/pudl_archiver/package_data/zenodo_doi.yaml @@ -46,6 +46,9 @@ epacamd_eia: epacems: production_doi: 10.5281/zenodo.10233185 sandbox_doi: 10.5072/zenodo.12943 +epapcap: + production_doi: 10.5281/zenodo.14757598 + #sandbox_doi: 10.5072/zenodo.12943 # Update!! ferc1: production_doi: 10.5281/zenodo.4127043 sandbox_doi: 10.5072/zenodo.3267 From 8a4d117251ce00b55db28697e59deca6103c7811 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Tue, 28 Jan 2025 16:25:02 -0500 Subject: [PATCH 6/6] Note that sandbox DOI isn't complete --- src/pudl_archiver/package_data/zenodo_doi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml index da448b26..ceece57e 100644 --- a/src/pudl_archiver/package_data/zenodo_doi.yaml +++ b/src/pudl_archiver/package_data/zenodo_doi.yaml @@ -48,7 +48,7 @@ epacems: sandbox_doi: 10.5072/zenodo.12943 epapcap: production_doi: 10.5281/zenodo.14757598 - #sandbox_doi: 10.5072/zenodo.12943 # Update!! + #sandbox_doi: # Update!! ferc1: production_doi: 10.5281/zenodo.4127043 sandbox_doi: 10.5072/zenodo.3267