Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Archive EPA PCAP data #544

Merged
merged 11 commits into from
Jan 28, 2025
4 changes: 2 additions & 2 deletions .github/workflows/run-archiver.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
inputs:
datasets:
description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
default: '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
default: '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
required: true
type: string
create_github_issue:
Expand All @@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
# Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
fail-fast: false
runs-on: ubuntu-latest
permissions:
Expand Down
1 change: 1 addition & 0 deletions src/pudl_archiver/archivers/epa/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Module implements archivers for all EPA forms integrated into PUDL."""
74 changes: 74 additions & 0 deletions src/pudl_archiver/archivers/epa/epapcap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Download EPA PCAP data."""

import re
from pathlib import Path

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout

# Landing page of the EPA Priority Climate Action Plan (PCAP) directory.
# The archiver scans this page for links to the Excel (.xlsx) files.
BASE_URL = (
"https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory"
)
# Searchable-table pages; the archiver scans each one for links to PDF files.
DATA_TABLE_URLS = [
"https://www.epa.gov/inflation-reduction-act/ghg-inventory-searchable-table",
"https://www.epa.gov/inflation-reduction-act/ghg-reduction-measures-searchable-table",
"https://www.epa.gov/inflation-reduction-act/co-pollutant-benefits-searchable-table",
"https://www.epa.gov/inflation-reduction-act/lidac-benefits-searchable-table",
]


class EpaPcapArchiver(AbstractDatasetArchiver):
    """EPA PCAP archiver.

    Collects the Excel files linked from the PCAP directory page and the PDFs
    linked from each searchable-table page, and bundles them all into a single
    ``epapcap.zip`` resource.
    """

    name = "epapcap"

    async def get_resources(self) -> ArchiveAwaitable:
        """Yield the single awaitable that builds the EPA PCAP zip archive."""
        yield self.get_resource()

    async def get_resource(self) -> ResourceInfo:
        """Download every EPA PCAP file into one zip archive.

        Returns:
            A ``ResourceInfo`` pointing at ``epapcap.zip`` in the download
            directory, with no partitions and a ``ZipLayout`` listing every
            file name that was added to the archive.
        """
        zip_path = self.download_directory / "epapcap.zip"
        # Names of files already in the zip; shared with download_helper so
        # that duplicate links are only downloaded once.
        data_paths_in_archive = set()

        # Download the Excel files linked from the directory page first.
        excel_pattern = re.compile(r"priority.*\.xlsx")
        for link in await self.get_hyperlinks(BASE_URL, excel_pattern):
            await self.download_helper(link, zip_path, data_paths_in_archive)

        # Download all PDFs from each searchable table.
        pdf_pattern = re.compile(r".*\.pdf")
        # Some searchable tables use links that are relative to the site
        # root, so those are converted to absolute links before downloading.
        prefix = "https://www.epa.gov"
        for data_table_url in DATA_TABLE_URLS:
            for link in await self.get_hyperlinks(data_table_url, pdf_pattern):
                if not link.startswith("http"):
                    link = prefix + link
                await self.download_helper(link, zip_path, data_paths_in_archive)

        return ResourceInfo(
            local_path=zip_path,
            partitions={},
            layout=ZipLayout(file_paths=data_paths_in_archive),
        )

    async def download_helper(
        self, link: str, zip_path: Path, data_paths_in_archive: set
    ) -> None:
        """Download one file and add it to the zip archive.

        Args:
            link: Absolute URL of the file to download.
            zip_path: Path of the zip archive the file is added to.
            data_paths_in_archive: Names of the files already in the archive.
                It is updated in place when a new file is added.
        """
        filename = Path(link).name
        # Do nothing if we're going to end up duplicating a file.
        # Many of the PDFs are shared between the multiple searchable tables.
        if filename in data_paths_in_archive:
            return
        download_path = self.download_directory / filename
        await self.download_file(link, download_path)
        # Close the handle before unlinking so no file descriptor is leaked
        # and the temporary file is not deleted while still open.
        with download_path.open("rb") as blob:
            self.add_to_archive(
                zip_path=zip_path,
                filename=filename,
                blob=blob,
            )
        data_paths_in_archive.add(filename)
        download_path.unlink()
18 changes: 18 additions & 0 deletions src/pudl_archiver/metadata/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,4 +398,22 @@
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
"epapcap": {
"title": "EPA PCAP -- Priority Climate Action Plan",
"path": "https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory",
"description": (
"EPA’s Priority Climate Action Plan (PCAP) Directory organizes data collected from 211 "
"PCAPs submitted by states, Metropolitan Statistical Areas (MSAs), Tribes, and territories "
"under EPA’s Climate Pollution Reduction Grants (CPRG) program. PCAPs are a compilation "
"of each jurisdiction’s identified priority actions (or measures) to reduce greenhouse "
"gas (GHG) emissions. The directory presents information from more than 30 data categories "
"related to GHG inventories, GHG reduction measures, benefits for low-income and "
"disadvantaged communities (LIDACs), and other PCAP elements."
),
"working_partitions": {},
"keywords": sorted({"emissions", "ghg", "epa", "pcap", "cprg"}),
"license_raw": LICENSES["us-govt"],
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
}
3 changes: 3 additions & 0 deletions src/pudl_archiver/package_data/zenodo_doi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ epacamd_eia:
epacems:
production_doi: 10.5281/zenodo.10233185
sandbox_doi: 10.5072/zenodo.12943
epapcap:
production_doi: 10.5281/zenodo.14757598
#sandbox_doi: # Update!!
ferc1:
production_doi: 10.5281/zenodo.4127043
sandbox_doi: 10.5072/zenodo.3267
Expand Down