Skip to content

Commit

Permalink
Merge pull request #544 from catalyst-cooperative/epapcap
Browse files Browse the repository at this point in the history
Archive EPA PCAP data
  • Loading branch information
e-belfer authored Jan 28, 2025
2 parents 7ccf771 + 8a4d117 commit bdbfa3e
Show file tree
Hide file tree
Showing 7 changed files with 98 additions and 2 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/run-archiver.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
inputs:
datasets:
description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
default: '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
default: '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
required: true
type: string
create_github_issue:
Expand All @@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
# Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
fail-fast: false
runs-on: ubuntu-latest
permissions:
Expand Down
1 change: 1 addition & 0 deletions src/pudl_archiver/archivers/epa/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Module implements archivers for all EPA forms integrated into PUDL."""
File renamed without changes.
File renamed without changes.
74 changes: 74 additions & 0 deletions src/pudl_archiver/archivers/epa/epapcap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Download EPA PCAP data."""

import re
from pathlib import Path

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout

# Directory page whose hyperlinks are scanned for ``priority*.xlsx`` files
# by EpaPcapArchiver.get_resource().
BASE_URL = (
"https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory"
)
# Searchable-table pages whose hyperlinks are scanned for ``*.pdf`` files
# by EpaPcapArchiver.get_resource().
DATA_TABLE_URLS = [
"https://www.epa.gov/inflation-reduction-act/ghg-inventory-searchable-table",
"https://www.epa.gov/inflation-reduction-act/ghg-reduction-measures-searchable-table",
"https://www.epa.gov/inflation-reduction-act/co-pollutant-benefits-searchable-table",
"https://www.epa.gov/inflation-reduction-act/lidac-benefits-searchable-table",
]


class EpaPcapArchiver(AbstractDatasetArchiver):
    """EPA PCAP archiver.

    Collects the Excel files linked from the PCAP directory page and the PDFs
    linked from each searchable table page into a single ``epapcap.zip``.
    """

    name = "epapcap"

    async def get_resources(self) -> ArchiveAwaitable:
        """Yield the single awaitable that builds the EPA PCAP zip archive."""
        yield self.get_resource()

    async def get_resource(self) -> ResourceInfo:
        """Download every EPA PCAP file into one zip archive.

        Returns:
            A ``ResourceInfo`` pointing at ``epapcap.zip`` in the download
            directory, with no partitions and a ``ZipLayout`` listing the
            filenames that were added to the archive.
        """
        zip_path = self.download_directory / "epapcap.zip"
        # Filenames already written to the zip; shared with download_helper
        # so that a file linked from several pages is only archived once.
        data_paths_in_archive = set()

        # Download the three Excel files first
        excel_pattern = re.compile(r"priority.*\.xlsx")
        for link in await self.get_hyperlinks(BASE_URL, excel_pattern):
            await self.download_helper(link, zip_path, data_paths_in_archive)

        # Download all PDFs from each searchable table
        pdf_pattern = re.compile(r".*\.pdf")
        # The second and third searchable tables links are relative
        # to the TLD, so we convert them to absolute links
        prefix = "https://www.epa.gov"
        for data_table_url in DATA_TABLE_URLS:
            for link in await self.get_hyperlinks(data_table_url, pdf_pattern):
                if not link.startswith("http"):
                    link = prefix + link
                await self.download_helper(link, zip_path, data_paths_in_archive)

        return ResourceInfo(
            local_path=zip_path,
            partitions={},
            # Keyword was previously misspelled ``laybout``, so the layout
            # never reached the ``layout`` field.
            layout=ZipLayout(file_paths=data_paths_in_archive),
        )

    async def download_helper(
        self, link: str, zip_path: Path, data_paths_in_archive: set[str]
    ) -> None:
        """Download one file and add it to the zip archive.

        Args:
            link: Absolute URL of the file to download. Its final path
                component is used as the filename inside the archive.
            zip_path: Path of the zip archive to add the file to.
            data_paths_in_archive: Filenames already in the archive. Updated
                in place with the new filename after a successful add.
        """
        filename = Path(link).name
        # Do nothing if we're going to end up duplicating a file
        # Many of the PDFs are shared between the multiple searchable tables
        if filename in data_paths_in_archive:
            return
        download_path = self.download_directory / filename
        await self.download_file(link, download_path)
        # Close the handle before unlinking so the descriptor is not leaked
        # (and so the delete also works on platforms that lock open files).
        with download_path.open("rb") as blob:
            self.add_to_archive(
                zip_path=zip_path,
                filename=filename,
                blob=blob,
            )
        data_paths_in_archive.add(filename)
        download_path.unlink()
18 changes: 18 additions & 0 deletions src/pudl_archiver/metadata/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,4 +398,22 @@
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
"epapcap": {
"title": "EPA PCAP -- Priority Climate Action Plan",
"path": "https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory",
"description": (
"EPA’s Priority Climate Action Plan (PCAP) Directory organizes data collected from 211 "
"PCAPs submitted by states, Metropolitan Statistical Areas (MSAs), Tribes, and territories "
"under EPA’s Climate Pollution Reduction Grants (CPRG) program. PCAPs are a compilation "
"of each jurisdiction’s identified priority actions (or measures) to reduce greenhouse "
"gas (GHG) emissions. The directory presents information from more than 30 data categories "
"related to GHG inventories, GHG reduction measures, benefits for low-income and "
"disadvantaged communities (LIDACs), and other PCAP elements."
),
"working_partitions": {},
"keywords": sorted({"emissions", "ghg", "epa", "pcap", "cprg"}),
"license_raw": LICENSES["us-govt"],
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
}
3 changes: 3 additions & 0 deletions src/pudl_archiver/package_data/zenodo_doi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ epacamd_eia:
epacems:
production_doi: 10.5281/zenodo.10233185
sandbox_doi: 10.5072/zenodo.12943
epapcap:
production_doi: 10.5281/zenodo.14757598
#sandbox_doi: # Update!!
ferc1:
production_doi: 10.5281/zenodo.4127043
sandbox_doi: 10.5072/zenodo.3267
Expand Down

0 comments on commit bdbfa3e

Please sign in to comment.