Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Archive EPA PCAP data #544

Merged
merged 11 commits into from
Jan 28, 2025
74 changes: 74 additions & 0 deletions src/pudl_archiver/archivers/epapcap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Download EPA PCAP data."""

import re
from pathlib import Path

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout

# Common root shared by every EPA Inflation Reduction Act page used below.
_IRA_ROOT = "https://www.epa.gov/inflation-reduction-act"

# Directory landing page; it is scanned for the "priority*.xlsx" workbook links.
BASE_URL = f"{_IRA_ROOT}/priority-climate-action-plan-directory"

# Searchable-table pages; each one is scanned for links to PDF documents.
DATA_TABLE_URLS = [
    f"{_IRA_ROOT}/{slug}"
    for slug in (
        "ghg-inventory-searchable-table",
        "ghg-reduction-measures-searchable-table",
        "co-pollutant-benefits-searchable-table",
        "lidac-benefits-searchable-table",
    )
]


class EpaPcapArchiver(AbstractDatasetArchiver):
    """EPA PCAP archiver.

    Collects the Excel workbooks linked from ``BASE_URL`` and every PDF linked
    from the pages in ``DATA_TABLE_URLS`` into a single ``epapcap.zip`` file.
    """

    name = "epapcap"

    async def get_resources(self) -> ArchiveAwaitable:
        """Yield the single awaitable that builds the EPA PCAP zip archive."""
        yield self.get_resource()

    async def get_resource(self) -> ResourceInfo:
        """Download all EPA PCAP files into one zip archive.

        Returns:
            A ``ResourceInfo`` pointing at ``epapcap.zip`` in the download
            directory, with empty partitions and a ``ZipLayout`` listing the
            file names that were added to the archive.
        """
        zip_path = self.download_directory / "epapcap.zip"
        # File names already written to the zip; shared with download_helper so
        # that documents linked from several pages are only archived once.
        data_paths_in_archive = set()

        # Download the three Excel files first
        excel_pattern = re.compile(r"priority.*\.xlsx")
        for link in await self.get_hyperlinks(BASE_URL, excel_pattern):
            await self.download_helper(link, zip_path, data_paths_in_archive)

        # Download all PDFs from each searchable table
        pdf_pattern = re.compile(r".*\.pdf")
        # The second and third searchable tables links are relative
        # to the TLD, so we convert them to absolute links
        prefix = "https://www.epa.gov"
        for data_table_url in DATA_TABLE_URLS:
            for link in await self.get_hyperlinks(data_table_url, pdf_pattern):
                if not link.startswith("http"):
                    link = prefix + link
                await self.download_helper(link, zip_path, data_paths_in_archive)

        return ResourceInfo(
            local_path=zip_path,
            partitions={},
            layout=ZipLayout(file_paths=data_paths_in_archive),
        )

    async def download_helper(
        self, link: str, zip_path: Path, data_paths_in_archive: set[str]
    ) -> None:
        """Download one file and add it to the zip archive, skipping duplicates.

        Args:
            link: URL of the file to download; its final path component is used
                as the file name inside the archive.
            zip_path: Path of the zip archive the file is added to.
            data_paths_in_archive: Names already present in the archive. The
                new file name is added to this set in place after archiving.
        """
        filename = Path(link).name
        # Do nothing if we're going to end up duplicating a file
        # Many of the PDFs are shared between the multiple searchable tables
        if filename in data_paths_in_archive:
            return
        download_path = self.download_directory / filename
        await self.download_file(link, download_path)
        # Close the handle before unlinking so it is not leaked.
        # NOTE(review): assumes add_to_archive consumes the blob synchronously
        # (it is called without await) -- confirm against its definition.
        with download_path.open("rb") as blob:
            self.add_to_archive(
                zip_path=zip_path,
                filename=filename,
                blob=blob,
            )
        data_paths_in_archive.add(filename)
        # The temporary download is no longer needed once it is in the zip.
        download_path.unlink()
18 changes: 18 additions & 0 deletions src/pudl_archiver/metadata/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,4 +398,22 @@
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
"epapcap": {
"title": "EPA -- Priority Climate Action Plan",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the past we've put the abbreviated name before the double-dash and then expanded the name after it. Are the abbreviations intentionally being left out? They appear to be present in some of the new dataset titles, but missing in others.

Suggested change
"title": "EPA -- Priority Climate Action Plan",
"title": "EPA PCAP -- Priority Climate Action Plan",

"path": "https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory",
"description": (
"EPA’s Priority Climate Action Plan (PCAP) Directory organizes data collected from 211 "
"PCAPs submitted by states, Metropolitan Statistical Areas (MSAs), Tribes, and territories "
"under EPA’s Climate Pollution Reduction Grants (CPRG) program. PCAPs are a compilation "
"of each jurisdiction’s identified priority actions (or measures) to reduce greenhouse "
"gas (GHG) emissions. The directory presents information from more than 30 data categories "
"related to GHG inventories, GHG reduction measures, benefits for low-income and "
"disadvantaged communities (LIDACs), and other PCAP elements."
),
"working_partitions": {},
"keywords": sorted({"emissions", "ghg", "epa", "pcap", "cprg", "emissions"}),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove dupe of "emissions"

Suggested change
"keywords": sorted({"emissions", "ghg", "epa", "pcap", "cprg", "emissions"}),
"keywords": sorted({"emissions", "ghg", "epa", "pcap", "cprg"}),

"license_raw": LICENSES["us-govt"],
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
}
Loading