-
-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Archive EPA PCAP data #544
Changes from 3 commits
f769887
aeda8b2
71ad55c
4822ea3
a8e1d70
535f8ab
1ec4b24
429f525
ea01aa8
4d92918
8a4d117
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
"""Download EPA PCAP data.""" | ||
|
||
import re | ||
from pathlib import Path | ||
|
||
from pudl_archiver.archivers.classes import ( | ||
AbstractDatasetArchiver, | ||
ArchiveAwaitable, | ||
ResourceInfo, | ||
) | ||
from pudl_archiver.frictionless import ZipLayout | ||
|
||
# Landing page of the EPA Priority Climate Action Plan (PCAP) directory.
# The archiver scans this page for hyperlinks matching ``priority.*\.xlsx``.
BASE_URL = (
    "https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory"
)
# Searchable-table pages; the archiver scans each one for hyperlinks to PDFs.
# The same PDF may be linked from more than one of these pages.
DATA_TABLE_URLS = [
    "https://www.epa.gov/inflation-reduction-act/ghg-inventory-searchable-table",
    "https://www.epa.gov/inflation-reduction-act/ghg-reduction-measures-searchable-table",
    "https://www.epa.gov/inflation-reduction-act/co-pollutant-benefits-searchable-table",
    "https://www.epa.gov/inflation-reduction-act/lidac-benefits-searchable-table",
]
|
||
|
||
class EpaPcapArchiver(AbstractDatasetArchiver):
    """EPA PCAP archiver.

    Collects the Excel workbooks linked from the PCAP directory page and the
    PDFs linked from each searchable-table page into one ``epapcap.zip``.
    """

    name = "epapcap"

    async def get_resources(self) -> ArchiveAwaitable:
        """Yield the single awaitable that builds the EPA PCAP zip archive."""
        yield self.get_resource()

    async def get_resource(self) -> ResourceInfo:
        """Download all EPA PCAP files into one zip archive.

        Returns:
            A ``ResourceInfo`` pointing at ``epapcap.zip`` in the download
            directory, with no partitions and a ``ZipLayout`` listing every
            file name that was added to the archive.
        """
        zip_path = self.download_directory / "epapcap.zip"
        # Names already written to the zip; shared with download_helper so a
        # file linked from several pages is only downloaded once.
        data_paths_in_archive = set()

        # Download the three Excel files first
        excel_pattern = re.compile(r"priority.*\.xlsx")
        for link in await self.get_hyperlinks(BASE_URL, excel_pattern):
            await self.download_helper(link, zip_path, data_paths_in_archive)

        # Download all PDFs from each searchable table
        pdf_pattern = re.compile(r".*\.pdf")
        # The second and third searchable tables links are relative
        # to the TLD, so we convert them to absolute links
        prefix = "https://www.epa.gov"
        for data_table_url in DATA_TABLE_URLS:
            for link in await self.get_hyperlinks(data_table_url, pdf_pattern):
                if not link.startswith("http"):
                    link = prefix + link
                await self.download_helper(link, zip_path, data_paths_in_archive)

        return ResourceInfo(
            local_path=zip_path,
            partitions={},
            # Keyword was previously misspelled as ``laybout``, so the layout
            # was never passed under the intended field name.
            layout=ZipLayout(file_paths=data_paths_in_archive),
        )

    async def download_helper(self, link, zip_path, data_paths_in_archive):
        """Download file and add to archive.

        Args:
            link: Absolute URL of the file to download.
            zip_path: Path of the zip archive the file is added to.
            data_paths_in_archive: Set of file names already in the archive;
                updated in place when a new file is added.
        """
        filename = Path(link).name
        # Do nothing if we're going to end up duplicating a file
        # Many of the PDFs are shared between the multiple searchable tables
        if filename in data_paths_in_archive:
            return
        download_path = self.download_directory / filename
        await self.download_file(link, download_path)
        # Close the handle before unlinking so no file descriptor is leaked
        # and the temporary file is not deleted while still open.
        with download_path.open("rb") as blob:
            self.add_to_archive(
                zip_path=zip_path,
                filename=filename,
                blob=blob,
            )
        data_paths_in_archive.add(filename)
        download_path.unlink()
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -398,4 +398,22 @@ | |||||
"license_pudl": LICENSES["cc-by-4.0"], | ||||||
"contributors": [CONTRIBUTORS["catalyst-cooperative"]], | ||||||
}, | ||||||
"epapcap": { | ||||||
"title": "EPA -- Priority Climate Action Plan", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In the past we've put the abbreviated name before the double-dash and then expanded the name after it. Are the abbreviations intentionally being left out? They appear to be present in some of the new dataset titles, but missing in others.
Suggested change
|
||||||
"path": "https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory", | ||||||
"description": ( | ||||||
"EPA’s Priority Climate Action Plan (PCAP) Directory organizes data collected from 211 " | ||||||
"PCAPs submitted by states, Metropolitan Statistical Areas (MSAs), Tribes, and territories " | ||||||
"under EPA’s Climate Pollution Reduction Grants (CPRG) program. PCAPs are a compilation " | ||||||
"of each jurisdiction’s identified priority actions (or measures) to reduce greenhouse " | ||||||
"gas (GHG) emissions. The directory presents information from more than 30 data categories " | ||||||
"related to GHG inventories, GHG reduction measures, benefits for low-income and " | ||||||
"disadvantaged communities (LIDACs), and other PCAP elements." | ||||||
), | ||||||
"working_partitions": {}, | ||||||
"keywords": sorted({"emissions", "ghg", "epa", "pcap", "cprg", "emissions"}), | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove the duplicate "emissions" keyword.
Suggested change
|
||||||
"license_raw": LICENSES["us-govt"], | ||||||
"license_pudl": LICENSES["cc-by-4.0"], | ||||||
"contributors": [CONTRIBUTORS["catalyst-cooperative"]], | ||||||
}, | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wow, great that this was relatively straightforward. I was worried it would be a huge pain.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All @nilaykumar's fine handiwork! 🚀