-
-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add archiver for NREL Standard Scenarios #563
base: main
Are you sure you want to change the base?
Changes from all commits
b09c22e
07f4697
7f902d9
24844d4
e08bb8d
49a2974
7e7b211
0709e37
3b9e2d2
d8e3d2f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
"""Download NREL Standard Scenarios data.""" | ||
|
||
import io | ||
import re | ||
from contextlib import nullcontext | ||
from pathlib import Path | ||
|
||
import aiohttp | ||
|
||
from pudl_archiver.archivers.classes import ( | ||
AbstractDatasetArchiver, | ||
ArchiveAwaitable, | ||
ResourceInfo, | ||
) | ||
from pudl_archiver.utils import retry_async | ||
|
||
# The citation field for Standard Scenarios 2021 is blank, but they linked to the | ||
# 2021 report from the description of one of the other available projects, so we're | ||
# able to hard-code it for now: | ||
REPORT_2021 = "https://www.nrel.gov/docs/fy22osti/80641.pdf" | ||
|
||
|
||
async def _download_file_post(
    session: aiohttp.ClientSession, url: str, file: Path | io.BytesIO, **kwargs
):
    """POST to ``url`` and stream the response body into ``file``.

    Args:
        session: open aiohttp session used to make the request.
        url: endpoint to POST to.
        file: destination; a ``Path`` is opened for binary write, while a
            ``BytesIO`` is written to directly (and left open for the caller).
        **kwargs: passed through to ``session.post`` (e.g. ``data=``).
    """
    async with session.post(url, **kwargs) as response:
        # Fail fast on HTTP errors so we don't archive an error page as data.
        response.raise_for_status()
        with file.open("wb") if isinstance(file, Path) else nullcontext(file) as f:
            async for chunk in response.content.iter_chunked(1024):
                f.write(chunk)
|
||
|
||
class NrelStandardScenariosArchiver(AbstractDatasetArchiver):
    """NREL Standard Scenarios archiver.

    Enumerates "Standard Scenarios" projects from the NREL Scenario Viewer
    API and archives, per project year, the scenario report PDF plus the
    per-scenario/per-location CSV data files.
    """

    name = "nrelss"

    async def get_resources(self) -> ArchiveAwaitable:
        """Download NREL Standard Scenarios resources."""

        async def post_to_json(url, **kwargs):
            """POST ``kwargs`` as form data to ``url`` (with retries) and decode the JSON response."""
            resp = await retry_async(self.session.post, [url], kwargs={"data": kwargs})
            return await retry_async(resp.json)

        project_year_pattern = re.compile(r"Standard Scenarios (?P<year>\d{4})")
        # Host-name dots are escaped so the pattern can't match lookalike URLs
        # (previously "." matched any character).
        report_url_pattern = re.compile(
            r"https://www\.nrel\.gov/docs/(?P<fy>fy\d{2}osti)/(?P<number>\d{5}\.pdf)"
        )

        project_records = await self.get_json(
            "https://scenarioviewer.nrel.gov/api/projects/"
        )
        for scenario_project in (
            p for p in project_records if p["name"].startswith("Standard Scenarios")
        ):
            project_uuid = scenario_project["uuid"]
            m = project_year_pattern.search(scenario_project["name"])
            if not m:
                # Project name doesn't include a four-digit year; skip it.
                continue
            project_year = int(m.group("year"))

            if scenario_project["citation"]:
                report_link = self.get_hyperlinks_from_text(
                    scenario_project["citation"], report_url_pattern
                )
                if report_link:
                    report_link = report_link.pop()
                else:
                    raise AssertionError(
                        f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:"
                        f"{scenario_project}"
                    )
            elif project_year == 2021:
                # The 2021 citation field is blank upstream; fall back to the
                # hard-coded report link.
                report_link = REPORT_2021
            else:
                # Previously this case fell through with report_link unbound
                # and died below with a confusing NameError.
                raise AssertionError(
                    f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:"
                    f"{scenario_project}"
                )
            m = report_url_pattern.search(report_link)
            if not m:
                raise AssertionError(
                    f"Report link {report_link} for {project_year} does not match the expected report URL pattern:"
                    f"{scenario_project}"
                )

            file_list = await post_to_json(
                "https://scenarioviewer.nrel.gov/api/file-list/",
                project_uuid=project_uuid,
            )
            yield self.get_year_resource(
                report=(f"{m.group('fy')}_{m.group('number')}", report_link),
                uuid=project_uuid,
                file_ids=[
                    (
                        f["id"],
                        f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(
                            " ", "_"
                        )
                        .replace("%", "pct")
                        .replace(",", "")
                        .lower(),
                    )
                    for f in file_list["files"]
                    # NOTE(review): looks like 2020 files aren't typed "CSV"
                    # upstream, so we take everything that year — confirm.
                    if (f["file_type"] == "CSV" or project_year == 2020)
                ],
                year=project_year,
            )

    async def get_year_resource(
        self, report, uuid, file_ids, year: int
    ) -> ResourceInfo:
        """Download all available data for a year.

        Resulting resource contains one pdf of the scenario report, and a set of CSVs for different scenarios and geo levels.

        Args:
            report: (filename, URL) pair for the scenario report PDF.
            uuid: Scenario Viewer UUID of the project to download from.
            file_ids: (file ID, target filename) pairs of data files to fetch.
            year: the year we're downloading data for
        """
        zip_path = self.download_directory / f"{self.name}-{year}.zip"
        data_paths_in_archive = set()
        # report
        self.logger.info(f"Downloading report {year} {report[0]} from {report[1]}")
        download_path = self.download_directory / report[0]
        await self.download_file(report[1], download_path)
        # Open in a with-block so the handle is closed once archived
        # (previously the handle was leaked).
        with download_path.open("rb") as blob:
            self.add_to_archive(
                zip_path=zip_path,
                filename=report[0],
                blob=blob,
            )
        data_paths_in_archive.add(report[0])
        # Don't want to leave multiple giant files on disk, so delete
        # immediately after they're safely stored in the ZIP
        download_path.unlink()

        for file_id, filename in file_ids:
            self.logger.info(f"Downloading file {year} {file_id} {uuid}")
            download_path = self.download_directory / filename
            await retry_async(
                _download_file_post,
                [
                    self.session,
                    "https://scenarioviewer.nrel.gov/api/download/",
                    download_path,
                ],
                kwargs={"data": {"project_uuid": uuid, "file_ids": file_id}},
            )
            with download_path.open("rb") as blob:
                self.add_to_archive(
                    zip_path=zip_path,
                    filename=filename,
                    blob=blob,
                )
            data_paths_in_archive.add(filename)
            # Don't want to leave multiple giant files on disk, so delete
            # immediately after they're safely stored in the ZIP
            download_path.unlink()
        return ResourceInfo(
            local_path=zip_path,
            partitions={"years": year},
            # layout=ZipLayout(file_paths=data_paths_in_archive), # can't use ZipLayout bc these CSVs have a multi-row header and pandas throws a tantrum
        )
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -416,4 +416,40 @@ | |
"license_pudl": LICENSES["cc-by-4.0"], | ||
"contributors": [CONTRIBUTORS["catalyst-cooperative"]], | ||
}, | ||
"nrelss": { | ||
"title": "NREL Standard Scenarios", | ||
"path": "https://www.nrel.gov/analysis/standard-scenarios.html", | ||
"description": ( | ||
"NREL's Standard Scenarios are a suite of forward-looking scenarios of the U.S." | ||
"power sector that are updated annually to support and inform energy analysis." | ||
"The Standard Scenarios are simulated using the Regional Energy Deployment System" | ||
"and Distributed Generation Market Demand Model capacity expansion models and are" | ||
"updated each year to provide timely information regarding power sector evolution." | ||
"The scenarios have been designed to capture a range of possible power system" | ||
"futures and consider a variety of factors from high vehicle electrification to" | ||
"major cost declines for electricity generation technologies (e.g., using cost" | ||
"inputs from the Annual Technology Baseline)." | ||
"For select scenarios, the models are run using the PLEXOS software and the" | ||
"Cambium tool that assembles structured data sets of hourly cost, emissions, and" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could consider pulling in the cambium results as well (as a second partition) but A) they only go back to 2020, and B) they're like 6GB for each year There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If it's simple to also add in the cambium results I'd say add em & add a second partition of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are the cambium results 6 GB zipped? if so w/ the standard scenarios (assuming they are a similar size) it's pushing up against the 50 GB archive limit There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah zipped. The standard scenarios are like two orders of magnitude smaller since they don't include hourly data though, so it's less "will cambium push this archiver over the limit" and more "can we archive cambium on zenodo at all" I'll go with "not right now" and write up cambium as a separate issue There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tiny formatting issue -- on multi-line concatenated strings like this if we don't add an explicit space at the end of each line in the quotes, they'll run together. Triple quoted strings are another option. |
||
"operational data for modeled futures. Results are available using the Scenario" | ||
"Viewer and Data Downloader." | ||
), | ||
"source_file_dict": { | ||
"source_format": "CSV", | ||
}, | ||
"working_partitions": { | ||
"years": list(range(2016, 2025)), | ||
krivard marked this conversation as resolved.
Show resolved
Hide resolved
|
||
}, | ||
"contributors": [ | ||
CONTRIBUTORS["catalyst-cooperative"], | ||
], | ||
"keywords": sorted( | ||
{ | ||
"nrel", | ||
"standard scenarios", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Other keywords that could go in here, cribbed from
|
||
} # + KEYWORDS["us_govt"] + KEYWORDS["electricity"] | ||
), | ||
"license_raw": LICENSES["cc-by-4.0"], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They have a weirdo disclaimer that says (approximately) "you have to cite us but you can't make it look like we endorse you" which seems close enough to cc-by? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would ask our resident license scrutinizer @zaneselvans on this one! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For data from NREL my trick for finding the license is to look and see whether they have added it to the OEDI Data Swamp which requires whoever is archiving it to add an explicit, well-defined license. The 2024 Standard Scenarios are there and have been released under CC-BY-4.0. I don't see any of the earlier years obviously though. |
||
"license_pudl": LICENSES["cc-by-4.0"], | ||
}, | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe I have carefully sliced this to not conflict with Marianne's
get_hyperlink
changes but I'll handle any massaging necessary if not