Add archiver for NREL Standard Scenarios #563

Draft · wants to merge 10 commits into base: main
22 changes: 20 additions & 2 deletions src/pudl_archiver/archivers/classes.py
@@ -260,7 +260,6 @@ async def get_hyperlinks(
headers: Additional headers to send in the GET request.
"""
# Parse web page to get all hyperlinks
parser = _HyperlinkExtractor()

response = await retry_async(
self.session.get,
@@ -271,6 +270,25 @@ async def get_hyperlinks(
},
)
text = await retry_async(response.text)
return self.get_hyperlinks_from_text(text, filter_pattern)

def get_hyperlinks_from_text(
self,
text: str,
filter_pattern: typing.Pattern | None = None,
) -> list[str]:
"""Return all hyperlinks from HTML text.

This helper performs very basic HTML parsing: it extracts all hyperlinks
from HTML text and returns those that match a specified pattern, e.g. all
hyperlinks that look like download links for individual data resources.

Args:
text: text containing HTML.
filter_pattern: If present, only return links that contain pattern.
"""
parser = _HyperlinkExtractor()
Comment from Contributor Author:

I believe I have carefully sliced this to not conflict with Marianne's get_hyperlink changes, but I'll handle any massaging necessary if not.

parser.feed(text)

# Filter to those that match filter_pattern
@@ -282,7 +300,7 @@ async def get_hyperlinks(
if not hyperlinks:
self.logger.warning(
f"The archiver couldn't find any hyperlinks{('that match: ' + filter_pattern.pattern) if filter_pattern else ''}."
f"Make sure your filter_pattern is correct, check if the structure of the {url} page changed, or if you are missing HTTP headers."
f"Make sure your filter_pattern is correct, and check if the structure of the page is not what you expect it to be."
)

return hyperlinks
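For illustration, a minimal, self-contained sketch of what get_hyperlinks_from_text does. The _LinkCollector class below is a hypothetical stand-in for _HyperlinkExtractor, which is assumed to collect href values into a parser.hyperlinks set:

from html.parser import HTMLParser
import re

class _LinkCollector(HTMLParser):
    """Hypothetical stand-in for _HyperlinkExtractor: collects href values."""

    def __init__(self):
        super().__init__()
        self.hyperlinks = set()

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.hyperlinks.add(value)

text = '<a href="https://www.nrel.gov/docs/fy22osti/80641.pdf">2021 report</a>'
pattern = re.compile(r"nrel\.gov/docs/.*\.pdf")
parser = _LinkCollector()
parser.feed(text)
print([link for link in parser.hyperlinks if pattern.search(link)])
# ['https://www.nrel.gov/docs/fy22osti/80641.pdf']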
157 changes: 157 additions & 0 deletions src/pudl_archiver/archivers/nrelss.py
@@ -0,0 +1,157 @@
"""Download NREL Standard Scenarios data."""

import io
import re
from contextlib import nullcontext
from pathlib import Path

import aiohttp

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.utils import retry_async

# The citation field for Standard Scenarios 2021 is blank, but they linked to the
# 2021 report from the description of one of the other available projects, so we're
# able to hard-code it for now:
REPORT_2021 = "https://www.nrel.gov/docs/fy22osti/80641.pdf"


async def _download_file_post(
session: aiohttp.ClientSession, url: str, file: Path | io.BytesIO, **kwargs
):
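"""POST to url and stream the response body into file (a Path or binary buffer) in 1 KiB chunks."""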
async with session.post(url, **kwargs) as response:
with file.open("wb") if isinstance(file, Path) else nullcontext(file) as f:
async for chunk in response.content.iter_chunked(1024):
f.write(chunk)


class NrelStandardScenariosArchiver(AbstractDatasetArchiver):
"""NREL Standard Scenarios archiver."""

name = "nrelss"

async def get_resources(self) -> ArchiveAwaitable:
"""Download NREL Standard Scenarios resources."""

async def post_to_json(url, **kwargs):
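# retry_async (imported above from pudl_archiver.utils) calls the given
# coroutine function with the supplied args/kwargs, retrying on failure;
# here it covers both the POST request and decoding the JSON response.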
resp = await retry_async(self.session.post, [url], kwargs={"data": kwargs})
return await retry_async(resp.json)

project_year_pattern = re.compile(r"Standard Scenarios (?P<year>\d{4})")
report_url_pattern = re.compile(
r"https://www.nrel.gov/docs/(?P<fy>fy\d{2}osti)/(?P<number>\d{5}\.pdf)"
)
filename_pattern = re.compile(r"/([^/?]*\.csv)")

project_records = await self.get_json(
"https://scenarioviewer.nrel.gov/api/projects/"
)
for scenario_project in (
p for p in project_records if p["name"].startswith("Standard Scenarios")
):
project_uuid = scenario_project["uuid"]
m = project_year_pattern.search(scenario_project["name"])
if not m:
continue
project_year = int(m.group("year"))

if scenario_project["citation"]:
report_link = self.get_hyperlinks_from_text(
scenario_project["citation"], report_url_pattern
)
if report_link:
report_link = report_link.pop()
else:
raise AssertionError(
f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:"
f"{scenario_project}"
)
elif project_year == 2021:
report_link = REPORT_2021
else:
raise AssertionError(
f"We expect all years except 2021 to have a citation with a link to the report, "
f"but {project_year} has no citation: {scenario_project}"
)
m = report_url_pattern.search(report_link)
if not m:
raise AssertionError(
f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:"
f"{scenario_project}"
)

file_list = await post_to_json(
"https://scenarioviewer.nrel.gov/api/file-list/",
project_uuid=project_uuid,
)
yield self.get_year_resource(
report=(f"{m.group('fy')}_{m.group('number')}", report_link),
uuid=project_uuid,
file_ids=[
(
f["id"],
f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(
" ", "_"
)
.replace("%", "pct")
.replace(",", "")
.lower(),
)
for f in file_list["files"]
if (f["file_type"] == "CSV" or project_year == 2020)
],
year=project_year,
)
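For illustration, here is a hypothetical file-list record (field values invented) and the archive filename that the expression above would derive from it:

f = {"id": 12345, "scenario": "Mid Case", "location_type": "National", "file_type": "CSV"}
name = (
    f"NRELSS 2024 {f['scenario']} {f['location_type']}.{f['file_type']}".replace(" ", "_")
    .replace("%", "pct")
    .replace(",", "")
    .lower()
)
print(name)  # nrelss_2024_mid_case_national.csv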

async def get_year_resource(
self, report, uuid, file_ids, year: int
) -> ResourceInfo:
"""Download all available data for a year.

The resulting resource contains one PDF of the scenario report and a set of
CSVs for different scenarios and geographic levels.

Args:
    report: (filename, URL) tuple for the scenario report PDF.
    uuid: UUID identifying the project in the Scenario Viewer API.
    file_ids: list of (file_id, filename) pairs for the data files to download.
    year: the year we're downloading data for.
"""
zip_path = self.download_directory / f"{self.name}-{year}.zip"
data_paths_in_archive = set()
# report
self.logger.info(f"Downloading report {year} {report[0]} from {report[1]}")
download_path = self.download_directory / report[0]
await self.download_file(report[1], download_path)
self.add_to_archive(
zip_path=zip_path,
filename=report[0],
blob=download_path.open("rb"),
)
data_paths_in_archive.add(report[0])
# Don't want to leave multiple giant files on disk, so delete
# immediately after they're safely stored in the ZIP
download_path.unlink()

for file_id, filename in file_ids:
self.logger.info(f"Downloading file {year} {file_id} {uuid}")
download_path = self.download_directory / filename
await retry_async(
_download_file_post,
[
self.session,
"https://scenarioviewer.nrel.gov/api/download/",
download_path,
],
kwargs={"data": {"project_uuid": uuid, "file_ids": file_id}},
)
self.add_to_archive(
zip_path=zip_path,
filename=filename,
blob=download_path.open("rb"),
)
data_paths_in_archive.add(filename)
# Don't want to leave multiple giant files on disk, so delete
# immediately after they're safely stored in the ZIP
download_path.unlink()
return ResourceInfo(
local_path=zip_path,
partitions={"years": year},
# layout=ZipLayout(file_paths=data_paths_in_archive), # can't use ZipLayout bc these CSVs have a multi-row header and pandas throws a tantrum
)
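For context on the commented-out ZipLayout: a minimal sketch (with a hypothetical two-row header, not the actual NREL column layout) showing that pandas can still read such CSVs if the multi-row header is declared explicitly:

import io

import pandas as pd

csv_text = "technology,technology\ncapacity,generation\n42,3.14\n"  # hypothetical layout
df = pd.read_csv(io.StringIO(csv_text), header=[0, 1])
print(df.columns.tolist())  # [('technology', 'capacity'), ('technology', 'generation')]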
36 changes: 36 additions & 0 deletions src/pudl_archiver/metadata/sources.py
@@ -416,4 +416,40 @@
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
"nrelss": {
"title": "NREL Standard Scenarios",
"path": "https://www.nrel.gov/analysis/standard-scenarios.html",
"description": (
"NREL's Standard Scenarios are a suite of forward-looking scenarios of the U.S."
"power sector that are updated annually to support and inform energy analysis."
"The Standard Scenarios are simulated using the Regional Energy Deployment System"
"and Distributed Generation Market Demand Model capacity expansion models and are"
"updated each year to provide timely information regarding power sector evolution."
"The scenarios have been designed to capture a range of possible power system"
"futures and consider a variety of factors from high vehicle electrification to"
"major cost declines for electricity generation technologies (e.g., using cost"
"inputs from the Annual Technology Baseline)."
"For select scenarios, the models are run using the PLEXOS software and the"
"Cambium tool that assembles structured data sets of hourly cost, emissions, and"
Comment from Contributor Author (@krivard, Jan 30, 2025):

We could consider pulling in the Cambium results as well (as a second partition) but A) they only go back to 2020, and B) they're like 6 GB for each year.

Comment from Member:

If it's simple to also add in the Cambium results I'd say add them, and add a second partition of project or scenario_type or something! But this seems like a lower priority than grabbing just the standard scenarios.

Comment from Member:

Are the Cambium results 6 GB zipped? If so, together with the standard scenarios (assuming they are a similar size) that's pushing up against the 50 GB archive limit.

Comment from Contributor Author:

Yeah, zipped. The standard scenarios are like two orders of magnitude smaller since they don't include hourly data, though, so it's less "will Cambium push this archiver over the limit" and more "can we archive Cambium on Zenodo at all."

I'll go with "not right now" and write up Cambium as a separate issue.

Comment from Member:

Tiny formatting issue -- on multi-line concatenated strings like this, if we don't add an explicit space at the end of each line inside the quotes, they'll run together. Triple-quoted strings are another option.
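A quick illustration of the reviewer's point (Python inserts nothing between adjacent string literals):

description = (
    "NREL's Standard Scenarios are a suite of forward-looking scenarios of the U.S."
    "power sector"
)
print(description)  # ...scenarios of the U.S.power sector  <- words run together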

"operational data for modeled futures. Results are available using the Scenario"
"Viewer and Data Downloader."
),
"source_file_dict": {
"source_format": "CSV",
},
"working_partitions": {
"years": list(range(2016, 2025)),
},
"contributors": [
CONTRIBUTORS["catalyst-cooperative"],
],
"keywords": sorted(
{
"nrel",
"standard scenarios",
Comment from Contributor Author:

Other keywords that could go in here, cribbed from pudl/metadata/sources.py@nrelatb:

                + KEYWORDS["us_govt"]
                + KEYWORDS["electricity"]

} # + KEYWORDS["us_govt"] + KEYWORDS["electricity"]
),
"license_raw": LICENSES["cc-by-4.0"],
Comment from Contributor Author:

They have a weirdo disclaimer that says (approximately) "you have to cite us but you can't make it look like we endorse you," which seems close enough to CC-BY?

Comment from Member:

I would ask our resident license scrutinizer @zaneselvans on this one!

Comment from Member:

For data from NREL, my trick for finding the license is to look and see whether they have added it to the OEDI Data Swamp, which requires whoever is archiving it to add an explicit, well-defined license.

The 2024 Standard Scenarios are there and have been released under CC-BY-4.0, but I don't see any of the earlier years there.

"license_pudl": LICENSES["cc-by-4.0"],
},
}