Archive NREL Electrification Futures Studies #564

Draft · wants to merge 5 commits into base: marianneke-generalize-hyperlink-extractor
205 changes: 205 additions & 0 deletions src/pudl_archiver/archivers/nrelefs.py
@@ -0,0 +1,205 @@
"""Download NREL Electrification Futures Study data."""

import re

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)

# Main page
# https://www.nrel.gov/analysis/electrification-futures.html

# Grab all data sites with the following formats
# https://data.nrel.gov/submissions/90
# https://data.openei.org/submissions/4130

# Also grab all PDFs on the main page
BASE_URL = "https://www.nrel.gov/analysis/electrification-futures.html"


class NrelEFSArchiver(AbstractDatasetArchiver):
    """NREL Electrification Futures Studies archiver."""

    name = "nrelefs"

    async def get_resources(self) -> ArchiveAwaitable:
        """Download NREL EFS resources.

        The main page links to a series of PDFs as well as data.nrel.gov and
        data.openei.org webpages containing the associated data for each report.
        """
        # Hard-code a dictionary of each version of the study, with a short-hand
        # description of the report as the key and the links to all data and
        # reports in that version as the values. The study was last published in
        # 2021, so we don't expect these links to change.

        version_dict = {
            "cost-and-performance": [
                "https://www.nrel.gov/docs/fy18osti/70485.pdf",
                "https://data.nrel.gov/submissions/93",
                "https://data.nrel.gov/submissions/78",
            ],
            "demand-side-scenarios": [
                "https://www.nrel.gov/docs/fy18osti/71500.pdf",
                "https://www.nrel.gov/docs/fy18osti/72096.pdf",
                "https://www.nrel.gov/docs/fy18osti/72311.pdf",
                "https://data.nrel.gov/submissions/90",
                "https://data.nrel.gov/submissions/92",
            ],
            "dsgrid-model": [
                "https://www.nrel.gov/docs/fy18osti/71492.pdf",
                "https://www.nrel.gov/docs/fy18osti/72388.pdf",
                "https://data.openei.org/submissions/4130",
            ],
            "load-profiles": [
                "https://www.nrel.gov/docs/fy20osti/73336.pdf",
                "https://data.nrel.gov/submissions/126",
                "https://data.nrel.gov/submissions/127",
            ],
            "supply-side-scenarios": [
                "https://www.nrel.gov/docs/fy21osti/72330.pdf",
                "https://www.nrel.gov/docs/fy21osti/78783.pdf",
                "https://data.nrel.gov/submissions/157",
            ],
            "detailed-grid-simulations": [
                "https://www.nrel.gov/docs/fy21osti/79094.pdf",
                "https://www.nrel.gov/docs/fy21osti/80167.pdf",
            ],
        }
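        # Each key of this dictionary becomes the "version" partition of the
        # resulting archive, and each version is bundled into a single zipfile.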

        # Though we hardcode the links above, we also grab the PDF links from the
        # page to get the title ascribed to each link, which makes it easier to
        # give each PDF an informative filename.
        pdf_pattern = re.compile(r"\/docs\/fy(\d{2})osti\/\w*\.pdf")
        pdf_links = await self.get_hyperlinks(BASE_URL, pdf_pattern)
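        # get_hyperlinks returns a dictionary mapping each matching URL to the
        # text of its link, which we use below when renaming the PDFs.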

        # For each version, yield a coroutine that will produce one zipfile
        # containing all the files for that version.
        for version, links in version_dict.items():
            yield self.get_version_resource(
                version=version, links=links, pdf_links=pdf_links
            )

    async def get_version_resource(
        self,
        version: str,
        links: list[str],
        pdf_links: dict[str, str],
    ) -> ResourceInfo:
        """Download all available data for a given version of an EFS study.

        The resulting resource contains one zip file of all the PDF, .zip, .xlsx,
        .gzip and .csv.gzip files for a given version of the EFS studies. We handle
        the dsgrid data specially because its files are hosted in an S3 bucket on
        data.openei.org rather than on a data.nrel.gov submission page.

        Args:
            version: shorthand name for the given version.
            links: a list of links that contain data for this version.
            pdf_links: a dictionary mapping each PDF link found on the EFS homepage
                to the title of the link. We use this to rename the PDFs to
                something more informative than the original file title.
        """
        # Set up the zipfile name and the set of files stored in the zip.
        zipfile_path = self.download_directory / f"nrelefs-{version}.zip"
        data_paths_in_archive = set()

        # Compile a pattern matching all data files hosted on data.nrel.gov.
        data_pattern = re.compile(
            r"files\/([\w\/]*)\/([\w \-%]*)(\.zip|\.xlsx|\.gzip|\.csv\.gzip)$"
        )
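        # In the pattern above, group 1 captures the submission path, group 2 the
        # base filename, and group 3 the file extension.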

        # Compile a dictionary of links and a regex pattern for the dsgrid special
        # case, where the .dsg files are listed in an S3 bucket viewer rather than
        # on a submission page.
        dsgrid_dict = {
            "dsgrid-site-energy-state-hourly": "https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=dsgrid-2018-efs%2Fdsgrid_site_energy_state_hourly%2F",
            "raw-complete": "https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=dsgrid-2018-efs%2Fraw_complete%2F",
            "state-hourly-residuals": "https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=dsgrid-2018-efs%2Fstate_hourly_residuals%2F",
        }
        dsg_pattern = re.compile(r"\w*\.dsg$")

        for link in links:
            # First, handle all the PDFs.
            if link.endswith(".pdf"):
                matching_pdf_links = [key for key in pdf_links if key in link]
                if matching_pdf_links:
                    # Get the corresponding title from pdf_links and clean it to
                    # name the PDF something more informative than the link name.
                    link_key = matching_pdf_links.pop()
                    filename = pdf_links[link_key]
                    self.logger.info(f"Downloading {link}")
                    filename = (
                        filename.lower()
                        .replace("\n", "")
                        .replace("electrification futures study:", "")
                    )
                    # Remove all characters that aren't letters, digits, spaces
                    # or dashes, then replace runs of whitespace with a dash.
                    filename = re.sub("[^a-zA-Z0-9 -]+", "", filename).strip()
                    filename = re.sub(r"\s+", "-", filename)
                    filename = f"nrelefs-{version}-{filename}.pdf"
                    await self.download_add_to_archive_and_unlink(
                        url=link, filename=filename, zip_path=zipfile_path
                    )
                    data_paths_in_archive.add(filename)
                else:
                    # Alert us to PDF links that we expected but didn't find.
                    raise AssertionError(
                        f"Expected PDF link {link} but it wasn't found in {BASE_URL}. Has the home page changed?"
                    )

            # Next, get all the data files from data.nrel.gov.
            elif "data.nrel.gov/submissions/" in link:
                self.logger.info(f"Downloading data files from {link}.")
                data_links = await self.get_hyperlinks(link, data_pattern)
                for data_link, filename in data_links.items():
                    matches = data_pattern.search(data_link)
                    if not matches:
                        continue
                    # Grab the file name and extension.
                    filename = matches.group(2)
                    file_ext = matches.group(3)

                    # Reformat the filename: lowercase it, turn underscores and
                    # URL-encoded spaces into dashes, drop any character that
                    # isn't a letter, digit, space or dash, and collapse runs of
                    # spaces and dashes into a single dash.
                    filename = filename.lower().replace("_", "-").replace("%20", "-")
                    filename = re.sub("[^a-zA-Z0-9 -]+", "", filename).strip()
                    filename = re.sub(r"[\s-]+", "-", filename)
                    # Drop any leading "efs-"; we add it back in the nrelefs prefix.
                    filename = re.sub(r"^efs-", "", filename)
                    filename = f"nrelefs-{version}-{filename}{file_ext}"
                    self.logger.info(
                        f"Downloading {data_link} as {filename} to {zipfile_path}."
                    )
                    await self.download_add_to_archive_and_unlink(
                        url=data_link, filename=filename, zip_path=zipfile_path
                    )
                    data_paths_in_archive.add(filename)

elif "data.openei.org" in link: # Finally, handle DSGrid data
self.logger.info("Downloading DSGrid data files.")
# Iterate through each type of DSGrid data and download
for data_type, dsg_link in dsgrid_dict.items():
dsg_file_links = await self.get_hyperlinks(dsg_link, dsg_pattern)
for dsg_link, filename in dsg_file_links.items():
filename = filename.replace("_", "-")
filename = f"nrelesg-{data_type}-{filename}"
await self.download_add_to_archive_and_unlink(
url=dsg_link, filename=filename, zip_path=zipfile_path
)
data_paths_in_archive.add(filename)

else:
# Raise error for mysterious other links
raise AssertionError(f"Unexpected format for link {link} in {version}.")

        return ResourceInfo(
            local_path=zipfile_path,
            partitions={"version": version},
            layout=ZipLayout(file_paths=data_paths_in_archive),
        )
23 changes: 23 additions & 0 deletions src/pudl_archiver/metadata/sources.py
@@ -416,4 +416,27 @@
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
"nrelefs": {
"title": "NREL EFS -- Electrification Futures Study",
"path": "https://www.nrel.gov/analysis/electrification-futures.html",
"description": (
"The Electrification Futures Study (EFS) is a multi-year study conducted by NREL "
"and its research partners—Electric Power Research Institute, Evolved Energy Research, "
"Lawrence Berkeley National Laboratory, Northern Arizona University, and Oak Ridge National "
"Laboratory. EFS used multiple analytic tools and models to develop and assess "
"electrification scenarios designed to quantify potential energy, economic, "
"and environmental impacts to the U.S. power system and broader economy. There are six reports "
"comprising the EFS, with the final report released in May 2021."
),
"working_partitions": {
"report_number": set(range(1, 7)),
"document_type": ["data", "technical_report", "presentation"],
},
"keywords": sorted(
{"doe", "lead", "low income", "energy affordability", "energy burden"}
),
"license_raw": LICENSES["us-govt"],
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
}