From b09c22e0e7181288e1becbb47aff2a96a23eee85 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Thu, 30 Jan 2025 16:28:02 -0500
Subject: [PATCH 01/10] Add metadata for NREL standard scenarios

---
 src/pudl_archiver/metadata/sources.py | 36 +++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py
index 411735df..650bcb4c 100644
--- a/src/pudl_archiver/metadata/sources.py
+++ b/src/pudl_archiver/metadata/sources.py
@@ -416,4 +416,40 @@
         "license_pudl": LICENSES["cc-by-4.0"],
         "contributors": [CONTRIBUTORS["catalyst-cooperative"]],
     },
+    "nrelss": {
+        "title": "NREL Standard Scenarios",
+        "path": "https://www.nrel.gov/analysis/standard-scenarios.html",
+        "description": (
+            "NREL's Standard Scenarios are a suite of forward-looking scenarios of the U.S. "
+            "power sector that are updated annually to support and inform energy analysis. "
+            "The Standard Scenarios are simulated using the Regional Energy Deployment System "
+            "and Distributed Generation Market Demand Model capacity expansion models and are "
+            "updated each year to provide timely information regarding power sector evolution. "
+            "The scenarios have been designed to capture a range of possible power system "
+            "futures and consider a variety of factors from high vehicle electrification to "
+            "major cost declines for electricity generation technologies (e.g., using cost "
+            "inputs from the Annual Technology Baseline). "
+            "For select scenarios, the models are run using the PLEXOS software and the "
+            "Cambium tool that assembles structured data sets of hourly cost, emissions, and "
+            "operational data for modeled futures. Results are available using the Scenario "
+            "Viewer and Data Downloader."
+        ),
+        "source_file_dict": {
+            "source_format": "CSV",
+        },
+        "working_partitions": {
+            "years": list(range(2016, 2025)),
+        },
+        "contributors": [
+            CONTRIBUTORS["catalyst-cooperative"],
+        ],
+        "keywords": sorted(
+            {
+                "nrel",
+                "standard scenarios",
+            }
+        ),
+        "license_raw": LICENSES["cc-by-4.0"],
+        "license_pudl": LICENSES["cc-by-4.0"],
+    },
 }

From 07f4697a239ea2fb9129a0d60ba17d8076fbc8db Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Thu, 30 Jan 2025 17:57:23 -0500
Subject: [PATCH 02/10] Split get_hyperlinks so we can run it on text as well
 as on a url

---
 src/pudl_archiver/archivers/classes.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/pudl_archiver/archivers/classes.py b/src/pudl_archiver/archivers/classes.py
index 217a8ebe..728830e0 100644
--- a/src/pudl_archiver/archivers/classes.py
+++ b/src/pudl_archiver/archivers/classes.py
@@ -260,7 +260,6 @@ async def get_hyperlinks(
             headers: Additional headers to send in the GET request.
         """
         # Parse web page to get all hyperlinks
-        parser = _HyperlinkExtractor()
         response = await retry_async(
             self.session.get,
             [url],
             kwargs={
                 "headers": headers,
             },
         )
         text = await retry_async(response.text)
+        return self.get_hyperlinks_from_text(text, filter_pattern)
+
+    def get_hyperlinks_from_text(
+        self,
+        text: str,
+        filter_pattern: typing.Pattern | None = None,
+    ) -> list[str]:
+        """Return all hyperlinks from HTML text.
+
+        This is a lower-level helper that performs very basic HTML parsing.
+        It extracts all hyperlinks from HTML text, and returns those that match
+        a specified pattern. This means it can find all hyperlinks that look like
+        a download link to a single data resource.
+
+        Args:
+            text: text containing HTML.
+            filter_pattern: If present, only return links that contain pattern.
+        """
+        parser = _HyperlinkExtractor()
         parser.feed(text)

         # Filter to those that match filter_pattern
@@ -282,7 +300,7 @@
         if not hyperlinks:
             self.logger.warning(
                 f"The archiver couldn't find any hyperlinks{('that match: ' + filter_pattern.pattern) if filter_pattern else ''}."
-                f"Make sure your filter_pattern is correct, check if the structure of the {url} page changed, or if you are missing HTTP headers."
+                f" Make sure your filter_pattern is correct, and check whether the structure of the page matches what you expect."
             )
         return hyperlinks
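A note on the split above: get_hyperlinks() keeps its fetching role, while get_hyperlinks_from_text() can be pointed at any HTML fragment, such as the citation field the scenario-viewer API returns in the next patch. Below is a minimal, self-contained sketch of that parse-and-filter behavior; the _HyperlinkExtractor here is a stand-in for the one in classes.py (whose implementation this diff does not show), and the citation snippet and report number are invented for illustration.

# Stand-in sketch of the parse-and-filter behavior of get_hyperlinks_from_text().
import re
from html.parser import HTMLParser


class _HyperlinkExtractor(HTMLParser):
    """Collect the href attribute of every <a> tag encountered."""

    def __init__(self):
        super().__init__()
        self.hyperlinks = set()

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for attr, val in attrs:
                if attr == "href" and val:
                    self.hyperlinks.add(val)


# Invented example input: a citation blurb with an embedded report link.
citation = (
    '<p>NREL (2024). Standard Scenarios Report. '
    '<a href="https://www.nrel.gov/docs/fy25osti/12345.pdf">PDF</a>.</p>'
)
report_url_pattern = re.compile(r"https://www.nrel.gov/docs/fy\d{2}osti/\d{5}\.pdf")

parser = _HyperlinkExtractor()
parser.feed(citation)
hyperlinks = [link for link in parser.hyperlinks if report_url_pattern.search(link)]
print(hyperlinks)  # ['https://www.nrel.gov/docs/fy25osti/12345.pdf']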
project_uuid, "file_ids": file_record["id"]}, + ) + file_headers = file_resp.headers() + download_filename = f"{file_record['location_type']}.csv" + + m = filename_pattern.search(file_headers["Location"]) + if m: + download_filename = m.groups(1) + else: + # this will give us e.g. + # (for 2023-2024) "ALL Transmission Capacities.csv" "ALL States.csv" + # (for previous years) "Electrification Nations.csv" "High Natural Gas Prices States.csv" + download_filename = ( + f"{file_record['scenario']} {file_record['location_type']}.csv" + ) + + download_links[download_filename] = file_headers["Location"] + yield self.get_year_resource(download_links, project_year) + + async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo: + """Download all available data for a year. + + Resulting resource contains one pdf of the scenario report, and a set of CSVs for different scenarios and geo levels. + + Args: + links: filename->URL mapping for files to download + year: the year we're downloading data for + """ + zip_path = self.download_directory / f"{self.name}-{year}.zip" + data_paths_in_archive = set() + for filename, link in sorted(links.items()): + self.logger.info(f"Downloading {filename} from {link}") + download_path = self.download_directory / filename + await self.download_file(link, download_path) + self.add_to_archive( + zip_path=zip_path, + filename=filename, + blob=download_path.open("rb"), + ) + data_paths_in_archive.add(filename) + # Don't want to leave multiple giant files on disk, so delete + # immediately after they're safely stored in the ZIP + download_path.unlink() + return ResourceInfo( + local_path=zip_path, + partitions={"years": year}, + layout=ZipLayout(file_paths=data_paths_in_archive), + ) From 24844d4792ded01aed5dc2038af6d3b61514933d Mon Sep 17 00:00:00 2001 From: Kathryn Mazaitis Date: Fri, 31 Jan 2025 15:04:55 -0500 Subject: [PATCH 04/10] [wip] getting working emergency --- src/pudl_archiver/archivers/nrelss.py | 26 +++++++++++++++++++------- src/pudl_archiver/metadata/sources.py | 4 ++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py index dcd3b3f1..5c3f48a8 100644 --- a/src/pudl_archiver/archivers/nrelss.py +++ b/src/pudl_archiver/archivers/nrelss.py @@ -25,16 +25,16 @@ async def get_resources(self) -> ArchiveAwaitable: """Download NREL Standard Scenarios resources.""" async def post_to_json(url, **kwargs): - resp = await retry_async(self.session.post, [url], data=kwargs) + resp = await retry_async(self.session.post, [url], kwargs={"data":kwargs}) return await retry_async(resp.json) project_year_pattern = re.compile(r"Standard Scenarios (?P\d{4})") report_url_pattern = re.compile( - r"http://www.nrel.gov/docs/(?Pfy\d{2}osti)/(?P\d{5}\.pdf)" + r"https://www.nrel.gov/docs/(?Pfy\d{2}osti)/(?P\d{5}\.pdf)" ) filename_pattern = re.compile(r"/([^/?]*/.csv)") - project_records = self.get_json("https://scenarioviewer.nrel.gov/api/projects/") + project_records = await self.get_json("https://scenarioviewer.nrel.gov/api/projects/") for scenario_project in ( p for p in project_records if p["name"].startswith("Standard Scenarios") ): @@ -47,7 +47,14 @@ async def post_to_json(url, **kwargs): if scenario_project["citation"]: report_link = self.get_hyperlinks_from_text( scenario_project["citation"], report_url_pattern - ).pop() + ) + if report_link: + report_link = report_link.pop() + else: + raise AssertionError( + f"We expect all years except 2021 to have a citation with 
From 24844d4792ded01aed5dc2038af6d3b61514933d Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 15:04:55 -0500
Subject: [PATCH 04/10] [wip] emergency fixes to get the archiver working

---
 src/pudl_archiver/archivers/nrelss.py | 26 +++++++++++++++++++-------
 src/pudl_archiver/metadata/sources.py |  4 ++--
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index dcd3b3f1..5c3f48a8 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -25,16 +25,16 @@ async def get_resources(self) -> ArchiveAwaitable:
         """Download NREL Standard Scenarios resources."""

         async def post_to_json(url, **kwargs):
-            resp = await retry_async(self.session.post, [url], data=kwargs)
+            resp = await retry_async(self.session.post, [url], kwargs={"data":kwargs})
             return await retry_async(resp.json)

         project_year_pattern = re.compile(r"Standard Scenarios (?P<year>\d{4})")
         report_url_pattern = re.compile(
-            r"http://www.nrel.gov/docs/(?P<fy>fy\d{2}osti)/(?P<number>\d{5}\.pdf)"
+            r"https://www.nrel.gov/docs/(?P<fy>fy\d{2}osti)/(?P<number>\d{5}\.pdf)"
         )
         filename_pattern = re.compile(r"/([^/?]*/.csv)")

-        project_records = self.get_json("https://scenarioviewer.nrel.gov/api/projects/")
+        project_records = await self.get_json("https://scenarioviewer.nrel.gov/api/projects/")
         for scenario_project in (
             p for p in project_records if p["name"].startswith("Standard Scenarios")
         ):
@@ -47,7 +47,14 @@ async def post_to_json(url, **kwargs):
             if scenario_project["citation"]:
                 report_link = self.get_hyperlinks_from_text(
                     scenario_project["citation"], report_url_pattern
-                ).pop()
+                )
+                if report_link:
+                    report_link = report_link.pop()
+                else:
+                    raise AssertionError(
+                        f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:"
+                        f"{scenario_project}"
+                    )
             elif project_year == 2021:
                 report_link = REPORT_2021
             m = report_url_pattern.search(report_link)
@@ -57,7 +64,7 @@ async def post_to_json(url, **kwargs):
                     f"{scenario_project}"
                 )
             download_links = {f"{m.group('fy')}_{m.group('number')}": report_link}
-            file_list = post_to_json(
+            file_list = await post_to_json(
                 "https://scenarioviewer.nrel.gov/api/file-list/",
                 project_uuid=project_uuid,
             )
@@ -67,11 +74,16 @@ async def post_to_json(url, **kwargs):
                 file_resp = await retry_async(
                     self.session.post,
                     ["https://scenarioviewer.nrel.gov/api/download/"],
-                    data={"project_uuid": project_uuid, "file_ids": file_record["id"]},
+                    kwargs={
+                        "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
+                        "kwargs":{"allow_redirects":False}},
                 )
-                file_headers = file_resp.headers()
+                file_headers = file_resp.headers
                 download_filename = f"{file_record['location_type']}.csv"

+                if "Location" not in file_headers:
+                    for h in file_headers:
+                        print(f"{h}: {file_headers[h]}")
                 m = filename_pattern.search(file_headers["Location"])
                 if m:
                     download_filename = m.groups(1)
diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py
index 650bcb4c..10ee33e5 100644
--- a/src/pudl_archiver/metadata/sources.py
+++ b/src/pudl_archiver/metadata/sources.py
@@ -2,7 +2,7 @@

 from typing import Any

-from pudl.metadata.constants import CONTRIBUTORS, LICENSES
+from pudl.metadata.constants import CONTRIBUTORS, LICENSES, KEYWORDS

 # To add a new contributor, follow the following format to add an entry to the
 # ADDL_CONTRIBUTORS dictionary below formatted like this:
@@ -447,7 +447,7 @@
             {
                 "nrel",
                 "standard scenarios",
-            }
+            } #+ KEYWORDS["us_govt"] + KEYWORDS["electricity"]
         ),
         "license_raw": LICENSES["cc-by-4.0"],
         "license_pudl": LICENSES["cc-by-4.0"],
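PATCH 04 disables redirect-following on the /api/download/ probe. The point of allow_redirects=False is that an HTTP 302 then comes back as-is, so its Location header — the redirect target, whose path carries the real filename — stays readable. A minimal sketch of just that probe, assuming the endpoint answers the POST with a redirect (the arguments are placeholders, and the behavior is inferred from the patch rather than verified against the live API):

# Sketch: inspect the Location header of /api/download/ without following it.
import asyncio

import aiohttp


async def probe_download_location(project_uuid: str, file_id: int) -> str | None:
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "https://scenarioviewer.nrel.gov/api/download/",
            data={"project_uuid": project_uuid, "file_ids": file_id},
            allow_redirects=False,
        ) as resp:
            # On a 302, the target URL (and thus the filename) is in "Location".
            return resp.headers.get("Location")


# Example with placeholder identifiers:
# asyncio.run(probe_download_location("00000000-0000-0000-0000-000000000000", 123))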
From e08bb8d7d2342d0486e428d24f4dc51d8aeda63e Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 15:30:00 -0500
Subject: [PATCH 05/10] running

---
 src/pudl_archiver/archivers/nrelss.py | 113 ++++++++++++++++++--------
 1 file changed, 79 insertions(+), 34 deletions(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index 5c3f48a8..df85ea1e 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -1,5 +1,9 @@
 """Download NREL Standard Scenarios data."""

+import aiohttp
+from contextlib import nullcontext
+import io
+from pathlib import Path
 import re

 from pudl_archiver.archivers.classes import (
@@ -15,6 +19,13 @@
 # able to hard-code it for now:
 REPORT_2021 = "https://www.nrel.gov/docs/fy22osti/80641.pdf"

+async def _download_file_post(
+    session: aiohttp.ClientSession, url: str, file: Path | io.BytesIO, **kwargs
+):
+    async with session.post(url, **kwargs) as response:
+        with file.open("wb") if isinstance(file, Path) else nullcontext(file) as f:
+            async for chunk in response.content.iter_chunked(1024):
+                f.write(chunk)

 class NrelStandardScenariosArchiver(AbstractDatasetArchiver):
     """NREL Standard Scenarios archiver."""
@@ -63,42 +74,50 @@ async def post_to_json(url, **kwargs):
                         f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:"
                         f"{scenario_project}"
                     )
-            download_links = {f"{m.group('fy')}_{m.group('number')}": report_link}
+
             file_list = await post_to_json(
                 "https://scenarioviewer.nrel.gov/api/file-list/",
                 project_uuid=project_uuid,
             )
-            for file_record in (
-                f for f in file_list["files"] if f["file_type"] == "CSV"
-            ):
-                file_resp = await retry_async(
-                    self.session.post,
-                    ["https://scenarioviewer.nrel.gov/api/download/"],
-                    kwargs={
-                        "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
-                        "kwargs":{"allow_redirects":False}},
-                )
-                file_headers = file_resp.headers
-                download_filename = f"{file_record['location_type']}.csv"
-
-                if "Location" not in file_headers:
-                    for h in file_headers:
-                        print(f"{h}: {file_headers[h]}")
-                m = filename_pattern.search(file_headers["Location"])
-                if m:
-                    download_filename = m.groups(1)
-                else:
-                    # this will give us e.g.
-                    # (for 2023-2024) "ALL Transmission Capacities.csv" "ALL States.csv"
-                    # (for previous years) "Electrification Nations.csv" "High Natural Gas Prices States.csv"
-                    download_filename = (
-                        f"{file_record['scenario']} {file_record['location_type']}.csv"
-                    )
-
-                download_links[download_filename] = file_headers["Location"]
-            yield self.get_year_resource(download_links, project_year)
+            # for file_record in (
+#
+# ):
+# file_resp = await retry_async(
+# self.session.post,
+# ["https://scenarioviewer.nrel.gov/api/download/"],
+# kwargs={
+# "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
+# "kwargs":{"allow_redirects":False}},
+# )
+# file_headers = file_resp.headers
+# download_filename = f"{file_record['location_type']}.csv"
+#
+# if "Location" not in file_headers:
+# for h in file_headers:
+# print(f"{h}: {file_headers[h]}")
+# m = filename_pattern.search(file_headers["Location"])
+# if m:
+# download_filename = m.groups(1)
+# else:
+# # this will give us e.g.
+# # (for 2023-2024) "ALL Transmission Capacities.csv" "ALL States.csv"
+# # (for previous years) "Electrification Nations.csv" "High Natural Gas Prices States.csv"
+# download_filename = (
+# f"{file_record['scenario']} {file_record['location_type']}.csv"
+# )
+#
+# download_links[download_filename] = file_headers["Location"]
+            yield self.get_year_resource(
+                report=(f"{m.group('fy')}_{m.group('number')}", report_link),
+                uuid=project_uuid,
+                file_ids=[
+                    (f["id"], f"NRELSS {project_year} {f['scenario']} {f['location_type']}.csv".replace(" ","_"))
+                    for f in file_list["files"] if f["file_type"] == "CSV"
+                ],
+                year=project_year
+            )

-    async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo:
+    async def get_year_resource(self, report, uuid, file_ids, year: int) -> ResourceInfo:
         """Download all available data for a year.

         Resulting resource contains one pdf of the scenario report, and a set of CSVs for different scenarios and geo levels.
@@ -109,10 +128,36 @@ async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceI
         """
         zip_path = self.download_directory / f"{self.name}-{year}.zip"
         data_paths_in_archive = set()
-        for filename, link in sorted(links.items()):
-            self.logger.info(f"Downloading {filename} from {link}")
+        # report
+        self.logger.info(f"Downloading report {report[0]} from {report[1]}")
+        download_path = self.download_directory / report[0]
+        await self.download_file(report[1], download_path)
+        self.add_to_archive(
+            zip_path=zip_path,
+            filename=report[0],
+            blob=download_path.open("rb"),
+        )
+        data_paths_in_archive.add(report[0])
+        # Don't want to leave multiple giant files on disk, so delete
+        # immediately after they're safely stored in the ZIP
+        download_path.unlink()
+
+        for file_id,filename in file_ids:
+            self.logger.info(f"Downloading file {file_id} {uuid}")
+# file_resp = await retry_async(
+# self.session.post,
+# ["https://scenarioviewer.nrel.gov/api/download/"],
+# kwargs={
+# "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
+# "kwargs":{"allow_redirects":False}},
+# )
             download_path = self.download_directory / filename
-            await self.download_file(link, download_path)
+            await retry_async(
+                _download_file_post,
+                [self.session, "https://scenarioviewer.nrel.gov/api/download/", download_path],
+                kwargs={"data":{"project_uuid": uuid, "file_ids": file_id}}
+            )
+# await self.download_file(link, download_path)
             self.add_to_archive(
                 zip_path=zip_path,
                 filename=filename,

From 49a2974ae0d57edfce4ae71de072753e5d97535b Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 15:38:27 -0500
Subject: [PATCH 06/10] fix 2020

---
 src/pudl_archiver/archivers/nrelss.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index df85ea1e..176066b7 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -112,7 +112,7 @@ async def post_to_json(url, **kwargs):
                 uuid=project_uuid,
                 file_ids=[
                     (f["id"], f"NRELSS {project_year} {f['scenario']} {f['location_type']}.csv".replace(" ","_"))
-                    for f in file_list["files"] if f["file_type"] == "CSV"
+                    for f in file_list["files"] if (f["file_type"] == "CSV" or project_year == 2020)
                 ],
                 year=project_year
             )
@@ -129,7 +129,7 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource
         zip_path = self.download_directory / f"{self.name}-{year}.zip"
         data_paths_in_archive = set()
         # report
-        self.logger.info(f"Downloading report {report[0]} from {report[1]}")
+        self.logger.info(f"Downloading report {year} {report[0]} from {report[1]}")
         download_path = self.download_directory / report[0]
         await self.download_file(report[1], download_path)
         self.add_to_archive(
@@ -143,7 +143,7 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource
         download_path.unlink()

         for file_id,filename in file_ids:
-            self.logger.info(f"Downloading file {file_id} {uuid}")
+            self.logger.info(f"Downloading file {year} {file_id} {uuid}")

From 7e7b2113a7fe5748571f77bd9a8a32c23659b78e Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 15:56:59 -0500
Subject: [PATCH 07/10] fixed bad file extension

---
 src/pudl_archiver/archivers/nrelss.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index 176066b7..b0cc2eca 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -111,7 +111,7 @@ async def post_to_json(url, **kwargs):
             report=(f"{m.group('fy')}_{m.group('number')}", report_link),
             uuid=project_uuid,
             file_ids=[
-                (f["id"], f"NRELSS {project_year} {f['scenario']} {f['location_type']}.csv".replace(" ","_"))
+                (f["id"], f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(" ","_").lower())
                 for f in file_list["files"] if (f["file_type"] == "CSV" or project_year == 2020)
             ],
             year=project_year

From 0709e376c6240f8dfdd72db3dae80e4f5ed33089 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 16:26:54 -0500
Subject: [PATCH 08/10] fix troublesome filename characters

---
 src/pudl_archiver/archivers/nrelss.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index b0cc2eca..7333bbcf 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -111,7 +111,10 @@ async def post_to_json(url, **kwargs):
             report=(f"{m.group('fy')}_{m.group('number')}", report_link),
             uuid=project_uuid,
             file_ids=[
-                (f["id"], f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(" ","_").lower())
+                (
+                    f["id"],
+                    f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(" ","_").replace("%","pct").replace(",","").lower()
+                )
                 for f in file_list["files"] if (f["file_type"] == "CSV" or project_year == 2020)
             ],
             year=project_year

From 3b9e2d219f99bea351084acfe0be89462a7fab8d Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 16:43:17 -0500
Subject: [PATCH 09/10] successful run

---
 src/pudl_archiver/archivers/nrelss.py | 38 +--------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index 7333bbcf..1800bc28 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -79,34 +79,6 @@ async def post_to_json(url, **kwargs):
                 "https://scenarioviewer.nrel.gov/api/file-list/",
                 project_uuid=project_uuid,
             )
-            # for file_record in (
-#
-# ):
-# file_resp = await retry_async(
-# self.session.post,
-# ["https://scenarioviewer.nrel.gov/api/download/"],
-# kwargs={
-# "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
-# "kwargs":{"allow_redirects":False}},
-# )
-# file_headers = file_resp.headers
-# download_filename = f"{file_record['location_type']}.csv"
-#
-# if "Location" not in file_headers:
-# for h in file_headers:
-# print(f"{h}: {file_headers[h]}")
-# m = filename_pattern.search(file_headers["Location"])
-# if m:
-# download_filename = m.groups(1)
-# else:
-# # this will give us e.g.
-# # (for 2023-2024) "ALL Transmission Capacities.csv" "ALL States.csv"
-# # (for previous years) "Electrification Nations.csv" "High Natural Gas Prices States.csv"
-# download_filename = (
-# f"{file_record['scenario']} {file_record['location_type']}.csv"
-# )
-#
-# download_links[download_filename] = file_headers["Location"]
             yield self.get_year_resource(
                 report=(f"{m.group('fy')}_{m.group('number')}", report_link),
                 uuid=project_uuid,
@@ -147,20 +119,12 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource

         for file_id,filename in file_ids:
             self.logger.info(f"Downloading file {year} {file_id} {uuid}")
-# file_resp = await retry_async(
-# self.session.post,
-# ["https://scenarioviewer.nrel.gov/api/download/"],
-# kwargs={
-# "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
-# "kwargs":{"allow_redirects":False}},
-# )
             download_path = self.download_directory / filename
             await retry_async(
                 _download_file_post,
                 [self.session, "https://scenarioviewer.nrel.gov/api/download/", download_path],
                 kwargs={"data":{"project_uuid": uuid, "file_ids": file_id}}
             )
-# await self.download_file(link, download_path)
             self.add_to_archive(
                 zip_path=zip_path,
                 filename=filename,
@@ -173,5 +137,5 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource
         return ResourceInfo(
             local_path=zip_path,
             partitions={"years": year},
-            layout=ZipLayout(file_paths=data_paths_in_archive),
+            #layout=ZipLayout(file_paths=data_paths_in_archive), # can't use ZipLayout bc these CSVs have a multi-row header and pandas throws a tantrum
         )

From d8e3d2f42b541907a75578ec5c72ba48c711b7df Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 31 Jan 2025 21:43:55 +0000
Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci
---
 src/pudl_archiver/archivers/nrelss.py | 52 +++++++++++++++++----------
 src/pudl_archiver/metadata/sources.py |  4 +--
 2 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index 1800bc28..4b24a7da 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -1,17 +1,17 @@
 """Download NREL Standard Scenarios data."""

-import aiohttp
-from contextlib import nullcontext
 import io
-from pathlib import Path
 import re
+from contextlib import nullcontext
+from pathlib import Path
+
+import aiohttp

 from pudl_archiver.archivers.classes import (
     AbstractDatasetArchiver,
     ArchiveAwaitable,
     ResourceInfo,
 )
-from pudl_archiver.frictionless import ZipLayout
 from pudl_archiver.utils import retry_async

 # The citation field for Standard Scenarios 2021 is blank, but they linked to the
@@ -19,14 +19,16 @@
 # able to hard-code it for now:
 REPORT_2021 = "https://www.nrel.gov/docs/fy22osti/80641.pdf"

+
 async def _download_file_post(
     session: aiohttp.ClientSession, url: str, file: Path | io.BytesIO, **kwargs
 ):
     async with session.post(url, **kwargs) as response:
         with file.open("wb") if isinstance(file, Path) else nullcontext(file) as f:
             async for chunk in response.content.iter_chunked(1024):
                 f.write(chunk)

+
 class NrelStandardScenariosArchiver(AbstractDatasetArchiver):
     """NREL Standard Scenarios archiver."""

@@ -36,7 +38,7 @@ async def get_resources(self) -> ArchiveAwaitable:
         """Download NREL Standard Scenarios resources."""

         async def post_to_json(url, **kwargs):
-            resp = await retry_async(self.session.post, [url], kwargs={"data":kwargs})
+            resp = await retry_async(self.session.post, [url], kwargs={"data": kwargs})
kwargs={"data": kwargs}) return await retry_async(resp.json) project_year_pattern = re.compile(r"Standard Scenarios (?P\d{4})") @@ -45,7 +47,9 @@ async def post_to_json(url, **kwargs): ) filename_pattern = re.compile(r"/([^/?]*/.csv)") - project_records = await self.get_json("https://scenarioviewer.nrel.gov/api/projects/") + project_records = await self.get_json( + "https://scenarioviewer.nrel.gov/api/projects/" + ) for scenario_project in ( p for p in project_records if p["name"].startswith("Standard Scenarios") ): @@ -74,7 +78,7 @@ async def post_to_json(url, **kwargs): f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:" f"{scenario_project}" ) - + file_list = await post_to_json( "https://scenarioviewer.nrel.gov/api/file-list/", project_uuid=project_uuid, @@ -84,15 +88,23 @@ async def post_to_json(url, **kwargs): uuid=project_uuid, file_ids=[ ( - f["id"], - f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(" ","_").replace("%","pct").replace(",","").lower() + f["id"], + f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace( + " ", "_" + ) + .replace("%", "pct") + .replace(",", "") + .lower(), ) - for f in file_list["files"] if (f["file_type"] == "CSV" or project_year == 2020) + for f in file_list["files"] + if (f["file_type"] == "CSV" or project_year == 2020) ], - year=project_year + year=project_year, ) - async def get_year_resource(self, report, uuid, file_ids, year: int) -> ResourceInfo: + async def get_year_resource( + self, report, uuid, file_ids, year: int + ) -> ResourceInfo: """Download all available data for a year. Resulting resource contains one pdf of the scenario report, and a set of CSVs for different scenarios and geo levels. 
@@ -116,14 +128,18 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource # Don't want to leave multiple giant files on disk, so delete # immediately after they're safely stored in the ZIP download_path.unlink() - - for file_id,filename in file_ids: + + for file_id, filename in file_ids: self.logger.info(f"Downloading file {year} {file_id} {uuid}") download_path = self.download_directory / filename await retry_async( - _download_file_post, - [self.session, "https://scenarioviewer.nrel.gov/api/download/", download_path], - kwargs={"data":{"project_uuid": uuid, "file_ids": file_id}} + _download_file_post, + [ + self.session, + "https://scenarioviewer.nrel.gov/api/download/", + download_path, + ], + kwargs={"data": {"project_uuid": uuid, "file_ids": file_id}}, ) self.add_to_archive( zip_path=zip_path, @@ -137,5 +153,5 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource return ResourceInfo( local_path=zip_path, partitions={"years": year}, - #layout=ZipLayout(file_paths=data_paths_in_archive), # can't use ZipLayout bc these CSVs have a multi-row header and pandas throws a tantrum + # layout=ZipLayout(file_paths=data_paths_in_archive), # can't use ZipLayout bc these CSVs have a multi-row header and pandas throws a tantrum ) diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index 10ee33e5..cae8609b 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -2,7 +2,7 @@ from typing import Any -from pudl.metadata.constants import CONTRIBUTORS, LICENSES, KEYWORDS +from pudl.metadata.constants import CONTRIBUTORS, LICENSES # To add a new contributor, follow the following format to add an entry to the # ADDL_CONTRIBUTORS dictionary below formatted like this: @@ -447,7 +447,7 @@ { "nrel", "standard scenarios", - } #+ KEYWORDS["us_govt"] + KEYWORDS["electricity"] + } # + KEYWORDS["us_govt"] + KEYWORDS["electricity"] ), "license_raw": LICENSES["cc-by-4.0"], "license_pudl": LICENSES["cc-by-4.0"],
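A closing note on the ZipLayout line that the final commit leaves disabled: the comment says these scenario CSVs carry a multi-row header, which breaks a plain pandas read. If a downstream consumer needs the files anyway, passing the header depth explicitly is one way through. A sketch under the assumption of a two-row header — the column layout below is invented, and the real files' header depth would need to be confirmed first:

# Sketch: read a CSV whose first two rows are both headers via header=[0, 1],
# which makes pandas build a MultiIndex of columns. The sample content is invented.
import io

import pandas as pd

csv_text = (
    "scenario,Capacity (GW),Capacity (GW)\n"
    "scenario,Solar,Wind\n"
    "Mid Case,100,200\n"
)
df = pd.read_csv(io.StringIO(csv_text), header=[0, 1])
print(df[("Capacity (GW)", "Solar")].tolist())  # [100]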