From b09c22e0e7181288e1becbb47aff2a96a23eee85 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Thu, 30 Jan 2025 16:28:02 -0500
Subject: [PATCH 01/10] Add metadata for NREL standard scenarios

---
 src/pudl_archiver/metadata/sources.py | 36 +++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py
index 411735df..650bcb4c 100644
--- a/src/pudl_archiver/metadata/sources.py
+++ b/src/pudl_archiver/metadata/sources.py
@@ -416,4 +416,40 @@
         "license_pudl": LICENSES["cc-by-4.0"],
         "contributors": [CONTRIBUTORS["catalyst-cooperative"]],
     },
+    "nrelss": {
+        "title": "NREL Standard Scenarios",
+        "path": "https://www.nrel.gov/analysis/standard-scenarios.html",
+        "description": (
+            "NREL's Standard Scenarios are a suite of forward-looking scenarios of the U.S. "
+            "power sector that are updated annually to support and inform energy analysis. "
+            "The Standard Scenarios are simulated using the Regional Energy Deployment System "
+            "and Distributed Generation Market Demand Model capacity expansion models and are "
+            "updated each year to provide timely information regarding power sector evolution. "
+            "The scenarios have been designed to capture a range of possible power system "
+            "futures and consider a variety of factors from high vehicle electrification to "
+            "major cost declines for electricity generation technologies (e.g., using cost "
+            "inputs from the Annual Technology Baseline). "
+            "For select scenarios, the models are run using the PLEXOS software and the "
+            "Cambium tool that assembles structured data sets of hourly cost, emissions, and "
+            "operational data for modeled futures. Results are available using the Scenario "
+            "Viewer and Data Downloader."
+        ),
+        "source_file_dict": {
+            "source_format": "CSV",
+        },
+        "working_partitions": {
+            "years": list(range(2016, 2025)),
+        },
+        "contributors": [
+            CONTRIBUTORS["catalyst-cooperative"],
+        ],
+        "keywords": sorted(
+            {
+                "nrel",
+                "standard scenarios",
+            }
+        ),
+        "license_raw": LICENSES["cc-by-4.0"],
+        "license_pudl": LICENSES["cc-by-4.0"],
+    },
 }

From 07f4697a239ea2fb9129a0d60ba17d8076fbc8db Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Thu, 30 Jan 2025 17:57:23 -0500
Subject: [PATCH 02/10] Split get_hyperlinks so we can run it on text as well
 as on a url

---
 src/pudl_archiver/archivers/classes.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/pudl_archiver/archivers/classes.py b/src/pudl_archiver/archivers/classes.py
index 217a8ebe..728830e0 100644
--- a/src/pudl_archiver/archivers/classes.py
+++ b/src/pudl_archiver/archivers/classes.py
@@ -260,7 +260,6 @@ async def get_hyperlinks(
             headers: Additional headers to send in the GET request.
         """
         # Parse web page to get all hyperlinks
-        parser = _HyperlinkExtractor()
         response = await retry_async(
             self.session.get,
             [url],
             kwargs={
                 "headers": headers,
             },
         )
         text = await retry_async(response.text)
+        return self.get_hyperlinks_from_text(text, filter_pattern)
+
+    def get_hyperlinks_from_text(
+        self,
+        text: str,
+        filter_pattern: typing.Pattern | None = None,
+    ) -> list[str]:
+        """Return all hyperlinks from HTML text.
+
+        This is a lower-level helper that performs very basic HTML parsing.
+        It extracts all hyperlinks from HTML text, and returns those that match
+        a specified pattern. This means it can find all hyperlinks that look like
+        a download link to a single data resource.
+
+        Args:
+            text: text containing HTML.
+            filter_pattern: If present, only return links that contain pattern.
+        """
+        parser = _HyperlinkExtractor()
         parser.feed(text)

         # Filter to those that match filter_pattern
@@ -282,7 +300,7 @@
         if not hyperlinks:
             self.logger.warning(
                 f"The archiver couldn't find any hyperlinks{('that match: ' + filter_pattern.pattern) if filter_pattern else ''}."
-                f"Make sure your filter_pattern is correct, check if the structure of the {url} page changed, or if you are missing HTTP headers."
+                f" Make sure your filter_pattern is correct, and check whether the structure of the page matches what you expect."
             )
         return hyperlinks
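A note on the split above: get_hyperlinks() keeps its fetching role, while get_hyperlinks_from_text() can be pointed at any HTML fragment, such as the citation field the scenario-viewer API returns in the next patch. Below is a minimal, self-contained sketch of that parse-and-filter behavior; the _HyperlinkExtractor here is a stand-in for the one in classes.py (whose implementation this diff does not show), and the citation snippet and report number are invented for illustration.

# Stand-in sketch of the parse-and-filter behavior of get_hyperlinks_from_text().
import re
from html.parser import HTMLParser


class _HyperlinkExtractor(HTMLParser):
    """Collect the href attribute of every <a> tag encountered."""

    def __init__(self):
        super().__init__()
        self.hyperlinks = set()

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for attr, val in attrs:
                if attr == "href" and val:
                    self.hyperlinks.add(val)


# Invented example input: a citation blurb with an embedded report link.
citation = (
    '<p>NREL (2024). Standard Scenarios Report. '
    '<a href="https://www.nrel.gov/docs/fy25osti/12345.pdf">PDF</a>.</p>'
)
report_url_pattern = re.compile(r"https://www.nrel.gov/docs/fy\d{2}osti/\d{5}\.pdf")

parser = _HyperlinkExtractor()
parser.feed(citation)
hyperlinks = [link for link in parser.hyperlinks if report_url_pattern.search(link)]
print(hyperlinks)  # ['https://www.nrel.gov/docs/fy25osti/12345.pdf']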
project_uuid, "file_ids": file_record["id"]}, + ) + file_headers = file_resp.headers() + download_filename = f"{file_record['location_type']}.csv" + + m = filename_pattern.search(file_headers["Location"]) + if m: + download_filename = m.groups(1) + else: + # this will give us e.g. + # (for 2023-2024) "ALL Transmission Capacities.csv" "ALL States.csv" + # (for previous years) "Electrification Nations.csv" "High Natural Gas Prices States.csv" + download_filename = ( + f"{file_record['scenario']} {file_record['location_type']}.csv" + ) + + download_links[download_filename] = file_headers["Location"] + yield self.get_year_resource(download_links, project_year) + + async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo: + """Download all available data for a year. + + Resulting resource contains one pdf of the scenario report, and a set of CSVs for different scenarios and geo levels. + + Args: + links: filename->URL mapping for files to download + year: the year we're downloading data for + """ + zip_path = self.download_directory / f"{self.name}-{year}.zip" + data_paths_in_archive = set() + for filename, link in sorted(links.items()): + self.logger.info(f"Downloading {filename} from {link}") + download_path = self.download_directory / filename + await self.download_file(link, download_path) + self.add_to_archive( + zip_path=zip_path, + filename=filename, + blob=download_path.open("rb"), + ) + data_paths_in_archive.add(filename) + # Don't want to leave multiple giant files on disk, so delete + # immediately after they're safely stored in the ZIP + download_path.unlink() + return ResourceInfo( + local_path=zip_path, + partitions={"years": year}, + layout=ZipLayout(file_paths=data_paths_in_archive), + ) From 24844d4792ded01aed5dc2038af6d3b61514933d Mon Sep 17 00:00:00 2001 From: Kathryn Mazaitis Date: Fri, 31 Jan 2025 15:04:55 -0500 Subject: [PATCH 04/10] [wip] getting working emergency --- src/pudl_archiver/archivers/nrelss.py | 26 +++++++++++++++++++------- src/pudl_archiver/metadata/sources.py | 4 ++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py index dcd3b3f1..5c3f48a8 100644 --- a/src/pudl_archiver/archivers/nrelss.py +++ b/src/pudl_archiver/archivers/nrelss.py @@ -25,16 +25,16 @@ async def get_resources(self) -> ArchiveAwaitable: """Download NREL Standard Scenarios resources.""" async def post_to_json(url, **kwargs): - resp = await retry_async(self.session.post, [url], data=kwargs) + resp = await retry_async(self.session.post, [url], kwargs={"data":kwargs}) return await retry_async(resp.json) project_year_pattern = re.compile(r"Standard Scenarios (?P\d{4})") report_url_pattern = re.compile( - r"http://www.nrel.gov/docs/(?Pfy\d{2}osti)/(?P\d{5}\.pdf)" + r"https://www.nrel.gov/docs/(?Pfy\d{2}osti)/(?P\d{5}\.pdf)" ) filename_pattern = re.compile(r"/([^/?]*/.csv)") - project_records = self.get_json("https://scenarioviewer.nrel.gov/api/projects/") + project_records = await self.get_json("https://scenarioviewer.nrel.gov/api/projects/") for scenario_project in ( p for p in project_records if p["name"].startswith("Standard Scenarios") ): @@ -47,7 +47,14 @@ async def post_to_json(url, **kwargs): if scenario_project["citation"]: report_link = self.get_hyperlinks_from_text( scenario_project["citation"], report_url_pattern - ).pop() + ) + if report_link: + report_link = report_link.pop() + else: + raise AssertionError( + f"We expect all years except 2021 to have a citation with 
From 24844d4792ded01aed5dc2038af6d3b61514933d Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 15:04:55 -0500
Subject: [PATCH 04/10] [wip] emergency fixes to get the archiver working

---
 src/pudl_archiver/archivers/nrelss.py | 26 +++++++++++++++++++-------
 src/pudl_archiver/metadata/sources.py |  4 ++--
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index dcd3b3f1..5c3f48a8 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -25,16 +25,16 @@ async def get_resources(self) -> ArchiveAwaitable:
         """Download NREL Standard Scenarios resources."""

         async def post_to_json(url, **kwargs):
-            resp = await retry_async(self.session.post, [url], data=kwargs)
+            resp = await retry_async(self.session.post, [url], kwargs={"data":kwargs})
             return await retry_async(resp.json)

         project_year_pattern = re.compile(r"Standard Scenarios (?P<year>\d{4})")
         report_url_pattern = re.compile(
-            r"http://www.nrel.gov/docs/(?P<fy>fy\d{2}osti)/(?P<number>\d{5}\.pdf)"
+            r"https://www.nrel.gov/docs/(?P<fy>fy\d{2}osti)/(?P<number>\d{5}\.pdf)"
         )
         filename_pattern = re.compile(r"/([^/?]*/.csv)")

-        project_records = self.get_json("https://scenarioviewer.nrel.gov/api/projects/")
+        project_records = await self.get_json("https://scenarioviewer.nrel.gov/api/projects/")
         for scenario_project in (
             p for p in project_records if p["name"].startswith("Standard Scenarios")
         ):
@@ -47,7 +47,14 @@ async def post_to_json(url, **kwargs):
             if scenario_project["citation"]:
                 report_link = self.get_hyperlinks_from_text(
                     scenario_project["citation"], report_url_pattern
-                ).pop()
+                )
+                if report_link:
+                    report_link = report_link.pop()
+                else:
+                    raise AssertionError(
+                        f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:"
+                        f"{scenario_project}"
+                    )
             elif project_year == 2021:
                 report_link = REPORT_2021
             m = report_url_pattern.search(report_link)
@@ -57,7 +64,7 @@ async def post_to_json(url, **kwargs):
                     f"{scenario_project}"
                 )
             download_links = {f"{m.group('fy')}_{m.group('number')}": report_link}
-            file_list = post_to_json(
+            file_list = await post_to_json(
                 "https://scenarioviewer.nrel.gov/api/file-list/",
                 project_uuid=project_uuid,
             )
@@ -67,11 +74,16 @@ async def post_to_json(url, **kwargs):
                 file_resp = await retry_async(
                     self.session.post,
                     ["https://scenarioviewer.nrel.gov/api/download/"],
-                    data={"project_uuid": project_uuid, "file_ids": file_record["id"]},
+                    kwargs={
+                        "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
+                        "kwargs":{"allow_redirects":False}},
                 )
-                file_headers = file_resp.headers()
+                file_headers = file_resp.headers
                 download_filename = f"{file_record['location_type']}.csv"

+                if "Location" not in file_headers:
+                    for h in file_headers:
+                        print(f"{h}: {file_headers[h]}")
                 m = filename_pattern.search(file_headers["Location"])
                 if m:
                     download_filename = m.groups(1)
diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py
index 650bcb4c..10ee33e5 100644
--- a/src/pudl_archiver/metadata/sources.py
+++ b/src/pudl_archiver/metadata/sources.py
@@ -2,7 +2,7 @@

 from typing import Any

-from pudl.metadata.constants import CONTRIBUTORS, LICENSES
+from pudl.metadata.constants import CONTRIBUTORS, LICENSES, KEYWORDS

 # To add a new contributor, follow the following format to add an entry to the
 # ADDL_CONTRIBUTORS dictionary below formatted like this:
@@ -447,7 +447,7 @@
             {
                 "nrel",
                 "standard scenarios",
-            }
+            } #+ KEYWORDS["us_govt"] + KEYWORDS["electricity"]
         ),
         "license_raw": LICENSES["cc-by-4.0"],
         "license_pudl": LICENSES["cc-by-4.0"],
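PATCH 04 disables redirect-following on the /api/download/ probe. The point of allow_redirects=False is that an HTTP 302 then comes back as-is, so its Location header — the redirect target, whose path carries the real filename — stays readable. A minimal sketch of just that probe, assuming the endpoint answers the POST with a redirect (the arguments are placeholders, and the behavior is inferred from the patch rather than verified against the live API):

# Sketch: inspect the Location header of /api/download/ without following it.
import asyncio

import aiohttp


async def probe_download_location(project_uuid: str, file_id: int) -> str | None:
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "https://scenarioviewer.nrel.gov/api/download/",
            data={"project_uuid": project_uuid, "file_ids": file_id},
            allow_redirects=False,
        ) as resp:
            # On a 302, the target URL (and thus the filename) is in "Location".
            return resp.headers.get("Location")


# Example with placeholder identifiers:
# asyncio.run(probe_download_location("00000000-0000-0000-0000-000000000000", 123))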
From e08bb8d7d2342d0486e428d24f4dc51d8aeda63e Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 15:30:00 -0500
Subject: [PATCH 05/10] running

---
 src/pudl_archiver/archivers/nrelss.py | 113 ++++++++++++++++++--------
 1 file changed, 79 insertions(+), 34 deletions(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index 5c3f48a8..df85ea1e 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -1,5 +1,9 @@
 """Download NREL Standard Scenarios data."""

+import aiohttp
+from contextlib import nullcontext
+import io
+from pathlib import Path
 import re

 from pudl_archiver.archivers.classes import (
@@ -15,6 +19,13 @@
 # able to hard-code it for now:
 REPORT_2021 = "https://www.nrel.gov/docs/fy22osti/80641.pdf"

+async def _download_file_post(
+    session: aiohttp.ClientSession, url: str, file: Path | io.BytesIO, **kwargs
+):
+    async with session.post(url, **kwargs) as response:
+        with file.open("wb") if isinstance(file, Path) else nullcontext(file) as f:
+            async for chunk in response.content.iter_chunked(1024):
+                f.write(chunk)

 class NrelStandardScenariosArchiver(AbstractDatasetArchiver):
     """NREL Standard Scenarios archiver."""
@@ -63,42 +74,50 @@ async def post_to_json(url, **kwargs):
                         f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:"
                         f"{scenario_project}"
                     )
-            download_links = {f"{m.group('fy')}_{m.group('number')}": report_link}
+
             file_list = await post_to_json(
                 "https://scenarioviewer.nrel.gov/api/file-list/",
                 project_uuid=project_uuid,
             )
-            for file_record in (
-                f for f in file_list["files"] if f["file_type"] == "CSV"
-            ):
-                file_resp = await retry_async(
-                    self.session.post,
-                    ["https://scenarioviewer.nrel.gov/api/download/"],
-                    kwargs={
-                        "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
-                        "kwargs":{"allow_redirects":False}},
-                )
-                file_headers = file_resp.headers
-                download_filename = f"{file_record['location_type']}.csv"
-
-                if "Location" not in file_headers:
-                    for h in file_headers:
-                        print(f"{h}: {file_headers[h]}")
-                m = filename_pattern.search(file_headers["Location"])
-                if m:
-                    download_filename = m.groups(1)
-                else:
-                    # this will give us e.g.
-                    # (for 2023-2024) "ALL Transmission Capacities.csv" "ALL States.csv"
-                    # (for previous years) "Electrification Nations.csv" "High Natural Gas Prices States.csv"
-                    download_filename = (
-                        f"{file_record['scenario']} {file_record['location_type']}.csv"
-                    )
-
-                download_links[download_filename] = file_headers["Location"]
-            yield self.get_year_resource(download_links, project_year)
+            # for file_record in (
+#
+# ):
+# file_resp = await retry_async(
+# self.session.post,
+# ["https://scenarioviewer.nrel.gov/api/download/"],
+# kwargs={
+# "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
+# "kwargs":{"allow_redirects":False}},
+# )
+# file_headers = file_resp.headers
+# download_filename = f"{file_record['location_type']}.csv"
+#
+# if "Location" not in file_headers:
+# for h in file_headers:
+# print(f"{h}: {file_headers[h]}")
+# m = filename_pattern.search(file_headers["Location"])
+# if m:
+# download_filename = m.groups(1)
+# else:
+# # this will give us e.g.
+# # (for 2023-2024) "ALL Transmission Capacities.csv" "ALL States.csv"
+# # (for previous years) "Electrification Nations.csv" "High Natural Gas Prices States.csv"
+# download_filename = (
+# f"{file_record['scenario']} {file_record['location_type']}.csv"
+# )
+#
+# download_links[download_filename] = file_headers["Location"]
+            yield self.get_year_resource(
+                report=(f"{m.group('fy')}_{m.group('number')}", report_link),
+                uuid=project_uuid,
+                file_ids=[
+                    (f["id"], f"NRELSS {project_year} {f['scenario']} {f['location_type']}.csv".replace(" ","_"))
+                    for f in file_list["files"] if f["file_type"] == "CSV"
+                ],
+                year=project_year
+            )

-    async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo:
+    async def get_year_resource(self, report, uuid, file_ids, year: int) -> ResourceInfo:
         """Download all available data for a year.

         Resulting resource contains one pdf of the scenario report, and a set of CSVs for different scenarios and geo levels.
@@ -109,10 +128,36 @@ async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceI
         """
         zip_path = self.download_directory / f"{self.name}-{year}.zip"
         data_paths_in_archive = set()
-        for filename, link in sorted(links.items()):
-            self.logger.info(f"Downloading {filename} from {link}")
+        # report
+        self.logger.info(f"Downloading report {report[0]} from {report[1]}")
+        download_path = self.download_directory / report[0]
+        await self.download_file(report[1], download_path)
+        self.add_to_archive(
+            zip_path=zip_path,
+            filename=report[0],
+            blob=download_path.open("rb"),
+        )
+        data_paths_in_archive.add(report[0])
+        # Don't want to leave multiple giant files on disk, so delete
+        # immediately after they're safely stored in the ZIP
+        download_path.unlink()
+
+        for file_id,filename in file_ids:
+            self.logger.info(f"Downloading file {file_id} {uuid}")
+# file_resp = await retry_async(
+# self.session.post,
+# ["https://scenarioviewer.nrel.gov/api/download/"],
+# kwargs={
+# "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
+# "kwargs":{"allow_redirects":False}},
+# )
             download_path = self.download_directory / filename
-            await self.download_file(link, download_path)
+            await retry_async(
+                _download_file_post,
+                [self.session, "https://scenarioviewer.nrel.gov/api/download/", download_path],
+                kwargs={"data":{"project_uuid": uuid, "file_ids": file_id}}
+            )
+# await self.download_file(link, download_path)
             self.add_to_archive(
                 zip_path=zip_path,
                 filename=filename,

From 49a2974ae0d57edfce4ae71de072753e5d97535b Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 15:38:27 -0500
Subject: [PATCH 06/10] fix 2020

---
 src/pudl_archiver/archivers/nrelss.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index df85ea1e..176066b7 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -112,7 +112,7 @@ async def post_to_json(url, **kwargs):
                 uuid=project_uuid,
                 file_ids=[
                     (f["id"], f"NRELSS {project_year} {f['scenario']} {f['location_type']}.csv".replace(" ","_"))
-                    for f in file_list["files"] if f["file_type"] == "CSV"
+                    for f in file_list["files"] if (f["file_type"] == "CSV" or project_year == 2020)
                 ],
                 year=project_year
             )
@@ -129,7 +129,7 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource
         zip_path = self.download_directory / f"{self.name}-{year}.zip"
         data_paths_in_archive = set()
         # report
-        self.logger.info(f"Downloading report {report[0]} from {report[1]}")
+        self.logger.info(f"Downloading report {year} {report[0]} from {report[1]}")
         download_path = self.download_directory / report[0]
         await self.download_file(report[1], download_path)
         self.add_to_archive(
@@ -143,7 +143,7 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource
         download_path.unlink()

         for file_id,filename in file_ids:
-            self.logger.info(f"Downloading file {file_id} {uuid}")
+            self.logger.info(f"Downloading file {year} {file_id} {uuid}")

From 7e7b2113a7fe5748571f77bd9a8a32c23659b78e Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 15:56:59 -0500
Subject: [PATCH 07/10] fixed bad file extension

---
 src/pudl_archiver/archivers/nrelss.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index 176066b7..b0cc2eca 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -111,7 +111,7 @@ async def post_to_json(url, **kwargs):
             report=(f"{m.group('fy')}_{m.group('number')}", report_link),
             uuid=project_uuid,
             file_ids=[
-                (f["id"], f"NRELSS {project_year} {f['scenario']} {f['location_type']}.csv".replace(" ","_"))
+                (f["id"], f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(" ","_").lower())
                 for f in file_list["files"] if (f["file_type"] == "CSV" or project_year == 2020)
             ],
             year=project_year

From 0709e376c6240f8dfdd72db3dae80e4f5ed33089 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 16:26:54 -0500
Subject: [PATCH 08/10] fix troublesome filename characters

---
 src/pudl_archiver/archivers/nrelss.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index b0cc2eca..7333bbcf 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -111,7 +111,10 @@ async def post_to_json(url, **kwargs):
             report=(f"{m.group('fy')}_{m.group('number')}", report_link),
             uuid=project_uuid,
             file_ids=[
-                (f["id"], f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(" ","_").lower())
+                (
+                    f["id"],
+                    f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(" ","_").replace("%","pct").replace(",","").lower()
+                )
                 for f in file_list["files"] if (f["file_type"] == "CSV" or project_year == 2020)
             ],
             year=project_year

From 3b9e2d219f99bea351084acfe0be89462a7fab8d Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 31 Jan 2025 16:43:17 -0500
Subject: [PATCH 09/10] successful run

---
 src/pudl_archiver/archivers/nrelss.py | 38 +--------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index 7333bbcf..1800bc28 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -79,34 +79,6 @@ async def post_to_json(url, **kwargs):
                 "https://scenarioviewer.nrel.gov/api/file-list/",
                 project_uuid=project_uuid,
             )
-            # for file_record in (
-#
-# ):
-# file_resp = await retry_async(
-# self.session.post,
-# ["https://scenarioviewer.nrel.gov/api/download/"],
-# kwargs={
-# "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
-# "kwargs":{"allow_redirects":False}},
-# )
-# file_headers = file_resp.headers
-# download_filename = f"{file_record['location_type']}.csv"
-#
-# if "Location" not in file_headers:
-# for h in file_headers:
-# print(f"{h}: {file_headers[h]}")
-# m = filename_pattern.search(file_headers["Location"])
-# if m:
-# download_filename = m.groups(1)
-# else:
-# # this will give us e.g.
-# # (for 2023-2024) "ALL Transmission Capacities.csv" "ALL States.csv"
-# # (for previous years) "Electrification Nations.csv" "High Natural Gas Prices States.csv"
-# download_filename = (
-# f"{file_record['scenario']} {file_record['location_type']}.csv"
-# )
-#
-# download_links[download_filename] = file_headers["Location"]
             yield self.get_year_resource(
                 report=(f"{m.group('fy')}_{m.group('number')}", report_link),
                 uuid=project_uuid,
@@ -147,20 +119,12 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource

         for file_id,filename in file_ids:
             self.logger.info(f"Downloading file {year} {file_id} {uuid}")
-# file_resp = await retry_async(
-# self.session.post,
-# ["https://scenarioviewer.nrel.gov/api/download/"],
-# kwargs={
-# "data":{"project_uuid": project_uuid, "file_ids": file_record["id"]},
-# "kwargs":{"allow_redirects":False}},
-# )
             download_path = self.download_directory / filename
             await retry_async(
                 _download_file_post,
                 [self.session, "https://scenarioviewer.nrel.gov/api/download/", download_path],
                 kwargs={"data":{"project_uuid": uuid, "file_ids": file_id}}
             )
-# await self.download_file(link, download_path)
             self.add_to_archive(
                 zip_path=zip_path,
                 filename=filename,
@@ -173,5 +137,5 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource
         return ResourceInfo(
             local_path=zip_path,
             partitions={"years": year},
-            layout=ZipLayout(file_paths=data_paths_in_archive),
+            #layout=ZipLayout(file_paths=data_paths_in_archive), # can't use ZipLayout bc these CSVs have a multi-row header and pandas throws a tantrum
         )

From d8e3d2f42b541907a75578ec5c72ba48c711b7df Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 31 Jan 2025 21:43:55 +0000
Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci
---
 src/pudl_archiver/archivers/nrelss.py | 52 +++++++++++++++++----------
 src/pudl_archiver/metadata/sources.py |  4 +--
 2 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/src/pudl_archiver/archivers/nrelss.py b/src/pudl_archiver/archivers/nrelss.py
index 1800bc28..4b24a7da 100644
--- a/src/pudl_archiver/archivers/nrelss.py
+++ b/src/pudl_archiver/archivers/nrelss.py
@@ -1,17 +1,17 @@
 """Download NREL Standard Scenarios data."""

-import aiohttp
-from contextlib import nullcontext
 import io
-from pathlib import Path
 import re
+from contextlib import nullcontext
+from pathlib import Path
+
+import aiohttp

 from pudl_archiver.archivers.classes import (
     AbstractDatasetArchiver,
     ArchiveAwaitable,
     ResourceInfo,
 )
-from pudl_archiver.frictionless import ZipLayout
 from pudl_archiver.utils import retry_async

 # The citation field for Standard Scenarios 2021 is blank, but they linked to the
@@ -19,14 +19,16 @@
 # able to hard-code it for now:
 REPORT_2021 = "https://www.nrel.gov/docs/fy22osti/80641.pdf"

+
 async def _download_file_post(
     session: aiohttp.ClientSession, url: str, file: Path | io.BytesIO, **kwargs
 ):
     async with session.post(url, **kwargs) as response:
         with file.open("wb") if isinstance(file, Path) else nullcontext(file) as f:
             async for chunk in response.content.iter_chunked(1024):
                 f.write(chunk)

+
 class NrelStandardScenariosArchiver(AbstractDatasetArchiver):
     """NREL Standard Scenarios archiver."""

@@ -36,7 +38,7 @@ async def get_resources(self) -> ArchiveAwaitable:
         """Download NREL Standard Scenarios resources."""

         async def post_to_json(url, **kwargs):
-            resp = await retry_async(self.session.post, [url], kwargs={"data":kwargs})
+            resp = await retry_async(self.session.post, [url], kwargs={"data": kwargs})
kwargs={"data": kwargs}) return await retry_async(resp.json) project_year_pattern = re.compile(r"Standard Scenarios (?P\d{4})") @@ -45,7 +47,9 @@ async def post_to_json(url, **kwargs): ) filename_pattern = re.compile(r"/([^/?]*/.csv)") - project_records = await self.get_json("https://scenarioviewer.nrel.gov/api/projects/") + project_records = await self.get_json( + "https://scenarioviewer.nrel.gov/api/projects/" + ) for scenario_project in ( p for p in project_records if p["name"].startswith("Standard Scenarios") ): @@ -74,7 +78,7 @@ async def post_to_json(url, **kwargs): f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:" f"{scenario_project}" ) - + file_list = await post_to_json( "https://scenarioviewer.nrel.gov/api/file-list/", project_uuid=project_uuid, @@ -84,15 +88,23 @@ async def post_to_json(url, **kwargs): uuid=project_uuid, file_ids=[ ( - f["id"], - f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(" ","_").replace("%","pct").replace(",","").lower() + f["id"], + f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace( + " ", "_" + ) + .replace("%", "pct") + .replace(",", "") + .lower(), ) - for f in file_list["files"] if (f["file_type"] == "CSV" or project_year == 2020) + for f in file_list["files"] + if (f["file_type"] == "CSV" or project_year == 2020) ], - year=project_year + year=project_year, ) - async def get_year_resource(self, report, uuid, file_ids, year: int) -> ResourceInfo: + async def get_year_resource( + self, report, uuid, file_ids, year: int + ) -> ResourceInfo: """Download all available data for a year. Resulting resource contains one pdf of the scenario report, and a set of CSVs for different scenarios and geo levels. 
@@ -116,14 +128,18 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource # Don't want to leave multiple giant files on disk, so delete # immediately after they're safely stored in the ZIP download_path.unlink() - - for file_id,filename in file_ids: + + for file_id, filename in file_ids: self.logger.info(f"Downloading file {year} {file_id} {uuid}") download_path = self.download_directory / filename await retry_async( - _download_file_post, - [self.session, "https://scenarioviewer.nrel.gov/api/download/", download_path], - kwargs={"data":{"project_uuid": uuid, "file_ids": file_id}} + _download_file_post, + [ + self.session, + "https://scenarioviewer.nrel.gov/api/download/", + download_path, + ], + kwargs={"data": {"project_uuid": uuid, "file_ids": file_id}}, ) self.add_to_archive( zip_path=zip_path, @@ -137,5 +153,5 @@ async def get_year_resource(self, report, uuid, file_ids, year: int) -> Resource return ResourceInfo( local_path=zip_path, partitions={"years": year}, - #layout=ZipLayout(file_paths=data_paths_in_archive), # can't use ZipLayout bc these CSVs have a multi-row header and pandas throws a tantrum + # layout=ZipLayout(file_paths=data_paths_in_archive), # can't use ZipLayout bc these CSVs have a multi-row header and pandas throws a tantrum ) diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index 10ee33e5..cae8609b 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -2,7 +2,7 @@ from typing import Any -from pudl.metadata.constants import CONTRIBUTORS, LICENSES, KEYWORDS +from pudl.metadata.constants import CONTRIBUTORS, LICENSES # To add a new contributor, follow the following format to add an entry to the # ADDL_CONTRIBUTORS dictionary below formatted like this: @@ -447,7 +447,7 @@ { "nrel", "standard scenarios", - } #+ KEYWORDS["us_govt"] + KEYWORDS["electricity"] + } # + KEYWORDS["us_govt"] + KEYWORDS["electricity"] ), "license_raw": LICENSES["cc-by-4.0"], "license_pudl": LICENSES["cc-by-4.0"],
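A closing note on the ZipLayout line that the final commit leaves disabled: the comment says these scenario CSVs carry a multi-row header, which breaks a plain pandas read. If a downstream consumer needs the files anyway, passing the header depth explicitly is one way through. A sketch under the assumption of a two-row header — the column layout below is invented, and the real files' header depth would need to be confirmed first:

# Sketch: read a CSV whose first two rows are both headers via header=[0, 1],
# which makes pandas build a MultiIndex of columns. The sample content is invented.
import io

import pandas as pd

csv_text = (
    "scenario,Capacity (GW),Capacity (GW)\n"
    "scenario,Solar,Wind\n"
    "Mid Case,100,200\n"
)
df = pd.read_csv(io.StringIO(csv_text), header=[0, 1])
print(df[("Capacity (GW)", "Solar")].tolist())  # [100]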