Add archiver for NREL Standard Scenarios #563

Draft · wants to merge 10 commits into base: main
22 changes: 20 additions & 2 deletions src/pudl_archiver/archivers/classes.py
@@ -260,7 +260,6 @@ async def get_hyperlinks(
headers: Additional headers to send in the GET request.
"""
# Parse web page to get all hyperlinks
parser = _HyperlinkExtractor()

response = await retry_async(
self.session.get,
@@ -271,6 +270,25 @@ async def get_hyperlinks(
},
)
text = await retry_async(response.text)
return self.get_hyperlinks_from_text(text, filter_pattern)

def get_hyperlinks_from_text(
self,
text: str,
filter_pattern: typing.Pattern | None = None,
) -> list[str]:
"""Return all hyperlinks from HTML text.

This helper performs very basic HTML parsing: it extracts all hyperlinks
from HTML text and returns those that match a specified pattern, e.g. all
hyperlinks that look like download links for individual data resources.

Args:
text: text containing HTML.
filter_pattern: If present, only return links that contain pattern.
"""
parser = _HyperlinkExtractor()
Comment from Contributor Author:

I believe I have carefully sliced this to not conflict with Marianne's get_hyperlink changes, but I'll handle any massaging necessary if not.

parser.feed(text)

# Filter to those that match filter_pattern
@@ -282,7 +300,7 @@ async def get_hyperlinks(
if not hyperlinks:
self.logger.warning(
f"The archiver couldn't find any hyperlinks{('that match: ' + filter_pattern.pattern) if filter_pattern else ''}."
f"Make sure your filter_pattern is correct, check if the structure of the {url} page changed, or if you are missing HTTP headers."
f"Make sure your filter_pattern is correct, and check if the structure of the page is not what you expect it to be."
)

return hyperlinks
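For illustration, a minimal, self-contained sketch of what get_hyperlinks_from_text does. The _LinkCollector class below is a hypothetical stand-in for _HyperlinkExtractor, which is assumed to collect href values into a parser.hyperlinks set:

from html.parser import HTMLParser
import re

class _LinkCollector(HTMLParser):
    """Hypothetical stand-in for _HyperlinkExtractor: collects href values."""

    def __init__(self):
        super().__init__()
        self.hyperlinks = set()

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.hyperlinks.add(value)

text = '<a href="https://www.nrel.gov/docs/fy22osti/80641.pdf">2021 report</a>'
pattern = re.compile(r"nrel\.gov/docs/.*\.pdf")
parser = _LinkCollector()
parser.feed(text)
print([link for link in parser.hyperlinks if pattern.search(link)])
# ['https://www.nrel.gov/docs/fy22osti/80641.pdf']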
157 changes: 157 additions & 0 deletions src/pudl_archiver/archivers/nrelss.py
@@ -0,0 +1,157 @@
"""Download NREL Standard Scenarios data."""

import io
import re
from contextlib import nullcontext
from pathlib import Path

import aiohttp

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.utils import retry_async

# The citation field for Standard Scenarios 2021 is blank, but they linked to the
# 2021 report from the description of one of the other available projects, so we're
# able to hard-code it for now:
REPORT_2021 = "https://www.nrel.gov/docs/fy22osti/80641.pdf"


async def _download_file_post(
session: aiohttp.ClientSession, url: str, file: Path | io.BytesIO, **kwargs
):
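"""POST to url and stream the response body into file (a Path or binary buffer) in 1 KiB chunks."""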
async with session.post(url, **kwargs) as response:
with file.open("wb") if isinstance(file, Path) else nullcontext(file) as f:
async for chunk in response.content.iter_chunked(1024):
f.write(chunk)


class NrelStandardScenariosArchiver(AbstractDatasetArchiver):
"""NREL Standard Scenarios archiver."""

name = "nrelss"

async def get_resources(self) -> ArchiveAwaitable:
"""Download NREL Standard Scenarios resources."""

async def post_to_json(url, **kwargs):
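# retry_async (imported above from pudl_archiver.utils) calls the given
# coroutine function with the supplied args/kwargs, retrying on failure;
# here it covers both the POST request and decoding the JSON response.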
resp = await retry_async(self.session.post, [url], kwargs={"data": kwargs})
return await retry_async(resp.json)

project_year_pattern = re.compile(r"Standard Scenarios (?P<year>\d{4})")
report_url_pattern = re.compile(
r"https://www.nrel.gov/docs/(?P<fy>fy\d{2}osti)/(?P<number>\d{5}\.pdf)"
)
filename_pattern = re.compile(r"/([^/?]*\.csv)")

project_records = await self.get_json(
"https://scenarioviewer.nrel.gov/api/projects/"
)
for scenario_project in (
p for p in project_records if p["name"].startswith("Standard Scenarios")
):
project_uuid = scenario_project["uuid"]
m = project_year_pattern.search(scenario_project["name"])
if not m:
continue
project_year = int(m.group("year"))

if scenario_project["citation"]:
report_link = self.get_hyperlinks_from_text(
scenario_project["citation"], report_url_pattern
)
if report_link:
report_link = report_link.pop()
else:
raise AssertionError(
f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:"
f"{scenario_project}"
)
elif project_year == 2021:
report_link = REPORT_2021
else:
raise AssertionError(
f"We expect all years except 2021 to have a citation with a link to the report, "
f"but {project_year} has no citation: {scenario_project}"
)
m = report_url_pattern.search(report_link)
if not m:
raise AssertionError(
f"We expect all years except 2021 to have a citation with a link to the report, but {project_year} does not:"
f"{scenario_project}"
)

file_list = await post_to_json(
"https://scenarioviewer.nrel.gov/api/file-list/",
project_uuid=project_uuid,
)
yield self.get_year_resource(
report=(f"{m.group('fy')}_{m.group('number')}", report_link),
uuid=project_uuid,
file_ids=[
(
f["id"],
f"NRELSS {project_year} {f['scenario']} {f['location_type']}.{f['file_type']}".replace(
" ", "_"
)
.replace("%", "pct")
.replace(",", "")
.lower(),
)
for f in file_list["files"]
if (f["file_type"] == "CSV" or project_year == 2020)
],
year=project_year,
)
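For illustration, here is a hypothetical file-list record (field values invented) and the archive filename that the expression above would derive from it:

f = {"id": 12345, "scenario": "Mid Case", "location_type": "National", "file_type": "CSV"}
name = (
    f"NRELSS 2024 {f['scenario']} {f['location_type']}.{f['file_type']}".replace(" ", "_")
    .replace("%", "pct")
    .replace(",", "")
    .lower()
)
print(name)  # nrelss_2024_mid_case_national.csv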

async def get_year_resource(
self, report, uuid, file_ids, year: int
) -> ResourceInfo:
"""Download all available data for a year.

The resulting resource contains one PDF of the scenario report and a set of
CSVs for different scenarios and geographic levels.

Args:
    report: (filename, URL) tuple for the scenario report PDF.
    uuid: UUID identifying the project in the Scenario Viewer API.
    file_ids: list of (file_id, filename) pairs for the data files to download.
    year: the year we're downloading data for.
"""
zip_path = self.download_directory / f"{self.name}-{year}.zip"
data_paths_in_archive = set()
# report
self.logger.info(f"Downloading report {year} {report[0]} from {report[1]}")
download_path = self.download_directory / report[0]
await self.download_file(report[1], download_path)
self.add_to_archive(
zip_path=zip_path,
filename=report[0],
blob=download_path.open("rb"),
)
data_paths_in_archive.add(report[0])
# Don't want to leave multiple giant files on disk, so delete
# immediately after they're safely stored in the ZIP
download_path.unlink()

for file_id, filename in file_ids:
self.logger.info(f"Downloading file {year} {file_id} {uuid}")
download_path = self.download_directory / filename
await retry_async(
_download_file_post,
[
self.session,
"https://scenarioviewer.nrel.gov/api/download/",
download_path,
],
kwargs={"data": {"project_uuid": uuid, "file_ids": file_id}},
)
self.add_to_archive(
zip_path=zip_path,
filename=filename,
blob=download_path.open("rb"),
)
data_paths_in_archive.add(filename)
# Don't want to leave multiple giant files on disk, so delete
# immediately after they're safely stored in the ZIP
download_path.unlink()
return ResourceInfo(
local_path=zip_path,
partitions={"years": year},
# layout=ZipLayout(file_paths=data_paths_in_archive), # can't use ZipLayout bc these CSVs have a multi-row header and pandas throws a tantrum
)
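For context on the commented-out ZipLayout: a minimal sketch (with a hypothetical two-row header, not the actual NREL column layout) showing that pandas can still read such CSVs if the multi-row header is declared explicitly:

import io

import pandas as pd

csv_text = "technology,technology\ncapacity,generation\n42,3.14\n"  # hypothetical layout
df = pd.read_csv(io.StringIO(csv_text), header=[0, 1])
print(df.columns.tolist())  # [('technology', 'capacity'), ('technology', 'generation')]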
36 changes: 36 additions & 0 deletions src/pudl_archiver/metadata/sources.py
@@ -416,4 +416,40 @@
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
"nrelss": {
"title": "NREL Standard Scenarios",
"path": "https://www.nrel.gov/analysis/standard-scenarios.html",
"description": (
"NREL's Standard Scenarios are a suite of forward-looking scenarios of the U.S."
"power sector that are updated annually to support and inform energy analysis."
"The Standard Scenarios are simulated using the Regional Energy Deployment System"
"and Distributed Generation Market Demand Model capacity expansion models and are"
"updated each year to provide timely information regarding power sector evolution."
"The scenarios have been designed to capture a range of possible power system"
"futures and consider a variety of factors from high vehicle electrification to"
"major cost declines for electricity generation technologies (e.g., using cost"
"inputs from the Annual Technology Baseline)."
"For select scenarios, the models are run using the PLEXOS software and the"
"Cambium tool that assembles structured data sets of hourly cost, emissions, and"
Comment from Contributor Author (@krivard, Jan 30, 2025):

We could consider pulling in the Cambium results as well (as a second partition) but A) they only go back to 2020, and B) they're like 6 GB for each year.

Comment from Member:

If it's simple to also add in the Cambium results I'd say add them, and add a second partition of project or scenario_type or something! But this seems like a lower priority than grabbing just the standard scenarios.

Comment from Member:

Are the Cambium results 6 GB zipped? If so, together with the standard scenarios (assuming they are a similar size) that's pushing up against the 50 GB archive limit.

Comment from Contributor Author:

Yeah, zipped. The standard scenarios are like two orders of magnitude smaller since they don't include hourly data, though, so it's less "will Cambium push this archiver over the limit" and more "can we archive Cambium on Zenodo at all."

I'll go with "not right now" and write up Cambium as a separate issue.

Comment from Member:

Tiny formatting issue -- on multi-line concatenated strings like this, if we don't add an explicit space at the end of each line inside the quotes, they'll run together. Triple-quoted strings are another option.
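A quick illustration of the reviewer's point (Python inserts nothing between adjacent string literals):

description = (
    "NREL's Standard Scenarios are a suite of forward-looking scenarios of the U.S."
    "power sector"
)
print(description)  # ...scenarios of the U.S.power sector  <- words run together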

"operational data for modeled futures. Results are available using the Scenario"
"Viewer and Data Downloader."
),
"source_file_dict": {
"source_format": "CSV",
},
"working_partitions": {
"years": list(range(2016, 2025)),
},
"contributors": [
CONTRIBUTORS["catalyst-cooperative"],
],
"keywords": sorted(
{
"nrel",
"standard scenarios",
Comment from Contributor Author:

Other keywords that could go in here, cribbed from pudl/metadata/sources.py@nrelatb:

                + KEYWORDS["us_govt"]
                + KEYWORDS["electricity"]

} # + KEYWORDS["us_govt"] + KEYWORDS["electricity"]
),
"license_raw": LICENSES["cc-by-4.0"],
Comment from Contributor Author:

They have a weirdo disclaimer that says (approximately) "you have to cite us but you can't make it look like we endorse you," which seems close enough to CC-BY?

Comment from Member:

I would ask our resident license scrutinizer @zaneselvans on this one!

Comment from Member:

For data from NREL, my trick for finding the license is to look and see whether they have added it to the OEDI Data Swamp, which requires whoever is archiving it to add an explicit, well-defined license.

The 2024 Standard Scenarios are there and have been released under CC-BY-4.0, but I don't see any of the earlier years there.

"license_pudl": LICENSES["cc-by-4.0"],
},
}