Archive NREL Electrification Futures Studies #564

Draft · wants to merge 5 commits into base: marianneke-generalize-hyperlink-extractor
205 changes: 205 additions & 0 deletions src/pudl_archiver/archivers/nrelefs.py
@@ -0,0 +1,205 @@
"""Download NREL Electrification Futures Study data."""

import re

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)

# Main page
# https://www.nrel.gov/analysis/electrification-futures.html

# Grab all data sites with the following formats
# https://data.nrel.gov/submissions/90
# https://data.openei.org/submissions/4130

# Also grab all PDFs on the main page
BASE_URL = "https://www.nrel.gov/analysis/electrification-futures.html"


class NrelEFSArchiver(AbstractDatasetArchiver):
    """NREL Electrification Futures Studies archiver."""

    name = "nrelefs"

    async def get_resources(self) -> ArchiveAwaitable:
        """Download NREL EFS resources.

        The main page links to a series of PDFs as well as data.nrel.gov and
        data.openei.org webpages containing the associated data for each report.
        """
        # Hard-code a dictionary of each version of the study, with a short-hand
        # description of the report as the key and the links to all data and
        # reports in that version as the values. The study was last published in
        # 2021, so we don't expect these links to change.

        version_dict = {
            "cost-and-performance": [
                "https://www.nrel.gov/docs/fy18osti/70485.pdf",
                "https://data.nrel.gov/submissions/93",
                "https://data.nrel.gov/submissions/78",
            ],
            "demand-side-scenarios": [
                "https://www.nrel.gov/docs/fy18osti/71500.pdf",
                "https://www.nrel.gov/docs/fy18osti/72096.pdf",
                "https://www.nrel.gov/docs/fy18osti/72311.pdf",
                "https://data.nrel.gov/submissions/90",
                "https://data.nrel.gov/submissions/92",
            ],
            "dsgrid-model": [
                "https://www.nrel.gov/docs/fy18osti/71492.pdf",
                "https://www.nrel.gov/docs/fy18osti/72388.pdf",
                "https://data.openei.org/submissions/4130",
            ],
            "load-profiles": [
                "https://www.nrel.gov/docs/fy20osti/73336.pdf",
                "https://data.nrel.gov/submissions/126",
                "https://data.nrel.gov/submissions/127",
            ],
            "supply-side-scenarios": [
                "https://www.nrel.gov/docs/fy21osti/72330.pdf",
                "https://www.nrel.gov/docs/fy21osti/78783.pdf",
                "https://data.nrel.gov/submissions/157",
            ],
            "detailed-grid-simulations": [
                "https://www.nrel.gov/docs/fy21osti/79094.pdf",
                "https://www.nrel.gov/docs/fy21osti/80167.pdf",
            ],
        }
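        # Each key of this dictionary becomes the "version" partition of the
        # resulting archive, and each version is bundled into a single zipfile.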

        # Though we hardcode the links above, we also grab the PDF links from the
        # page to get the title ascribed to each link, which makes it easier to
        # give each PDF an informative filename.
        pdf_pattern = re.compile(r"\/docs\/fy(\d{2})osti\/\w*\.pdf")
        pdf_links = await self.get_hyperlinks(BASE_URL, pdf_pattern)
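        # get_hyperlinks returns a dictionary mapping each matching URL to the
        # text of its link, which we use below when renaming the PDFs.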

        # For each version, yield a coroutine that will produce one zipfile
        # containing all the files for that version.
        for version, links in version_dict.items():
            yield self.get_version_resource(
                version=version, links=links, pdf_links=pdf_links
            )

    async def get_version_resource(
        self,
        version: str,
        links: list[str],
        pdf_links: dict[str, str],
    ) -> ResourceInfo:
        """Download all available data for a given version of an EFS study.

        The resulting resource contains one zip file of all the PDF, .zip, .xlsx,
        .gzip and .csv.gzip files for a given version of the EFS studies. We handle
        the dsgrid data specially because its files are hosted in an S3 bucket on
        data.openei.org rather than on a data.nrel.gov submission page.

        Args:
            version: shorthand name for the given version.
            links: a list of links that contain data for this version.
            pdf_links: a dictionary mapping each PDF link found on the EFS homepage
                to the title of the link. We use this to rename the PDFs to
                something more informative than the original file title.
        """
        # Set up the zipfile name and the set of files stored in the zip.
        zipfile_path = self.download_directory / f"nrelefs-{version}.zip"
        data_paths_in_archive = set()

        # Compile a pattern matching all data files hosted on data.nrel.gov.
        data_pattern = re.compile(
            r"files\/([\w\/]*)\/([\w \-%]*)(\.zip|\.xlsx|\.gzip|\.csv\.gzip)$"
        )
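        # In the pattern above, group 1 captures the submission path, group 2 the
        # base filename, and group 3 the file extension.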

        # Compile a dictionary of links and a regex pattern for the dsgrid special
        # case, where the .dsg files are listed in an S3 bucket viewer rather than
        # on a submission page.
        dsgrid_dict = {
            "dsgrid-site-energy-state-hourly": "https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=dsgrid-2018-efs%2Fdsgrid_site_energy_state_hourly%2F",
            "raw-complete": "https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=dsgrid-2018-efs%2Fraw_complete%2F",
            "state-hourly-residuals": "https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=dsgrid-2018-efs%2Fstate_hourly_residuals%2F",
        }
        dsg_pattern = re.compile(r"\w*\.dsg$")

        for link in links:
            # First, handle all the PDFs.
            if link.endswith(".pdf"):
                matching_pdf_links = [key for key in pdf_links if key in link]
                if matching_pdf_links:
                    # Get the corresponding title from pdf_links and clean it to
                    # name the PDF something more informative than the link name.
                    link_key = matching_pdf_links.pop()
                    filename = pdf_links[link_key]
                    self.logger.info(f"Downloading {link}")
                    filename = (
                        filename.lower()
                        .replace("\n", "")
                        .replace("electrification futures study:", "")
                    )
                    # Remove all characters that aren't letters, digits, spaces
                    # or dashes, then replace runs of whitespace with a dash.
                    filename = re.sub("[^a-zA-Z0-9 -]+", "", filename).strip()
                    filename = re.sub(r"\s+", "-", filename)
                    filename = f"nrelefs-{version}-{filename}.pdf"
                    await self.download_add_to_archive_and_unlink(
                        url=link, filename=filename, zip_path=zipfile_path
                    )
                    data_paths_in_archive.add(filename)
                else:
                    # Alert us to PDF links that we expected but didn't find.
                    raise AssertionError(
                        f"Expected PDF link {link} but it wasn't found in {BASE_URL}. Has the home page changed?"
                    )

            # Next, get all the data files from data.nrel.gov.
            elif "data.nrel.gov/submissions/" in link:
                self.logger.info(f"Downloading data files from {link}.")
                data_links = await self.get_hyperlinks(link, data_pattern)
                for data_link, filename in data_links.items():
                    matches = data_pattern.search(data_link)
                    if not matches:
                        continue
                    # Grab the file name and extension.
                    filename = matches.group(2)
                    file_ext = matches.group(3)

                    # Reformat the filename: lowercase it, turn underscores and
                    # URL-encoded spaces into dashes, drop any character that
                    # isn't a letter, digit, space or dash, and collapse runs of
                    # spaces and dashes into a single dash.
                    filename = filename.lower().replace("_", "-").replace("%20", "-")
                    filename = re.sub("[^a-zA-Z0-9 -]+", "", filename).strip()
                    filename = re.sub(r"[\s-]+", "-", filename)
                    # Drop any leading "efs-"; we add it back in the nrelefs prefix.
                    filename = re.sub(r"^efs-", "", filename)
                    filename = f"nrelefs-{version}-{filename}{file_ext}"
                    self.logger.info(
                        f"Downloading {data_link} as {filename} to {zipfile_path}."
                    )
                    await self.download_add_to_archive_and_unlink(
                        url=data_link, filename=filename, zip_path=zipfile_path
                    )
                    data_paths_in_archive.add(filename)

elif "data.openei.org" in link: # Finally, handle DSGrid data
self.logger.info("Downloading DSGrid data files.")
# Iterate through each type of DSGrid data and download
for data_type, dsg_link in dsgrid_dict.items():
dsg_file_links = await self.get_hyperlinks(dsg_link, dsg_pattern)
for dsg_link, filename in dsg_file_links.items():
filename = filename.replace("_", "-")
filename = f"nrelesg-{data_type}-{filename}"
await self.download_add_to_archive_and_unlink(
url=dsg_link, filename=filename, zip_path=zipfile_path
)
data_paths_in_archive.add(filename)

else:
# Raise error for mysterious other links
raise AssertionError(f"Unexpected format for link {link} in {version}.")

        return ResourceInfo(
            local_path=zipfile_path,
            partitions={"version": version},
            layout=ZipLayout(file_paths=data_paths_in_archive),
        )
23 changes: 23 additions & 0 deletions src/pudl_archiver/metadata/sources.py
@@ -416,4 +416,27 @@
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
"nrelefs": {
"title": "NREL EFS -- Electrification Futures Study",
"path": "https://www.nrel.gov/analysis/electrification-futures.html",
"description": (
"The Electrification Futures Study (EFS) is a multi-year study conducted by NREL "
"and its research partners—Electric Power Research Institute, Evolved Energy Research, "
"Lawrence Berkeley National Laboratory, Northern Arizona University, and Oak Ridge National "
"Laboratory. EFS used multiple analytic tools and models to develop and assess "
"electrification scenarios designed to quantify potential energy, economic, "
"and environmental impacts to the U.S. power system and broader economy. There are six reports "
"comprising the EFS, with the final report released in May 2021."
),
"working_partitions": {
"report_number": set(range(1, 7)),
"document_type": ["data", "technical_report", "presentation"],
},
"keywords": sorted(
{"doe", "lead", "low income", "energy affordability", "energy burden"}
),
"license_raw": LICENSES["us-govt"],
"license_pudl": LICENSES["cc-by-4.0"],
"contributors": [CONTRIBUTORS["catalyst-cooperative"]],
},
}