catalyst-cooperative · e-belfer · Jan 28, 2025 · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025
diff --git a/src/pudl_archiver/archivers/epapcap.py b/src/pudl_archiver/archivers/epapcap.py
@@ -0,0 +1,74 @@
+"""Download EPA PCAP data."""
+
+import re
+from pathlib import Path
+
+from pudl_archiver.archivers.classes import (
+    AbstractDatasetArchiver,
+    ArchiveAwaitable,
+    ResourceInfo,
+)
+from pudl_archiver.frictionless import ZipLayout
+
+BASE_URL = (  # PCAP directory page; scraped below for the Excel workbook links
+    "https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory"
+)
+DATA_TABLE_URLS = [  # searchable-table pages; each one is scraped for PDF links
+    "https://www.epa.gov/inflation-reduction-act/ghg-inventory-searchable-table",
+    "https://www.epa.gov/inflation-reduction-act/ghg-reduction-measures-searchable-table",
+    "https://www.epa.gov/inflation-reduction-act/co-pollutant-benefits-searchable-table",
+    "https://www.epa.gov/inflation-reduction-act/lidac-benefits-searchable-table",
+]
+
+
+class EpaPcapArchiver(AbstractDatasetArchiver):
+    """Archive the EPA PCAP directory workbooks and linked PDFs in one ZIP."""
+
+    name = "epapcap"
+
+    async def get_resources(self) -> ArchiveAwaitable:
+        """Yield the single awaitable that builds the EPA PCAP ZIP archive."""
+        yield self.get_resource()
+
+    async def get_resource(self) -> ResourceInfo:
+        """Download the directory Excel files and all table PDFs into one ZIP.
+
+        Returns:
+            ResourceInfo for ``epapcap.zip`` listing every archived file name.
+        """
+        zip_path = self.download_directory / "epapcap.zip"
+        data_paths_in_archive = set()
+        # Download the Excel workbooks linked from the directory page first
+        excel_pattern = re.compile(r"priority.*\.xlsx")
+        for link in await self.get_hyperlinks(BASE_URL, excel_pattern):
+            await self.download_helper(link, zip_path, data_paths_in_archive)
+
+        # Download all PDFs from each searchable table
+        pdf_pattern = re.compile(r".*\.pdf")
+        prefix = "https://www.epa.gov"
+        for data_table_url in DATA_TABLE_URLS:
+            for link in await self.get_hyperlinks(data_table_url, pdf_pattern):
+                # Some tables use links relative to the site root; make those
+                # absolute. Links that are already absolute are used as-is.
+                if not link.startswith("http"):
+                    link = prefix + link
+                await self.download_helper(link, zip_path, data_paths_in_archive)
+
+        return ResourceInfo(
+            local_path=zip_path,
+            partitions={},
+            layout=ZipLayout(file_paths=data_paths_in_archive),
+        )
+
+    async def download_helper(self, link, zip_path, data_paths_in_archive):
+        """Download ``link`` and add it to the ZIP unless already archived."""
+        filename = Path(link).name
+        # Skip duplicates: many PDFs are shared between the searchable tables
+        if filename in data_paths_in_archive:
+            return
+        download_path = self.download_directory / filename
+        await self.download_file(link, download_path)
+        with download_path.open("rb") as blob:  # close handle before unlink
+            self.add_to_archive(zip_path=zip_path, filename=filename, blob=blob)
+        data_paths_in_archive.add(filename)
+        download_path.unlink()
diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py
@@ -398,4 +398,22 @@
         "license_pudl": LICENSES["cc-by-4.0"],
         "contributors": [CONTRIBUTORS["catalyst-cooperative"]],
     },
+    "epapcap": {
+        "title": "EPA PCAP -- Priority Climate Action Plan",
+        "path": "https://www.epa.gov/inflation-reduction-act/priority-climate-action-plan-directory",
+        "description": (
+            "EPA’s Priority Climate Action Plan (PCAP) Directory organizes data collected from 211 "
+            "PCAPs submitted by states, Metropolitan Statistical Areas (MSAs), Tribes, and territories "
+            "under EPA’s Climate Pollution Reduction Grants (CPRG) program. PCAPs are a compilation "
+            "of each jurisdiction’s identified priority actions (or measures) to reduce greenhouse "
+            "gas (GHG) emissions. The directory presents information from more than 30 data categories "
+            "related to GHG inventories, GHG reduction measures, benefits for low-income and "
+            "disadvantaged communities (LIDACs), and other PCAP elements."
+        ),
+        "working_partitions": {},  # the archiver emits one ZIP with no partitions
+        "keywords": sorted({"emissions", "ghg", "epa", "pcap", "cprg"}),
+        "license_raw": LICENSES["us-govt"],
+        "license_pudl": LICENSES["cc-by-4.0"],
+        "contributors": [CONTRIBUTORS["catalyst-cooperative"]],
+    },
 }