From 693714f530f29a30026c0e918cfc13d8b71408ca Mon Sep 17 00:00:00 2001
From: Nilay Kumar <nilaykumar@tutanota.com>
Date: Wed, 22 Jan 2025 14:15:42 -0500
Subject: [PATCH 1/9] Initial work on eiarecs

---
 src/pudl_archiver/archivers/eia/eiarecs.py | 60 ++++++++++++++++++++++
 src/pudl_archiver/metadata/sources.py      | 13 +++++
 2 files changed, 73 insertions(+)
 create mode 100644 src/pudl_archiver/archivers/eia/eiarecs.py

diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py
new file mode 100644
index 00000000..aa62ce15
--- /dev/null
+++ b/src/pudl_archiver/archivers/eia/eiarecs.py
@@ -0,0 +1,60 @@
+"""Archive EIA Residential Energy Consumption Survey (RECS)."""
+
+# TODO:
+# - is the metadata done correctly?
+# - do we want to just grab the zips?
+# - do we want to zip everything up?
+# - how to partition relative to the other tabs?
+# - add in other years of data
+
+import logging
+import re
+
+from pudl_archiver.archivers.classes import (
+    AbstractDatasetArchiver,
+    ArchiveAwaitable,
+    ResourceInfo,
+)
+
+BASE_URL = "https://www.eia.gov/consumption/residential/data"
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
+
+class EiaRECSArchiver(AbstractDatasetArchiver):
+    """EIA RECS archiver."""
+
+    name = "eiarecs"
+
+    async def get_resources(self) -> ArchiveAwaitable:
+        """Download EIA-RECS resources."""
+        for year in [2020]:
+            yield self.get_year_resources(year)
+
+    async def get_year_resources(self, year: int) -> list[ResourceInfo]:
+        """Download all excel tables for a year."""
+        table_link_pattern = re.compile(r"HC (\d{1,2}).(\d{1,2}).xlsx")
+
+        # Loop through all download links for tables
+        tables = []
+        year_url = f"{BASE_URL}/{year}"
+        for table_link in await self.get_hyperlinks(year_url, table_link_pattern):
+            table_link = f"{year_url}/{table_link}"
+            logger.info(f"Fetching {table_link}")
+            # Get table major/minor number from links
+            match = table_link_pattern.search(table_link)
+            major_num, minor_num = match.group(1), match.group(2)
+
+            # Download file
+            download_path = (
+                self.download_directory
+                / f"eia-recs-{year}-hc-{major_num}-{minor_num}.xlsx"
+            )
+            await self.download_zipfile(table_link, download_path)
+
+            tables.append(
+                ResourceInfo(
+                    local_path=download_path,
+                    partitions={"year": year, "hc": f"{major_num}_{minor_num}"},
+                )
+            )
+        return tables
diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py
index ba63828b..28b8fe31 100644
--- a/src/pudl_archiver/metadata/sources.py
+++ b/src/pudl_archiver/metadata/sources.py
@@ -119,6 +119,7 @@
         "contributors": [CONTRIBUTORS["catalyst-cooperative"]],
     },
     "eiarecs": {
+<<<<<<< HEAD
         "title": "EIA RECS -- Residential Energy Consumption Survey",
         "path": "https://www.eia.gov/consumption/residential/",
         "description": (
@@ -283,6 +284,18 @@
                 "avoided emissions",
             }
         ),
+=======
+        "title": "EIA Residential Energy Consumption Survey",
+        "path": "https://www.eia.gov/consumption/residential/data/2020/",
+        "description": (
+            "EIA Form 457 is commonly known as the Residential Energy Consumption Survey"
+            "(RECS). RECS is a national sample survey that collects detailed information"
+            "on household energy characteristics. The data is tabulated by geography"
+            "housing unit type, income, etc. RECS is conducted roughly every five years."
+        ),
+        "working_partitions": {"years": [2020, 2015, 2009, 2005, 2001, 1997, 1993]},
+        "keywords": sorted({"residential", "RECS"}),
+>>>>>>> 2a0933e (Initial work on eiarecs)
         "license_raw": LICENSES["us-govt"],
         "license_pudl": LICENSES["cc-by-4.0"],
         "contributors": [CONTRIBUTORS["catalyst-cooperative"]],

From de35d0e40a150a816e128d131e18856e6453153c Mon Sep 17 00:00:00 2001
From: Nilay Kumar <nilaykumar@tutanota.com>
Date: Wed, 22 Jan 2025 14:25:25 -0500
Subject: [PATCH 2/9] Remove merge-conflict markers and duplicate eiarecs metadata left over from the rebase

---
 src/pudl_archiver/metadata/sources.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py
index 28b8fe31..ba63828b 100644
--- a/src/pudl_archiver/metadata/sources.py
+++ b/src/pudl_archiver/metadata/sources.py
@@ -119,7 +119,6 @@
         "contributors": [CONTRIBUTORS["catalyst-cooperative"]],
     },
     "eiarecs": {
-<<<<<<< HEAD
         "title": "EIA RECS -- Residential Energy Consumption Survey",
         "path": "https://www.eia.gov/consumption/residential/",
         "description": (
@@ -284,18 +283,6 @@
                 "avoided emissions",
             }
         ),
-=======
-        "title": "EIA Residential Energy Consumption Survey",
-        "path": "https://www.eia.gov/consumption/residential/data/2020/",
-        "description": (
-            "EIA Form 457 is commonly known as the Residential Energy Consumption Survey"
-            "(RECS). RECS is a national sample survey that collects detailed information"
-            "on household energy characteristics. The data is tabulated by geography"
-            "housing unit type, income, etc. RECS is conducted roughly every five years."
-        ),
-        "working_partitions": {"years": [2020, 2015, 2009, 2005, 2001, 1997, 1993]},
-        "keywords": sorted({"residential", "RECS"}),
->>>>>>> 2a0933e (Initial work on eiarecs)
         "license_raw": LICENSES["us-govt"],
         "license_pudl": LICENSES["cc-by-4.0"],
         "contributors": [CONTRIBUTORS["catalyst-cooperative"]],

From 274b7ddf9e944cf8e6c737455748194368309311 Mon Sep 17 00:00:00 2001
From: Nilay Kumar <nilaykumar@tutanota.com>
Date: Wed, 22 Jan 2025 17:17:48 -0500
Subject: [PATCH 3/9] Added consumption and state data

---
 src/pudl_archiver/archivers/eia/eiarecs.py | 85 +++++++++++++++-------
 1 file changed, 60 insertions(+), 25 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py
index aa62ce15..597085c7 100644
--- a/src/pudl_archiver/archivers/eia/eiarecs.py
+++ b/src/pudl_archiver/archivers/eia/eiarecs.py
@@ -1,8 +1,8 @@
 """Archive EIA Residential Energy Consumption Survey (RECS)."""
 
 # TODO:
-# - is the metadata done correctly?
-# - do we want to just grab the zips?
+# - grab all the data and then zip it up
+# - make sure we're not missing anything with like ce1.2a.xlsx
 # - do we want to zip everything up?
 # - how to partition relative to the other tabs?
 # - add in other years of data
@@ -16,7 +16,33 @@
     ResourceInfo,
 )
 
-BASE_URL = "https://www.eia.gov/consumption/residential/data"
+LINK_PATTERNS = [
+    {
+        "base_url": "https://www.eia.gov/consumption/residential/data",
+        "php_extension": "index.php?view=characteristics",
+        "prefix": "hc",
+        "pattern": re.compile(r"HC (\d{1,2})\.(\d{1,2})\.xlsx"),
+    },
+    {
+        "base_url": "https://www.eia.gov/consumption/residential/data",
+        "php_extension": "index.php?view=consumption",
+        "prefix": "ce",
+        "pattern": re.compile(r"ce(\d)\.(\d{1,2})[a-z]?\.xlsx"),
+    },
+    {
+        "base_url": "https://www.eia.gov/consumption/residential/data",
+        "php_extension": "index.php?view=state",
+        "prefix": "state",
+        "pattern": re.compile(r"State (.*)\.xlsx"),
+        "no_version": True,
+    },
+    {
+        "base_url": "https://www.eia.gov/consumption/residential/data",
+        "php_extension": "index.php?view=state",
+        "prefix": "state-ce",
+        "pattern": re.compile(r"ce(\d)\.(\d{1,2})\.(.*)\.xlsx"),
+    },
+]
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
@@ -32,29 +58,38 @@ async def get_resources(self) -> ArchiveAwaitable:
 
     async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         """Download all excel tables for a year."""
-        table_link_pattern = re.compile(r"HC (\d{1,2}).(\d{1,2}).xlsx")
-
         # Loop through all download links for tables
         tables = []
-        year_url = f"{BASE_URL}/{year}"
-        for table_link in await self.get_hyperlinks(year_url, table_link_pattern):
-            table_link = f"{year_url}/{table_link}"
-            logger.info(f"Fetching {table_link}")
-            # Get table major/minor number from links
-            match = table_link_pattern.search(table_link)
-            major_num, minor_num = match.group(1), match.group(2)
-
-            # Download file
-            download_path = (
-                self.download_directory
-                / f"eia-recs-{year}-hc-{major_num}-{minor_num}.xlsx"
-            )
-            await self.download_zipfile(table_link, download_path)
-
-            tables.append(
-                ResourceInfo(
-                    local_path=download_path,
-                    partitions={"year": year, "hc": f"{major_num}_{minor_num}"},
+        for pattern_dict in LINK_PATTERNS:
+            year_url = f"{pattern_dict['base_url']}/{year}"
+            url = f"{year_url}/{pattern_dict['php_extension']}"
+            table_link_pattern = pattern_dict["pattern"]
+            for table_link in await self.get_hyperlinks(url, table_link_pattern):
+                table_link = f"{year_url}/{table_link}"
+                logger.info(f"Fetching {table_link}")
+                # Get table major/minor number from links
+                match = table_link_pattern.search(table_link)
+                output_filename = f"eia-recs-{year}-{pattern_dict['prefix']}"
+                if "no_version" in pattern_dict and pattern_dict["no_version"]:
+                    output_filename += "-" + match.group(1).lower().replace(" ", "-")
+                else:
+                    major_num, minor_num = (
+                        match.group(1),
+                        match.group(2),
+                    )
+                    output_filename += f"-{major_num}-{minor_num}"
+                if len(match.groups()) >= 3:
+                    output_filename += "-" + match.group(3)
+                output_filename += ".xlsx"
+
+                # Download file
+                download_path = self.download_directory / output_filename
+                await self.download_zipfile(table_link, download_path)
+
+                tables.append(
+                    ResourceInfo(
+                        local_path=download_path,
+                        partitions={"year": year},
+                    )
                 )
-            )
         return tables

From abd35668337284a408e6ad1d1615ebe017942a93 Mon Sep 17 00:00:00 2001
From: Nilay Kumar <nilaykumar@tutanota.com>
Date: Wed, 22 Jan 2025 18:43:27 -0500
Subject: [PATCH 4/9] Adding files to zip

---
 src/pudl_archiver/archivers/eia/eiarecs.py | 48 +++++++++++++++-------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py
index 597085c7..679a3182 100644
--- a/src/pudl_archiver/archivers/eia/eiarecs.py
+++ b/src/pudl_archiver/archivers/eia/eiarecs.py
@@ -1,12 +1,5 @@
 """Archive EIA Residential Energy Consumption Survey (RECS)."""
 
-# TODO:
-# - grab all the data and then zip it up
-# - make sure we're not missing anything with like ce1.2a.xlsx
-# - do we want to zip everything up?
-# - how to partition relative to the other tabs?
-# - add in other years of data
-
 import logging
 import re
 
@@ -15,33 +8,45 @@
     ArchiveAwaitable,
     ResourceInfo,
 )
+from pudl_archiver.frictionless import ZipLayout
 
 LINK_PATTERNS = [
+    # housing characteristics
     {
         "base_url": "https://www.eia.gov/consumption/residential/data",
         "php_extension": "index.php?view=characteristics",
         "prefix": "hc",
         "pattern": re.compile(r"HC (\d{1,2})\.(\d{1,2})\.xlsx"),
     },
+    # consumption & expenditures
     {
         "base_url": "https://www.eia.gov/consumption/residential/data",
         "php_extension": "index.php?view=consumption",
         "prefix": "ce",
-        "pattern": re.compile(r"ce(\d)\.(\d{1,2})[a-z]?\.xlsx"),
+        "pattern": re.compile(r"ce(\d)\.(\d{1,2})([a-z]?)\.xlsx"),
     },
+    # state data (housing characteristics)
     {
         "base_url": "https://www.eia.gov/consumption/residential/data",
         "php_extension": "index.php?view=state",
         "prefix": "state",
         "pattern": re.compile(r"State (.*)\.xlsx"),
-        "no_version": True,
     },
+    # state data (consumption & expenditures)
     {
         "base_url": "https://www.eia.gov/consumption/residential/data",
         "php_extension": "index.php?view=state",
         "prefix": "state-ce",
         "pattern": re.compile(r"ce(\d)\.(\d{1,2})\.(.*)\.xlsx"),
     },
+    # microdata
+    # adding this in will require major changes+cleanup to the code below
+    # {
+    #    "base_url": "https://www.eia.gov/consumption/residential/data",
+    #    "php_extension": "index.php?view=microdata",
+    #    "prefix": "udata",
+    #    "pattern": re.compile(r"(recs.*\d{4}.*public.*)\.(?:zip|csv|xlsx)", re.IGNORECASE),
+    # }
 ]
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -60,7 +65,11 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         """Download all excel tables for a year."""
         # Loop through all download links for tables
         tables = []
+        zip_path = self.download_directory / f"eia-recs-{year}.zip"
+        data_paths_in_archive = set()
+        # Loop through different categories of data (all .xlsx)
         for pattern_dict in LINK_PATTERNS:
+            # Each category of data has its own url, etc.
             year_url = f"{pattern_dict['base_url']}/{year}"
             url = f"{year_url}/{pattern_dict['php_extension']}"
             table_link_pattern = pattern_dict["pattern"]
@@ -69,27 +78,38 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 logger.info(f"Fetching {table_link}")
                 # Get table major/minor number from links
                 match = table_link_pattern.search(table_link)
+                # We've gotta do a bit of wrangling to get the output filename
+                # to match the url somewhat
+                n_groups = len(match.groups())
                 output_filename = f"eia-recs-{year}-{pattern_dict['prefix']}"
-                if "no_version" in pattern_dict and pattern_dict["no_version"]:
-                    output_filename += "-" + match.group(1).lower().replace(" ", "-")
+                if n_groups == 1:
+                    output_filename += "-" + match.group(1).lower().replace(" ", "_")
                 else:
                     major_num, minor_num = (
                         match.group(1),
                         match.group(2),
                     )
                     output_filename += f"-{major_num}-{minor_num}"
-                if len(match.groups()) >= 3:
+                if n_groups == 3 and match.group(3) != "":
                     output_filename += "-" + match.group(3)
                 output_filename += ".xlsx"
 
                 # Download file
                 download_path = self.download_directory / output_filename
-                await self.download_zipfile(table_link, download_path)
+                await self.download_file(table_link, download_path)
+                self.add_to_archive(
+                    zip_path=zip_path,
+                    filename=output_filename,
+                    blob=download_path.open("rb"),
+                )
+                data_paths_in_archive.add(output_filename)
+                download_path.unlink()
 
                 tables.append(
                     ResourceInfo(
-                        local_path=download_path,
+                        local_path=zip_path,
                         partitions={"year": year},
+                        layout=ZipLayout(file_paths=data_paths_in_archive),
                     )
                 )
         return tables

From 67cbc8ce9affac23bc124387d124c82efc411266 Mon Sep 17 00:00:00 2001
From: Nilay Kumar <nilaykumar@tutanota.com>
Date: Wed, 22 Jan 2025 18:54:12 -0500
Subject: [PATCH 5/9] Removing accidental zipping in loop

---
 src/pudl_archiver/archivers/eia/eiarecs.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py
index 679a3182..310f4034 100644
--- a/src/pudl_archiver/archivers/eia/eiarecs.py
+++ b/src/pudl_archiver/archivers/eia/eiarecs.py
@@ -105,11 +105,11 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 data_paths_in_archive.add(output_filename)
                 download_path.unlink()
 
-                tables.append(
-                    ResourceInfo(
-                        local_path=zip_path,
-                        partitions={"year": year},
-                        layout=ZipLayout(file_paths=data_paths_in_archive),
-                    )
-                )
+        tables.append(
+            ResourceInfo(
+                local_path=zip_path,
+                partitions={"year": year},
+                layout=ZipLayout(file_paths=data_paths_in_archive),
+            )
+        )
         return tables

From b08b0e1cc333634c9fba0aaa2084dabc689d9cd8 Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Wed, 29 Jan 2025 16:12:52 -0500
Subject: [PATCH 6/9] chore: replace output filename munging code

---
 src/pudl_archiver/archivers/eia/eiarecs.py | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py
index 310f4034..3fa27105 100644
--- a/src/pudl_archiver/archivers/eia/eiarecs.py
+++ b/src/pudl_archiver/archivers/eia/eiarecs.py
@@ -78,21 +78,12 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
                 logger.info(f"Fetching {table_link}")
                 # Get table major/minor number from links
                 match = table_link_pattern.search(table_link)
-                # We've gotta do a bit of wrangling to get the output filename
-                # to match the url somewhat
-                n_groups = len(match.groups())
-                output_filename = f"eia-recs-{year}-{pattern_dict['prefix']}"
-                if n_groups == 1:
-                    output_filename += "-" + match.group(1).lower().replace(" ", "_")
-                else:
-                    major_num, minor_num = (
-                        match.group(1),
-                        match.group(2),
-                    )
-                    output_filename += f"-{major_num}-{minor_num}"
-                if n_groups == 3 and match.group(3) != "":
-                    output_filename += "-" + match.group(3)
-                output_filename += ".xlsx"
+                matched_metadata = (
+                    "-".join(g for g in match.groups() if g).replace(" ", "_").lower()
+                )
+                output_filename = (
+                    f"eia-recs-{year}-{pattern_dict['prefix']}-{matched_metadata}.xlsx"
+                )
 
                 # Download file
                 download_path = self.download_directory / output_filename

From 1b874173074d6fa1d109e59907f12aee79719c58 Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Wed, 29 Jan 2025 18:13:31 -0500
Subject: [PATCH 7/9] feat: add 2020 microdata + methodology

---
 src/pudl_archiver/archivers/eia/eiarecs.py | 134 ++++++++++++---------
 1 file changed, 78 insertions(+), 56 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py
index 3fa27105..81fd611d 100644
--- a/src/pudl_archiver/archivers/eia/eiarecs.py
+++ b/src/pudl_archiver/archivers/eia/eiarecs.py
@@ -2,6 +2,9 @@
 
 import logging
 import re
+from dataclasses import dataclass
+from io import BytesIO
+from urllib.parse import urljoin
 
 from pudl_archiver.archivers.classes import (
     AbstractDatasetArchiver,
@@ -10,44 +13,60 @@
 )
 from pudl_archiver.frictionless import ZipLayout
 
-LINK_PATTERNS = [
-    # housing characteristics
-    {
-        "base_url": "https://www.eia.gov/consumption/residential/data",
-        "php_extension": "index.php?view=characteristics",
-        "prefix": "hc",
-        "pattern": re.compile(r"HC (\d{1,2})\.(\d{1,2})\.xlsx"),
-    },
-    # consumption & expenditures
-    {
-        "base_url": "https://www.eia.gov/consumption/residential/data",
-        "php_extension": "index.php?view=consumption",
-        "prefix": "ce",
-        "pattern": re.compile(r"ce(\d)\.(\d{1,2})([a-z]?)\.xlsx"),
-    },
-    # state data (housing characteristics)
-    {
-        "base_url": "https://www.eia.gov/consumption/residential/data",
-        "php_extension": "index.php?view=state",
-        "prefix": "state",
-        "pattern": re.compile(r"State (.*)\.xlsx"),
-    },
-    # state data (consumption & expenditures)
-    {
-        "base_url": "https://www.eia.gov/consumption/residential/data",
-        "php_extension": "index.php?view=state",
-        "prefix": "state-ce",
-        "pattern": re.compile(r"ce(\d)\.(\d{1,2})\.(.*)\.xlsx"),
-    },
-    # microdata
-    # adding this in will require major changes+cleanup to the code below
-    # {
-    #    "base_url": "https://www.eia.gov/consumption/residential/data",
-    #    "php_extension": "index.php?view=microdata",
-    #    "prefix": "udata",
-    #    "pattern": re.compile(r"(recs.*\d{4}.*public.*)\.(?:zip|csv|xlsx)", re.IGNORECASE),
-    # }
-]
+
+@dataclass
+class LinkSet:
+    """Information a set of links in one tab of the RECS viewer.
+
+    See https://www.eia.gov/consumption/residential/data/2020/.
+    """
+
+    url: str
+    short_name: str
+    pattern: re.Pattern
+
+
+def _url_for(year: int, view: str):
+    """Get the URL for a specific RECS year/tab combo."""
+    return (
+        f"https://www.eia.gov/consumption/residential/data/{year}/index.php?view={view}"
+    )
+
+
+YEAR_LINK_SETS = {
+    2020: {
+        "housing_characteristics": LinkSet(
+            url=_url_for(year=2020, view="characteristics"),
+            short_name="hc",
+            pattern=re.compile(r"HC (\d{1,2}\.\d{1,2})\.(xlsx)"),
+        ),
+        "consumption & expenditures": LinkSet(
+            url=_url_for(year=2020, view="consumption"),
+            short_name="ce",
+            pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.(xlsx)"),
+        ),
+        "state data (housing characteristics)": LinkSet(
+            url=_url_for(year=2020, view="state"),
+            short_name="state",
+            pattern=re.compile(r"State (.*)\.(xlsx)"),
+        ),
+        "state data (consumption & expenditures)": LinkSet(
+            url=_url_for(year=2020, view="state"),
+            short_name="state-ce",
+            pattern=re.compile(r"ce(\d\.\d{1,2}\..*)\.(xlsx)"),
+        ),
+        "microdata": LinkSet(
+            url=_url_for(year=2020, view="microdata"),
+            short_name="microdata",
+            pattern=re.compile(r"(recs.*public.*)\.(csv)"),
+        ),
+        "methodology": LinkSet(
+            url=_url_for(year=2020, view="methodology"),
+            short_name="methodology",
+            pattern=re.compile(r"pdf/(.+)\.(pdf)"),
+        ),
+    }
+}
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
@@ -61,6 +80,11 @@ async def get_resources(self) -> ArchiveAwaitable:
         for year in [2020]:
             yield self.get_year_resources(year)
 
+    def __is_html_file(self, fileobj: BytesIO) -> bool:
+        header = fileobj.read(30).lower().strip()
+        fileobj.seek(0)
+        return b"<!doctype html>" in header
+
     async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         """Download all excel tables for a year."""
         # Loop through all download links for tables
@@ -68,31 +92,29 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         zip_path = self.download_directory / f"eia-recs-{year}.zip"
         data_paths_in_archive = set()
         # Loop through different categories of data (all .xlsx)
-        for pattern_dict in LINK_PATTERNS:
-            # Each category of data has its own url, etc.
-            year_url = f"{pattern_dict['base_url']}/{year}"
-            url = f"{year_url}/{pattern_dict['php_extension']}"
-            table_link_pattern = pattern_dict["pattern"]
-            for table_link in await self.get_hyperlinks(url, table_link_pattern):
-                table_link = f"{year_url}/{table_link}"
+        link_sets = YEAR_LINK_SETS[year]
+        for link_set in link_sets.values():
+            for table_link in await self.get_hyperlinks(link_set.url, link_set.pattern):
+                table_link = urljoin(link_set.url, table_link)
                 logger.info(f"Fetching {table_link}")
-                # Get table major/minor number from links
-                match = table_link_pattern.search(table_link)
+                match = link_set.pattern.search(table_link)
                 matched_metadata = (
-                    "-".join(g for g in match.groups() if g).replace(" ", "_").lower()
-                )
-                output_filename = (
-                    f"eia-recs-{year}-{pattern_dict['prefix']}-{matched_metadata}.xlsx"
+                    match.group(1).replace(".", "-").replace(" ", "_").lower()
                 )
+                matched_format = match.group(2)
+                output_filename = f"eia-recs-{year}-{link_set.short_name}-{matched_metadata}.{matched_format}"
 
                 # Download file
                 download_path = self.download_directory / output_filename
                 await self.download_file(table_link, download_path)
-                self.add_to_archive(
-                    zip_path=zip_path,
-                    filename=output_filename,
-                    blob=download_path.open("rb"),
-                )
+                with download_path.open("rb") as f:
+                    if self.__is_html_file(f):
+                        continue
+                    self.add_to_archive(
+                        zip_path=zip_path,
+                        filename=output_filename,
+                        blob=f,
+                    )
                 data_paths_in_archive.add(output_filename)
                 download_path.unlink()
 

From f06f7e3b938de545a80460cdf0d62329687813ff Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Wed, 29 Jan 2025 18:40:59 -0500
Subject: [PATCH 8/9] feat: add 2015

The 2015 methodology pages required some changes to allow downloading HTML files.
---
 src/pudl_archiver/archivers/eia/eiarecs.py | 99 ++++++++++++++++------
 1 file changed, 75 insertions(+), 24 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py
index 81fd611d..8f537d85 100644
--- a/src/pudl_archiver/archivers/eia/eiarecs.py
+++ b/src/pudl_archiver/archivers/eia/eiarecs.py
@@ -21,9 +21,11 @@ class LinkSet:
     See https://www.eia.gov/consumption/residential/data/2020/.
     """
 
-    url: str
+    view: str
     short_name: str
+    extension: str
     pattern: re.Pattern
+    skip_if_html: bool = True
 
 
 def _url_for(year: int, view: str):
@@ -36,36 +38,81 @@ def _url_for(year: int, view: str):
 YEAR_LINK_SETS = {
     2020: {
         "housing_characteristics": LinkSet(
-            url=_url_for(year=2020, view="characteristics"),
+            view="characteristics",
             short_name="hc",
-            pattern=re.compile(r"HC (\d{1,2}\.\d{1,2})\.(xlsx)"),
+            pattern=re.compile(r"HC (\d{1,2}\.\d{1,2})\.xlsx"),
+            extension="xlsx",
         ),
         "consumption & expenditures": LinkSet(
-            url=_url_for(year=2020, view="consumption"),
+            view="consumption",
             short_name="ce",
-            pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.(xlsx)"),
+            pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.xlsx"),
+            extension="xlsx",
         ),
         "state data (housing characteristics)": LinkSet(
-            url=_url_for(year=2020, view="state"),
-            short_name="state",
-            pattern=re.compile(r"State (.*)\.(xlsx)"),
+            view="state",
+            short_name="state-hc",
+            pattern=re.compile(r"State (.*)\.xlsx"),
+            extension="xlsx",
         ),
         "state data (consumption & expenditures)": LinkSet(
-            url=_url_for(year=2020, view="state"),
+            view="state",
             short_name="state-ce",
-            pattern=re.compile(r"ce(\d\.\d{1,2}\..*)\.(xlsx)"),
+            pattern=re.compile(r"ce(\d\.\d{1,2}\..*)\.xlsx"),
+            extension="xlsx",
         ),
         "microdata": LinkSet(
-            url=_url_for(year=2020, view="microdata"),
+            view="microdata",
             short_name="microdata",
-            pattern=re.compile(r"(recs.*public.*)\.(csv)"),
+            pattern=re.compile(r"(recs.*public.*)\.csv"),
+            extension="csv",
+        ),
+        "microdata-codebook": LinkSet(
+            view="microdata",
+            short_name="microdata",
+            pattern=re.compile(r"(RECS 2020 Codebook.*v.)\.xlsx"),
+            extension="xlsx",
+        ),
+        "methodology": LinkSet(
+            view="methodology",
+            short_name="methodology",
+            pattern=re.compile(r"pdf/(.+)\.pdf"),
+            extension="pdf",
+        ),
+    },
+    2015: {
+        "housing_characteristics": LinkSet(
+            view="characteristics",
+            short_name="hc",
+            pattern=re.compile(r"hc(\d{1,2}\.\d{1,2})\.xlsx"),
+            extension="xlsx",
+        ),
+        "consumption & expenditures": LinkSet(
+            view="consumption",
+            short_name="ce",
+            pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.xlsx"),
+            extension="xlsx",
+        ),
+        "microdata": LinkSet(
+            view="microdata",
+            short_name="microdata",
+            pattern=re.compile(r"(recs.*public.*)\.csv"),
+            extension="csv",
+        ),
+        "microdata-codebook": LinkSet(
+            view="microdata",
+            short_name="microdata",
+            pattern=re.compile(r"(codebook.*)\.xlsx"),
+            extension="xlsx",
         ),
         "methodology": LinkSet(
-            url=_url_for(year=2020, view="methodology"),
+            view="methodology",
             short_name="methodology",
-            pattern=re.compile(r"pdf/(.+)\.(pdf)"),
+            pattern=re.compile(r"/consumption/residential/reports/2015/(.+)(\.php)?"),
+            extension="html",
+            skip_if_html=False,
         ),
-    }
+    },
 }
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -77,13 +124,13 @@ class EiaRECSArchiver(AbstractDatasetArchiver):
 
     async def get_resources(self) -> ArchiveAwaitable:
         """Download EIA-RECS resources."""
-        for year in [2020]:
+        for year in [2020, 2015]:
             yield self.get_year_resources(year)
 
     def __is_html_file(self, fileobj: BytesIO) -> bool:
         header = fileobj.read(30).lower().strip()
         fileobj.seek(0)
-        return b"<!doctype html>" in header
+        return b"<!doctype html" in header
 
     async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         """Download all excel tables for a year."""
@@ -94,21 +141,25 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         # Loop through different categories of data (all .xlsx)
         link_sets = YEAR_LINK_SETS[year]
         for link_set in link_sets.values():
-            for table_link in await self.get_hyperlinks(link_set.url, link_set.pattern):
-                table_link = urljoin(link_set.url, table_link)
+            url = _url_for(year, link_set.view)
+            for table_link in await self.get_hyperlinks(url, link_set.pattern):
+                table_link = urljoin(url, table_link).strip("/")
                 logger.info(f"Fetching {table_link}")
                 match = link_set.pattern.search(table_link)
-                matched_metadata = (
-                    match.group(1).replace(".", "-").replace(" ", "_").lower()
+                matched_filename = (
+                    match.group(1)
+                    .replace(".", "-")
+                    .replace(" ", "_")
+                    .replace("/", "-")
+                    .lower()
                 )
-                matched_format = match.group(2)
-                output_filename = f"eia-recs-{year}-{link_set.short_name}-{matched_metadata}.{matched_format}"
+                output_filename = f"eia-recs-{year}-{link_set.short_name}-{matched_filename}.{link_set.extension}"
 
                 # Download file
                 download_path = self.download_directory / output_filename
                 await self.download_file(table_link, download_path)
                 with download_path.open("rb") as f:
-                    if self.__is_html_file(f):
+                    if link_set.skip_if_html and self.__is_html_file(f):
                         continue
                     self.add_to_archive(
                         zip_path=zip_path,

From ed70331dd835564211fb1533707932355aabcf82 Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Wed, 29 Jan 2025 19:14:20 -0500
Subject: [PATCH 9/9] feat: add 2009 and historical 457 forms

---
 src/pudl_archiver/archivers/eia/eiarecs.py | 88 +++++++++++++++++-----
 1 file changed, 70 insertions(+), 18 deletions(-)

diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py
index 8f537d85..c57fb267 100644
--- a/src/pudl_archiver/archivers/eia/eiarecs.py
+++ b/src/pudl_archiver/archivers/eia/eiarecs.py
@@ -13,6 +13,8 @@
 )
 from pudl_archiver.frictionless import ZipLayout
 
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
 
 @dataclass
 class LinkSet:
@@ -21,7 +23,7 @@ class LinkSet:
     See https://www.eia.gov/consumption/residential/data/2020/.
     """
 
-    view: str
+    url: str
     short_name: str
     extension: str
     pattern: re.Pattern
@@ -35,86 +37,137 @@ def _url_for(year: int, view: str):
     )
 
 
+# The format of each tab changes from year to year. Rather than write complicated regexes that capture every variant, we define many simple, year-specific regexes.
 YEAR_LINK_SETS = {
     2020: {
         "housing_characteristics": LinkSet(
-            view="characteristics",
+            url=_url_for(year=2020, view="characteristics"),
             short_name="hc",
             pattern=re.compile(r"HC (\d{1,2}\.\d{1,2})\.xlsx"),
             extension="xlsx",
         ),
         "consumption & expenditures": LinkSet(
-            view="consumption",
+            url=_url_for(year=2020, view="consumption"),
             short_name="ce",
             pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.xlsx"),
             extension="xlsx",
         ),
         "state data (housing characteristics)": LinkSet(
-            view="state",
+            url=_url_for(year=2020, view="state"),
             short_name="state-hc",
             pattern=re.compile(r"State (.*)\.xlsx"),
             extension="xlsx",
         ),
         "state data (consumption & expenditures)": LinkSet(
-            view="state",
+            url=_url_for(year=2020, view="state"),
             short_name="state-ce",
             pattern=re.compile(r"ce(\d\.\d{1,2}\..*)\.xlsx"),
             extension="xlsx",
         ),
         "microdata": LinkSet(
-            view="microdata",
+            url=_url_for(year=2020, view="microdata"),
             short_name="microdata",
             pattern=re.compile(r"(recs.*public.*)\.csv"),
             extension="csv",
         ),
         "microdata-codebook": LinkSet(
-            view="microdata",
+            url=_url_for(year=2020, view="microdata"),
             short_name="microdata",
             pattern=re.compile(r"(RECS 2020 Codebook.*v.)\.xlsx"),
             extension="xlsx",
         ),
         "methodology": LinkSet(
-            view="methodology",
+            url=_url_for(year=2020, view="methodology"),
             short_name="methodology",
             pattern=re.compile(r"pdf/(.+)\.pdf"),
             extension="pdf",
         ),
+        "methodology-forms": LinkSet(
+            url="https://www.eia.gov/survey/#eia-457",
+            short_name="methodology",
+            pattern=re.compile(r"eia_457/archive/2020_(.+)\.pdf"),
+            extension="pdf",
+        ),
     },
     2015: {
         "housing_characteristics": LinkSet(
-            view="characteristics",
+            url=_url_for(year=2015, view="characteristics"),
             short_name="hc",
             pattern=re.compile(r"hc(\d{1,2}\.\d{1,2})\.xlsx"),
             extension="xlsx",
         ),
         "consumption & expenditures": LinkSet(
-            view="consumption",
+            url=_url_for(year=2015, view="consumption"),
             short_name="ce",
             pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.xlsx"),
             extension="xlsx",
         ),
         "microdata": LinkSet(
-            view="microdata",
+            url=_url_for(year=2015, view="microdata"),
             short_name="microdata",
             pattern=re.compile(r"(recs.*public.*)\.csv"),
             extension="csv",
         ),
         "microdata-codebook": LinkSet(
-            view="microdata",
+            url=_url_for(year=2015, view="microdata"),
+            short_name="microdata",
+            pattern=re.compile(r"(codebook.*)\.xlsx"),
+            extension="xlsx",
+        ),
+        "methodology": LinkSet(
+            url=_url_for(year=2015, view="methodology"),
+            short_name="methodology",
+            pattern=re.compile(r"/consumption/residential/reports/2015/(.+)(\.php)?"),
+            extension="html",
+            skip_if_html=False,
+        ),
+        "methodology-forms": LinkSet(
+            url="https://www.eia.gov/survey/#eia-457",
+            short_name="methodology",
+            pattern=re.compile(r"eia_457/archive/2015_(.+)\.pdf"),
+            extension="pdf",
+        ),
+    },
+    2009: {
+        "housing_characteristics": LinkSet(
+            url=_url_for(year=2009, view="characteristics"),
+            short_name="hc",
+            pattern=re.compile(r"hc(\d{1,2}\.\d{1,2})\.xlsx"),
+            extension="xlsx",
+        ),
+        "consumption & expenditures": LinkSet(
+            url=_url_for(year=2009, view="consumption"),
+            short_name="ce",
+            pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.xlsx"),
+            extension="xlsx",
+        ),
+        "microdata": LinkSet(
+            url=_url_for(year=2009, view="microdata"),
+            short_name="microdata",
+            pattern=re.compile(r"csv/(.*)\.csv"),
+            extension="csv",
+        ),
+        "microdata-codebook": LinkSet(
+            url=_url_for(year=2009, view="microdata"),
             short_name="microdata",
             pattern=re.compile(r"(codebook.*)\.xlsx"),
             extension="xlsx",
         ),
         "methodology": LinkSet(
-            view="methodology",
+            url=_url_for(year=2009, view="methodology"),
             short_name="methodology",
             pattern=re.compile(r"/consumption/residential/reports/2015/(.+)(\.php)?"),
             extension="html",
             skip_if_html=False,
         ),
+        "methodology-forms": LinkSet(
+            url="https://www.eia.gov/survey/#eia-457",
+            short_name="methodology",
+            pattern=re.compile(r"eia_457/archive/2009 (.+)\.pdf"),
+            extension="pdf",
+        ),
     },
 }
-logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
 class EiaRECSArchiver(AbstractDatasetArchiver):
@@ -124,7 +177,7 @@ class EiaRECSArchiver(AbstractDatasetArchiver):
 
     async def get_resources(self) -> ArchiveAwaitable:
         """Download EIA-RECS resources."""
-        for year in [2020, 2015]:
+        for year in [2020, 2015, 2009]:
             yield self.get_year_resources(year)
 
     def __is_html_file(self, fileobj: BytesIO) -> bool:
@@ -141,9 +194,8 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         # Loop through different categories of data (all .xlsx)
         link_sets = YEAR_LINK_SETS[year]
         for link_set in link_sets.values():
-            url = _url_for(year, link_set.view)
-            for table_link in await self.get_hyperlinks(url, link_set.pattern):
-                table_link = urljoin(url, table_link).strip("/")
+            for table_link in await self.get_hyperlinks(link_set.url, link_set.pattern):
+                table_link = urljoin(link_set.url, table_link).strip("/")
                 logger.info(f"Fetching {table_link}")
                 match = link_set.pattern.search(table_link)
                 matched_filename = (