diff --git a/.github/workflows/run-archiver.yml b/.github/workflows/run-archiver.yml
index a5e2e621..a643a9b7 100644
--- a/.github/workflows/run-archiver.yml
+++ b/.github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
         required: true
         type: string
       create_github_issue:
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
       fail-fast: false
     runs-on: ubuntu-latest
     permissions:
diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml
index 2a2b5889..5c975ca6 100644
--- a/.github/workflows/tox-pytest.yml
+++ b/.github/workflows/tox-pytest.yml
@@ -1,7 +1,9 @@
 ---
 name: tox-pytest
 
-on: push
+on:
+  push:
+  pull_request:
 
 jobs:
   ci-test:
diff --git a/pyproject.toml b/pyproject.toml
index e0ebf1aa..095ddf21 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,6 +6,7 @@
 requires-python = ">=3.12,<3.13"
 dependencies = [
     "arelle-release>=2.29,<2.37",
+    "beautifulsoup4>=4.12.3,<5",
    "catalystcoop.pudl @ git+https://github.com/catalyst-cooperative/pudl.git",
    "coloredlogs>=14",
    "dask>=2024",
diff --git a/src/pudl_archiver/archivers/classes.py b/src/pudl_archiver/archivers/classes.py
index 512a0230..897a6012 100644
--- a/src/pudl_archiver/archivers/classes.py
+++ b/src/pudl_archiver/archivers/classes.py
@@ -14,6 +14,7 @@
 from pathlib import Path
 
 import aiohttp
+import bs4
 import pandas as pd
 
 from pudl_archiver.archivers import validate
@@ -129,6 +130,13 @@ def __init__(
         self.logger = logging.getLogger(f"catalystcoop.{__name__}")
         self.logger.info(f"Archiving {self.name}")
 
+    async def get_soup(self, url: str) -> bs4.BeautifulSoup:
+        """Get a BeautifulSoup instance for a URL using our existing session."""
+        response = await retry_async(self.session.get, args=[url])
+        # TODO 2025-02-03: for some reason, lxml fails to grab the closing div
+        # tag for tab content - so we use html.parser, which is slower.
+        return bs4.BeautifulSoup(await response.text(), "html.parser")
+
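+    # Rough usage sketch from a subclass's get_resources() (illustrative only;
+    # the URL and selector below are hypothetical, not part of this PR):
+    #
+    #     soup = await self.get_soup("https://www.eia.gov/consumption/residential/data/")
+    #     year_links = [a["href"] for a in soup.select("a[href]")]
+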
     @abstractmethod
     def get_resources(self) -> ArchiveAwaitable:
         """Abstract method that each data source must implement to download all resources.
diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py
new file mode 100644
index 00000000..a3fa695d
--- /dev/null
+++ b/src/pudl_archiver/archivers/eia/eiarecs.py
@@ -0,0 +1,271 @@
+"""Archive EIA Residential Energy Consumption Survey (RECS)."""
+
+import re
+from dataclasses import dataclass
+from io import BytesIO
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+
+from pudl_archiver.archivers.classes import (
+    AbstractDatasetArchiver,
+    ArchiveAwaitable,
+    ResourceInfo,
+)
+from pudl_archiver.frictionless import ZipLayout
+from pudl_archiver.utils import is_html_file
+
+BASE_URL = "https://www.eia.gov/consumption/residential/data/"
+
+
+@dataclass(frozen=True)
+class TabInfo:
+    """Information needed to archive the links in a tab."""
+
+    url: str
+    name: str
+    year: int
+
+
+class EiaRECSArchiver(AbstractDatasetArchiver):
+    """EIA RECS archiver."""
+
+    name = "eiarecs"
+    base_url = "https://www.eia.gov/consumption/residential/data/2020/"
+
+    async def get_resources(self) -> ArchiveAwaitable:
+        """Download EIA-RECS resources.
+
+        Looks in the "data" dropdown in the navbar for links to each year.
+        """
+        soup = await self.get_soup(self.base_url)
+        years = soup.select("div.subnav div.dat_block a")
+        numbered_years = [y for y in years if y.text.strip().lower() != "previous"]
+
+        for year in numbered_years:
+            if self.valid_year(year.text.strip()):
+                yield self.__get_year_resources(
+                    url=urljoin(self.base_url, year["href"]),
+                    year=int(year.text.strip()),
+                )
+
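+    # For orientation, the navbar markup that get_resources() parses looks
+    # roughly like this (abridged and hypothetical, not verbatim from eia.gov):
+    #
+    #   <div class="subnav"><div class="dat_block">
+    #     <a href="/consumption/residential/data/2020/">2020</a>
+    #     <a href="/consumption/residential/data/2015/">2015</a>
+    #     <a href="/consumption/residential/data/2009/">Previous</a>
+    #   </div></div>
+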
+    async def __get_year_resources(self, url: str, year: int) -> ResourceInfo:
+        """Download all data files for a year.
+
+        Finds links to all available tabs, then dispatches each tab to a
+        handler, which downloads content from the tabs and adds it to the
+        year's zip archive.
+
+        Tab handlers are mostly the same for each tab across the years, but
+        there is an ability to add exceptions when necessary.
+
+        Each year's actual forms are also archived - these are mostly not
+        linked from the tabs themselves, so we go to the main survey page and
+        download them there.
+
+        Args:
+            url: a string that represents the base page for this year
+            year: the actual year number we are archiving
+
+        Returns:
+            ResourceInfo: information about this year's zip file & its contents.
+        """
+        self.logger.info(f"Starting {year}")
+
+        tab_infos = await self.__select_tabs(url)
+
+        tab_handlers_overrides = {"methodology": {2009: self.__skip, 2015: self.__skip}}
+
+        zip_path = self.download_directory / f"eiarecs-{year}.zip"
+        paths_within_archive = []
+        for tab in tab_infos:
+            tab_handler = tab_handlers_overrides.get(tab.name, {}).get(
+                tab.year, self.__get_tab_html_and_links
+            )
+            paths_within_archive += await tab_handler(tab_info=tab, zip_path=zip_path)
+
+        self.logger.info(f"Looking for original forms for {year}")
+        original_forms_within_archive = await self.__get_original_forms(year, zip_path)
+
+        self.logger.info(f"Got original forms for {year}, returning ResourceInfo list.")
+        return ResourceInfo(
+            local_path=zip_path,
+            partitions={"year": year},
+            layout=ZipLayout(
+                file_paths=paths_within_archive + original_forms_within_archive
+            ),
+        )
+
+    async def __add_links_to_archive(
+        self, url_paths: dict[str, str], zip_path: Path
+    ) -> list[str]:
+        """Download and add link contents to a zipfile.
+
+        Skips links that lead to HTML content, since these are usually broken links.
+
+        Args:
+            url_paths: mapping from URLs to the filenames we want them to have
+                in the zip.
+            zip_path: path to the archive.
+
+        Returns:
+            list[str]: the filepaths, relative to the archive root, that we
+                just added.
+        """
+        data_paths_in_archive = []
+        for link, output_filename in url_paths.items():
+            download_path = self.download_directory / output_filename
+            self.logger.debug(f"Fetching {link} to {download_path}")
+            await self.download_file(link, download_path, timeout=120)
+            with download_path.open("rb") as f:
+                # TODO 2025-02-04: check html-ness against the suffix... if we
+                # have a php/html/cfm/etc. we probably actually *do* want the
+                # html file.
+                if is_html_file(f):
+                    self.logger.info(f"{link} was HTML file - skipping.")
+                    continue
+                self.add_to_archive(
+                    zip_path=zip_path,
+                    filename=output_filename,
+                    blob=f,
+                )
+                self.logger.debug(f"Added {link} to {zip_path} as {output_filename}")
+                data_paths_in_archive.append(output_filename)
+            download_path.unlink()
+        return data_paths_in_archive
+
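+    # The url_paths argument maps source URL to target filename, e.g.
+    # (hypothetical values):
+    #
+    #   {
+    #       "https://www.eia.gov/consumption/residential/data/2020/xls/HC 1.1.xlsx":
+    #           "eiarecs-2020-housing-characteristics-hc-1-1.xlsx",
+    #   }
+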
+    async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
+        """Get the data files for a single tab.
+
+        First, gets a list of all the tags within the tab contents that have
+        an href attribute.
+
+        These tag objects have the HTML attrs accessible as if they were
+        dictionaries - href, src, etc.
+
+        They also have some Python attributes of their own that you can read:
+        text, contents, children, etc.
+
+        See https://beautiful-soup-4.readthedocs.io/en/latest/#tag for details.
+        """
+        soup = await self.get_soup(tab_info.url)
+        links_in_tab = soup.select("div.tab-contentbox a[href]")
+        log_scope = f"{tab_info.year}:{tab_info.name}"
+        self.logger.info(f"{log_scope}: Found {len(links_in_tab)} links")
+
+        links_filtered = [
+            link
+            for link in links_in_tab
+            if not (
+                "mailto" in link["href"].lower() or "all tables" in link.text.lower()
+            )
+        ]
+
+        self.logger.info(f"{log_scope}: Found {len(links_filtered)} relevant links")
+
+        resolved_links = [
+            urljoin(tab_info.url, link["href"]) for link in links_filtered
+        ]
+        links_with_filenames = {
+            link: f"eiarecs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
+            for link in resolved_links
+        }
+
+        data_paths = await self.__add_links_to_archive(
+            links_with_filenames, zip_path=zip_path
+        )
+
+        self.logger.info(
+            f"{log_scope}: Added {len(links_with_filenames)} links to archive"
+        )
+
+        return data_paths
+
+    async def __get_tab_html_and_links(
+        self, tab_info: TabInfo, zip_path: Path
+    ) -> list[str]:
+        """Get the data files in the tab, *and* get the tab content itself.
+
+        First, gets all the links within the tab that aren't HTML files and
+        aren't mailtos.
+
+        Then, gets the entire HTML contents of div.tab-contentbox, which
+        contains the tab contents.
+
+        Then, makes a new HTML document with an html and a body tag, and shoves
+        the old tab contents in there.
+
+        This makes a new HTML file that can be opened by one's browser and
+        includes the tab's contents - but any links/images will not work.
+        """
+        log_scope = f"{tab_info.year}:{tab_info.name}"
+        self.logger.info(f"{log_scope}: Getting links in tab")
+        links = await self.__get_tab_links(tab_info=tab_info, zip_path=zip_path)
+
+        soup = await self.get_soup(tab_info.url)
+        tab_content = soup.select_one("div.tab-contentbox")
+        self.logger.info(
+            f"{log_scope}: Got {len(str(tab_content))} characters of tab content"
+        )
+        html = soup.new_tag("html")
+        body = soup.new_tag("body")
+        html.append(body)
+        body.append(tab_content)
+        # TODO 2025-02-03: consider using some sort of html-to-pdf converter here.
+        # use html-sanitizer or something before feeding it into pdf.
+
+        filename = f"eiarecs-{tab_info.year}-{tab_info.name}-tab-contents.html"
+        self.add_to_archive(
+            zip_path=zip_path,
+            filename=filename,
+            blob=BytesIO(html.prettify().encode("utf-8")),
+        )
+        self.logger.info(f"{log_scope}: Added html to {zip_path} under {filename}")
+        return links + [filename]
+
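+    # The archived tab-contents file comes out shaped roughly like this
+    # (hypothetical, abridged):
+    #
+    #   <html>
+    #    <body>
+    #     <div class="tab-contentbox">
+    #      ...original tab markup, with relative links left as-is...
+    #     </div>
+    #    </body>
+    #   </html>
+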
+    async def __get_original_forms(self, year: int, zip_path: Path) -> list[str]:
+        """Get the survey forms that were used to collect the data.
+
+        These are all on the same page, which is different from the yearly RECS
+        archive pages, so we do this separately from all the tab content above.
+        """
+        forms_url = "https://www.eia.gov/survey/"
+        soup = await self.get_soup(forms_url)
+        all_links = soup.select("#eia-457 div.expand-collapse-content a[href]")
+        links_filtered = [
+            link for link in all_links if f"/archive/{year}" in link["href"]
+        ]
+
+        resolved_links = [urljoin(forms_url, link["href"]) for link in links_filtered]
+
+        links_with_filenames = {
+            link: f"eiarecs-{year}-form-{self.__get_filename_from_link(link)}"
+            for link in resolved_links
+        }
+
+        return await self.__add_links_to_archive(
+            links_with_filenames, zip_path=zip_path
+        )
+
+    def __get_filename_from_link(self, url: str) -> str:
+        """Turn a URL's path into a sanitized, lowercase filename."""
+        filepath = Path(urlparse(url).path)
+        stem = re.sub(r"\W+", "-", filepath.stem)
+        return f"{stem}{filepath.suffix}".lower()
+
+    async def __select_tabs(self, url: str) -> set[TabInfo]:
+        """Get the clickable tab links from the EIA RECS page layout."""
+
+        async def get_unselected_tabs(url):
+            soup = await self.get_soup(url)
+            unselected_tabs = soup.select("#tab-container a")
+            year = int(re.search(r"\d{4}", url)[0])
+            return {
+                TabInfo(
+                    url=urljoin(url, tab["href"]),
+                    name=re.sub(r"\W+", "-", tab.text.strip()).lower(),
+                    year=year,
+                )
+                for tab in unselected_tabs
+            }
+
+        # A page doesn't render its own (selected) tab as a link, so any single
+        # page only yields the *other* tabs. Fetch one more page (any other
+        # tab) and union the two sets to cover all of them.
+        first_unselected_tabs = await get_unselected_tabs(url)
+        another_tab_url = next(iter(first_unselected_tabs)).url
+        next_unselected_tabs = await get_unselected_tabs(another_tab_url)
+        return first_unselected_tabs.union(next_unselected_tabs)
+
+    async def __skip(self, **kwargs) -> list[str]:
+        """Tab handler that skips a tab entirely, archiving nothing."""
+        return []
diff --git a/src/pudl_archiver/archivers/validate.py b/src/pudl_archiver/archivers/validate.py
index e51af7b7..98491a82 100644
--- a/src/pudl_archiver/archivers/validate.py
+++ b/src/pudl_archiver/archivers/validate.py
@@ -12,7 +12,7 @@
 from pydantic import BaseModel
 
 from pudl_archiver.frictionless import DataPackage, Resource, ZipLayout
-from pudl_archiver.utils import Url
+from pudl_archiver.utils import Url, is_html_file
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -277,7 +277,7 @@ def _process_resource_diffs(
     return [*changed_resources, *created_resources, *deleted_resources]
 
 
-def _validate_file_type(path: Path, buffer: BytesIO) -> bool:
+def _validate_file_type(path: Path, buffer: BytesIO) -> bool:  # noqa:C901
     """Check that file appears valid based on extension."""
     extension = path.suffix
 
@@ -293,12 +293,29 @@
     if extension == ".xml" or extension == ".xbrl" or extension == ".xsd":
         return _validate_xml(buffer)
 
+    if extension == ".pdf":
+        header = buffer.read(5)
+        buffer.seek(0)
+        return header.startswith(b"%PDF-")
+
     if extension == ".parquet":
         return _validate_parquet(buffer)
 
     if extension == ".csv":
         return _validate_csv(buffer)
 
+    if extension == ".xls":
+        header = buffer.read(8)
+        buffer.seek(0)
+        # magic bytes for old-school xls file
+        return header.hex() == "d0cf11e0a1b11ae1"
+
+    if extension == ".html":
+        return is_html_file(buffer)
+
+    if extension == ".txt":
+        return _validate_text(buffer)
+
     logger.warning(f"No validations defined for files of type: {extension} - {path}")
     return True
 
@@ -326,3 +343,18 @@
         return True
     except (pa.lib.ArrowInvalid, pa.lib.ArrowException):
         return False
+
+
+def _validate_text(buffer: BytesIO) -> bool:
+    """Try decoding as UTF-8, then as Latin-1."""
+    sample = buffer.read(1_000_000)
+    buffer.seek(0)
+    try:
+        sample.decode(encoding="utf-8")
+        return True
+    except UnicodeDecodeError:
+        try:
+            # Latin-1 maps every possible byte, so this fallback effectively always succeeds.
+            sample.decode(encoding="latin-1")
+            return True
+        except UnicodeDecodeError:
+            return False
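+
+
+# Rough behavior sketch for _validate_text (hypothetical inputs):
+#   _validate_text(BytesIO("héllo".encode("utf-8")))   -> True
+#   _validate_text(BytesIO("héllo".encode("latin-1"))) -> True, via the fallback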
diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml
index 8cedd6ba..17c7504f 100644
--- a/src/pudl_archiver/package_data/zenodo_doi.yaml
+++ b/src/pudl_archiver/package_data/zenodo_doi.yaml
@@ -40,6 +40,8 @@ eiaaeo:
 eiacbecs:
   production_doi: 10.5281/zenodo.14782474
   sandbox_doi: 10.5072/zenodo.161000
+eiarecs:
+  production_doi: 10.5281/zenodo.14783267
 eia_bulk_elec:
   production_doi: 10.5281/zenodo.7067366
   sandbox_doi: 10.5072/zenodo.2356
diff --git a/src/pudl_archiver/utils.py b/src/pudl_archiver/utils.py
index 17c30915..1815ac94 100644
--- a/src/pudl_archiver/utils.py
+++ b/src/pudl_archiver/utils.py
@@ -6,6 +6,7 @@
 import zipfile
 from collections.abc import Awaitable, Callable
 from hashlib import md5
+from io import BytesIO
 from pathlib import Path
 
 import aiohttp
@@ -145,3 +146,11 @@ def compute_md5(file_path: UPath) -> str:
         hash_md5.update(chunk)
 
     return hash_md5.hexdigest()
+
+
+def is_html_file(fileobj: BytesIO) -> bool:
+    """Check the first 30 bytes of a file to see if there's an HTML header hiding in there."""
+    fileobj.seek(0)
+    header = fileobj.read(30).lower().strip()
+    fileobj.seek(0)
+    return b"<html" in header
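+
+
+# Quick sanity check (hypothetical inputs): a page starting with
+# b"<!DOCTYPE html><html>" lowercases to bytes containing b"<html" within the
+# first 30, so it reads as HTML; a zip starting with b"PK\x03\x04" does not.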