Merge branch 'main' into latest-epa-eia-crosswalk
zaneselvans authored Jan 31, 2025
2 parents 7c7b1f9 + abe25f9 commit ff63d21
Showing 3 changed files with 125 additions and 2 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
    inputs:
      datasets:
        description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-       default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
+       default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
        required: true
        type: string
      create_github_issue:
@@ -26,7 +26,7 @@ jobs:
    strategy:
      matrix:
        # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-       dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
+       dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
      fail-fast: false
    runs-on: ubuntu-latest
    permissions:
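For context on the matrix line above: GitHub's format('[{0}]', …) wraps the comma-separated dataset string in brackets, and fromJSON parses it into the list that fans out the matrix. A rough Python equivalent, illustrative only, with json.loads standing in for fromJSON:

import json

# The workflow_dispatch input (or the hard-coded fallback) is a string of
# comma-separated, individually quoted dataset names.
datasets = '"ferc2","ferc6"'
# format('[{0}]', datasets) wraps it in brackets; fromJSON parses the result.
matrix = json.loads(f"[{datasets}]")
print(matrix)  # ['ferc2', 'ferc6']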
120 changes: 120 additions & 0 deletions src/pudl_archiver/archivers/eia/eiacbecs.py
@@ -0,0 +1,120 @@
"""Archive EIA Commercial Buildings Energy Consumption Survey (CBECS)."""

import re
from pathlib import Path
from urllib.parse import urljoin

from pudl_archiver.archivers.classes import (
AbstractDatasetArchiver,
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout

BASE_URL = "https://www.eia.gov/consumption/commercial/data/"


class EiaCbecsArchiver(AbstractDatasetArchiver):
"""EIA CBECS archiver."""

name = "eiacbecs"

    async def get_resources(self) -> ArchiveAwaitable:
        """Download EIA-CBECS resources."""
        # We use the base URL and this link pattern to determine which years of
        # CBECS data exist, but these base year links are only a portion of the
        # full view links, so we construct the complete links within
        # get_year_resources.
        link_pattern = re.compile(r"commercial/data/(\d{4})/$", re.IGNORECASE)
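        # For illustration (hypothetical link, not from the source): this pattern
        # matches "https://www.eia.gov/consumption/commercial/data/2012/" and
        # captures "2012" as the year.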
        for link in await self.get_hyperlinks(BASE_URL, link_pattern):
            match = link_pattern.search(link)
            year = match.group(1)
            if int(year) > 2018:
                self.logger.warning(
                    f"There is a new year of data: {year}! This will almost certainly "
                    "require some updating of this archive."
                )
            yield self.get_year_resources(year)

    async def get_year_resources(self, year: str) -> ResourceInfo:
        """Download all files from all views for a year."""
        data_paths_in_archive = set()
        zip_path = self.download_directory / f"eiacbecs-{year}.zip"
        char_and_cons_pattern = (
            rf"(?:{year}|archive)(?:.*)/([a-z,\d]{{1,8}})(.xls|.xlsx|.pdf)$"
        )
        data_view_patterns = {
            "characteristics": re.compile(char_and_cons_pattern),
            "consumption": re.compile(char_and_cons_pattern),
            # Some of the microdata links look like csv/file01.csv, which doesn't
            # include the year or archive. Instead of adding a null option for that
            # first group, we add a whole new pattern for these two years, because
            # otherwise we'd pick up some of the 2018 pdf files that are on the
            # right-hand side of these pages.
            "microdata": re.compile(
                rf"(?:{year}/|archive/)(?:xls|pdf|csv|sas)/(.*)(.xls|.xlsx|.pdf|.csv|.exe|.zip)$"
                if year not in ["2003", "1999"]
                else r"^(?:csv|pdf)/(.*)(.csv|.pdf)$"
            ),
            # The most recent CBECS doesn't include a year or archive in the
            # methodology links, BUT there are almost always pdf files from 2018
            # that get caught up in these scrapers if we don't include year or
            # archive, so we have a special 2018 pattern.
            "methodology": re.compile(
                rf"(?:{year}|archive/pubs)(?:/pdf|)/(.*)(.pdf$)"
                if year != "2018"
                else r"/consumption/commercial(?:/data/2018|)/pdf/(.*)(.pdf)$"
            ),
        }
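        # For illustration (hypothetical links, not from the source): for 2012,
        # the characteristics/consumption pattern matches "2012/xls/b1.xlsx",
        # capturing ("b1", ".xlsx"); the non-1999/2003 microdata pattern matches
        # "2018/csv/file01.csv", capturing ("file01", ".csv").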

        for view, table_link_pattern in data_view_patterns.items():
            year_url = f"{BASE_URL}{year}/index.php?view={view}"
            for link in await self.get_hyperlinks(year_url, table_link_pattern):
                match = table_link_pattern.search(link)
                unique_id = (
                    match.group(1).replace("_", "-").replace(" ", "-").lower().strip()
                )
                file_extension = match.group(2)
                filename = f"eiacbecs-{year}-{view}-{unique_id}{file_extension}"
                file_url = urljoin(year_url, link)
                download_path = self.download_directory / filename
                await self.download_file(file_url, download_path)
                # There is a small-ish handful of files whose links redirect to
                # the main CBECS page, presumably because the link is broken. We
                # want to skip those files, so we check whether the first bytes of
                # the downloaded file look like HTML. If so we move on; otherwise
                # we add the file to the archive.
                with download_path.open("rb") as f:
                    first_bytes = f.read(20)
                if b"html" in first_bytes.lower().strip():
                    self.logger.warning(
                        f"Skipping {file_url} because it appears to be a redirect/html page."
                    )
                else:
                    self.add_to_archive(
                        zip_path=zip_path,
                        filename=filename,
                        blob=download_path.open("rb"),
                    )
                    data_paths_in_archive.add(filename)
                # Don't want to leave multiple files on disk, so delete
                # immediately after they're safely stored in the ZIP
                download_path.unlink()
        # Check if all of the views found any links
        year_has_all_views: dict[str, bool] = {
            view: any(fn for fn in data_paths_in_archive if view in fn)
            for view in data_view_patterns
        }
        views_without_files = [
            view for (view, has_files) in year_has_all_views.items() if not has_files
        ]
        if views_without_files:
            raise AssertionError(
                "We expect all years of EIA CBECS to have some data from all four "
                f"views, but we found these views without files for {year}: {views_without_files}"
            )
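        # For illustration (hypothetical state, not from the source): if a year
        # yielded only characteristics files, views_without_files would be
        # ["consumption", "microdata", "methodology"] and the AssertionError fires.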

        return ResourceInfo(
            local_path=zip_path,
            partitions={"year": year},
            layout=ZipLayout(file_paths=data_paths_in_archive),
        )
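The redirect check above sniffs only the first 20 bytes of each download. A minimal standalone sketch of that heuristic (hypothetical helper name and file path, assuming only that broken CBECS links serve an HTML page instead of the expected spreadsheet or PDF):

from pathlib import Path

def looks_like_html(path: Path, n_bytes: int = 20) -> bool:
    """Return True if the first bytes of a downloaded file resemble HTML."""
    with path.open("rb") as f:
        first_bytes = f.read(n_bytes)
    # Real .xlsx/.zip files start with b"PK" and PDFs with b"%PDF", while a
    # redirect page typically starts with something like b"<!doctype html>".
    return b"html" in first_bytes.lower().strip()

# Hypothetical usage:
# if looks_like_html(Path("eiacbecs-2018-microdata-file01.csv")):
#     print("skip: redirect page")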
3 changes: 3 additions & 0 deletions src/pudl_archiver/package_data/zenodo_doi.yaml
@@ -37,6 +37,9 @@ eia930:
eiaaeo:
  production_doi: 10.5281/zenodo.10838488
  sandbox_doi: 10.5072/zenodo.37746
+eiacbecs:
+  production_doi: 10.5281/zenodo.14782474
+  sandbox_doi: 10.5072/zenodo.161000
eia_bulk_elec:
  production_doi: 10.5281/zenodo.7067366
  sandbox_doi: 10.5072/zenodo.2356