Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make a multi-year EIA MECS archive #542

Merged
merged 9 commits into from
Jan 29, 2025
63 changes: 37 additions & 26 deletions src/pudl_archiver/archivers/eia/eiamecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,40 @@
BASE_URL = "https://www.eia.gov/consumption/manufacturing/data"
logger = logging.getLogger(f"catalystcoop.{__name__}")

# Regex source strings used to find table download links, keyed by survey year.
# The "recent" entry is the pattern used for any year newer than the largest
# integer key; each integer key holds the bespoke pattern for that older year.
# The dot before the file extension is escaped so it only matches a literal ".".
TABLE_LINK_PATTERNS = {
    # The former second alternative in the major-number group (``\d{1.1}``) was
    # not a valid quantifier and could never match a real file name, so only
    # ``\d{1,2}`` is kept. Group numbering is unchanged.
    "recent": r"(RSE|)[Tt]able(\d{1,2})_(\d{1,2})(\.xlsx|\.xls)",
    # NOTE(review): the separator between the major and minor table numbers is
    # left as an any-character match; confirm whether it should be a literal dot.
    2002: r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_\d{1,2}(\.xlsx|\.xls)",
    # For these earlier years the pattern is functional but not very
    # informative, so the whole pattern is wrapped in a group and the original
    # file name (group 1) is used as-is.
    1998: r"((d|e)\d{2}([a-z]\d{1,2})_(\d{1,2})(\.xlsx|\.xls))",
    1994: r"((rse|)m\d{2}_(\d{2})([a-d]|)(\.xlsx|\.xls))",
    1991: r"((rse|)mecs(\d{2})([a-z]|)(\.xlsx|\.xls))",
}
"""Dictionary of years or "recent" as keys and table link patterns as values.
cmgosnell marked this conversation as resolved.
Show resolved Hide resolved

From 2006 and forward the link pattern is the same but all of the older years
have bespoke table link patterns. The groups to match in the regex patterns
will be used to rename the files for the archives. The order of those match
groups indicate various things:

* first group: whether the file contains only Relative Standard Errors (RSE)
* second group: the major table number
* third group: the minor table number
* fourth group: the file extension

The years from 1998 and back have table link patterns that could be used in this
same format with 4 match groups, but the major and minor table numbers are not
actually stored in the file name. So for these older years we've turned the whole
pattern into a group and use that (the original file name) as the stored name in
the archive.
"""


class EiaMECSArchiver(AbstractDatasetArchiver):
"""EIA MECS archiver."""

name = "eiamecs"
concurrency_limit = 5 # Number of files to concurrently download

async def get_resources(self) -> ArchiveAwaitable:
"""Download EIA-MECS resources."""
Expand All @@ -34,26 +63,13 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
data_paths_in_archive = set()
year_url = f"{BASE_URL}/{year}"
zip_path = self.download_directory / f"eiamecs-{year}.zip"
if int(year) >= 2006:
table_link_pattern = re.compile(
r"(RSE|)[Tt]able(\d{1,2}|\d{1.1})_(\d{1,2})(.xlsx|.xls)"
)
elif int(year) == 2002:
table_link_pattern = re.compile(
r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_\d{1,2}(.xlsx|.xls)"
)
elif int(year) == 1998:
table_link_pattern = re.compile(
r"(d|e)\d{2}([a-z]\d{1,2})_(\d{1,2})(.xlsx|.xls)"
)
elif int(year) == 1994:
# These earlier years the pattern is functional but not actually that informative.
# so we will just use the original name by making the whole pattern a match
table_link_pattern = re.compile(
r"((rse|)m\d{2}_(\d{2})([a-d]|)(.xlsx|.xls))"
)
elif int(year) == 1991:
table_link_pattern = re.compile(r"((rse|)mecs(\d{2})([a-z]|)(.xlsx|.xls))")
max_old_year = max(
[year for year in TABLE_LINK_PATTERNS if isinstance(year, int)]
)
if int(year) > max_old_year:
table_link_pattern = re.compile(TABLE_LINK_PATTERNS["recent"])
else:
table_link_pattern = re.compile(TABLE_LINK_PATTERNS[int(year)])

# Loop through all download links for tables
for table_link in await self.get_hyperlinks(year_url, table_link_pattern):
Expand All @@ -71,12 +87,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
# there are several ways that they indicate that the files are
# "data" vs "rse". we will add this to the end of the file name
# but only for rse bc for many years data and the rse are together
rse_map = {
"": "",
"d": "",
"RSE": "-rse",
"e": "-rse",
}
rse_map = {"": "", "d": "", "RSE": "-rse", "e": "-rse"}
rse = rse_map[is_rse]
major_num = match.group(2)
minor_num = match.group(3)
Expand Down