diff --git a/src/pudl_archiver/archivers/eia/eiamecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py index c3ef0fbf..72220480 100644 --- a/src/pudl_archiver/archivers/eia/eiamecs.py +++ b/src/pudl_archiver/archivers/eia/eiamecs.py @@ -8,46 +8,110 @@ ArchiveAwaitable, ResourceInfo, ) +from pudl_archiver.frictionless import ZipLayout BASE_URL = "https://www.eia.gov/consumption/manufacturing/data" logger = logging.getLogger(f"catalystcoop.{__name__}") +TABLE_LINK_PATTERNS: dict[str | int, str] = { + "recent": r"(RSE|)[Tt]able(\d{1,2}|\d{1.1})_(\d{1,2})(.xlsx|.xls)", + 2002: r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_\d{1,2}(.xlsx|.xls)", + # These earlier years the pattern is functional but not actually very informative. + # so we will just use the original name by making the whole pattern a match + 1998: r"((d|e)\d{2}([a-z]\d{1,2})_(\d{1,2})(.xlsx|.xls))", + 1994: r"((rse|)m\d{2}_(\d{2})([a-d]|)(.xlsx|.xls))", + 1991: r"((rse|)mecs(\d{2})([a-z]|)(.xlsx|.xls))", +} +"""Dictionary of years or "recent" as keys and table link patterns as values. + +From 2006 and forward the link pattern is the same but all of the older years +have bespoke table link patterns. The groups to match in the regex patterns +will be used to rename the files for the archives. The order of those match +groups indicate various things: + +* first group: whether the file contains only Relative Standard Errors (RSE) +* second group: the major table number +* third group: the minor table number +* forth group: the file extension + +The years from 1998 and back have table link patterns that could be used in this +same format with 4 match groups, but the major and minor table numbers are not +actually stored in the file name. So for these older years we've turned the whole +pattern into a group and use that (the original file name) as the stored name in +the archive. +""" + class EiaMECSArchiver(AbstractDatasetArchiver): """EIA MECS archiver.""" name = "eiamecs" + concurrency_limit = 5 # Number of files to concurrently download async def get_resources(self) -> ArchiveAwaitable: """Download EIA-MECS resources.""" - for year in [2018]: - yield self.get_year_resources(year) + years_url = "https://www.eia.gov/consumption/data.php#mfg" + year_link_pattern = re.compile(r"(manufacturing/data/)(\d{4})/$") + for link in await self.get_hyperlinks(years_url, year_link_pattern): + match = year_link_pattern.search(link) + year = match.groups()[1] + if self.valid_year(year): + yield self.get_year_resources(year) async def get_year_resources(self, year: int) -> list[ResourceInfo]: """Download all excel tables for a year.""" - table_link_pattern = re.compile(r"[Tt]able(\d{1,2})_(\d{1,2}).xlsx") + logger.info(f"Attempting to find resources for: {year}") + data_paths_in_archive = set() + year_url = f"{BASE_URL}/{year}" + zip_path = self.download_directory / f"eiamecs-{year}.zip" + max_old_year = max( + [year for year in TABLE_LINK_PATTERNS if isinstance(year, int)] + ) + if int(year) > max_old_year: + table_link_pattern = re.compile(TABLE_LINK_PATTERNS["recent"]) + else: + table_link_pattern = re.compile(TABLE_LINK_PATTERNS[int(year)]) # Loop through all download links for tables - tables = [] - year_url = f"{BASE_URL}/{year}" for table_link in await self.get_hyperlinks(year_url, table_link_pattern): table_link = f"{year_url}/{table_link}" logger.info(f"Fetching {table_link}") - # Get table major/minor number from links + # We are going to rename the files in a standard format by extracting + # patterns from the table_link_pattern + # From 1998 and before there are a bunch of letters in the file names + # in patterns that are probably parsable somehow, but for now we are + # just going to keep the original file names match = table_link_pattern.search(table_link) - major_num, minor_num = match.group(1), match.group(2) - - # Download file - download_path = ( - self.download_directory - / f"eia-mecs-{year}-table-{major_num}-{minor_num}.xlsx" - ) - await self.download_zipfile(table_link, download_path) - - tables.append( - ResourceInfo( - local_path=download_path, - partitions={"year": year, "table": f"{major_num}_{minor_num}"}, + filename = match.group(1) + if int(year) > 1998: + is_rse = match.group(1) + # there are several ways the they indicate that the files are + # "data" vs "rse". we will add this to the end of the file name + # but only for rse bc for many years data and the rse are together + rse_map = {"": "", "d": "", "RSE": "-rse", "e": "-rse"} + rse = rse_map[is_rse] + major_num = match.group(2) + minor_num = match.group(3) + extension = match.group(4) + # Download filename + filename = ( + f"eia-mecs-{year}-table-{major_num}-{minor_num}{rse}{extension}" ) + download_path = self.download_directory / filename + await self.download_file(table_link, download_path) + self.add_to_archive( + zip_path=zip_path, + filename=filename, + blob=download_path.open("rb"), ) - return tables + data_paths_in_archive.add(filename) + # Don't want to leave multiple giant CSVs on disk, so delete + # immediately after they're safely stored in the ZIP + download_path.unlink() + + resource_info = ResourceInfo( + local_path=zip_path, + partitions={"year": year}, + layout=ZipLayout(file_paths=data_paths_in_archive), + ) + return resource_info diff --git a/src/pudl_archiver/cli.py b/src/pudl_archiver/cli.py index 352211e1..436c3946 100644 --- a/src/pudl_archiver/cli.py +++ b/src/pudl_archiver/cli.py @@ -29,7 +29,7 @@ def parse_main(args=None): nargs="*", help="Years to download data for. Supported datasets: censusdp1tract, censuspep, " "eia176, eia191, eia757a, eia860, eia860m, eia861, eia923, eia930, eia_bulk_elec, " - "eiaaeo, eiawater, epacamd_eia, epacems, ferc1, ferc2, ferc6, ferc60, ferc714, " + "eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, ferc1, ferc2, ferc6, ferc60, ferc714, " "mshamines, nrelatb, phmsagas", type=int, ) diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml index ceece57e..e77f2e8f 100644 --- a/src/pudl_archiver/package_data/zenodo_doi.yaml +++ b/src/pudl_archiver/package_data/zenodo_doi.yaml @@ -37,6 +37,9 @@ eiaaeo: eia_bulk_elec: production_doi: 10.5281/zenodo.7067366 sandbox_doi: 10.5072/zenodo.2356 +eiamecs: + production_doi: 10.5281/zenodo.14749820 + sandbox_doi: 10.5072/zenodo.158873 eiawater: production_doi: 10.5281/zenodo.7683135 sandbox_doi: 10.5072/zenodo.3160