From 44ea29207e8c921d86318442edd7dbf5fbc2b43a Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 24 Jan 2025 11:50:08 -0500 Subject: [PATCH 1/7] wip draft of multi-year mecs --- src/pudl_archiver/archivers/eia/eiamecs.py | 39 ++++++++++++------- .../package_data/zenodo_doi.yaml | 2 + 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiamecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py index c3ef0fbf..8638da29 100644 --- a/src/pudl_archiver/archivers/eia/eiamecs.py +++ b/src/pudl_archiver/archivers/eia/eiamecs.py @@ -8,6 +8,7 @@ ArchiveAwaitable, ResourceInfo, ) +from pudl_archiver.frictionless import ZipLayout BASE_URL = "https://www.eia.gov/consumption/manufacturing/data" logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -20,16 +21,22 @@ class EiaMECSArchiver(AbstractDatasetArchiver): async def get_resources(self) -> ArchiveAwaitable: """Download EIA-MECS resources.""" - for year in [2018]: + years_url = "https://www.eia.gov/consumption/data.php#mfg" + year_link_pattern = re.compile(r"(manufacturing/data/)(\d{4})/$") + for link in await self.get_hyperlinks(years_url, year_link_pattern): + match = year_link_pattern.search(link) + year = match.groups()[1] yield self.get_year_resources(year) async def get_year_resources(self, year: int) -> list[ResourceInfo]: """Download all excel tables for a year.""" + logger.info(f"Attempting to find resources for: {year}") table_link_pattern = re.compile(r"[Tt]able(\d{1,2})_(\d{1,2}).xlsx") # Loop through all download links for tables - tables = [] + data_paths_in_archive = set() year_url = f"{BASE_URL}/{year}" + zip_path = self.download_directory / f"eiamecs-{year}.zip" for table_link in await self.get_hyperlinks(year_url, table_link_pattern): table_link = f"{year_url}/{table_link}" logger.info(f"Fetching {table_link}") @@ -38,16 +45,20 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: major_num, minor_num = match.group(1), match.group(2) # 
Download file - download_path = ( - self.download_directory - / f"eia-mecs-{year}-table-{major_num}-{minor_num}.xlsx" + filename = f"eia-mecs-{year}-table-{major_num}-{minor_num}.xlsx" + download_path = self.download_directory / filename + await self.download_file(table_link, download_path) + self.add_to_archive( + zip_path=zip_path, + filename=filename, + blob=download_path.open("rb"), ) - await self.download_zipfile(table_link, download_path) - - tables.append( - ResourceInfo( - local_path=download_path, - partitions={"year": year, "table": f"{major_num}_{minor_num}"}, - ) - ) - return tables + data_paths_in_archive.add(filename) + # Don't want to leave multiple giant CSVs on disk, so delete + # immediately after they're safely stored in the ZIP + download_path.unlink() + return ResourceInfo( + local_path=zip_path, + partitions={"year": year}, + layout=ZipLayout(file_paths=data_paths_in_archive), + ) diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml index b8a61a53..b11a6cf3 100644 --- a/src/pudl_archiver/package_data/zenodo_doi.yaml +++ b/src/pudl_archiver/package_data/zenodo_doi.yaml @@ -34,6 +34,8 @@ eiaaeo: eia_bulk_elec: production_doi: 10.5281/zenodo.7067366 sandbox_doi: 10.5072/zenodo.2356 +eiamecs: + sandbox_doi: 10.5072/zenodo.149504 eiawater: production_doi: 10.5281/zenodo.7683135 sandbox_doi: 10.5072/zenodo.3160 From 046806a33e1cf859819be0845800808cdd8f729b Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 24 Jan 2025 15:46:52 -0500 Subject: [PATCH 2/7] still wip mecs file name madness --- src/pudl_archiver/archivers/eia/eiamecs.py | 37 ++++++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiamecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py index 8638da29..38df4892 100644 --- a/src/pudl_archiver/archivers/eia/eiamecs.py +++ b/src/pudl_archiver/archivers/eia/eiamecs.py @@ -26,26 +26,47 @@ async def get_resources(self)
-> ArchiveAwaitable: for link in await self.get_hyperlinks(years_url, year_link_pattern): match = year_link_pattern.search(link) year = match.groups()[1] - yield self.get_year_resources(year) + if int(year) >= 1994: + yield self.get_year_resources(year) async def get_year_resources(self, year: int) -> list[ResourceInfo]: """Download all excel tables for a year.""" logger.info(f"Attempting to find resources for: {year}") - table_link_pattern = re.compile(r"[Tt]able(\d{1,2})_(\d{1,2}).xlsx") - - # Loop through all download links for tables data_paths_in_archive = set() year_url = f"{BASE_URL}/{year}" zip_path = self.download_directory / f"eiamecs-{year}.zip" + if int(year) >= 2006: + table_link_pattern = re.compile( + r"(RSE|)[Tt]able(\d{1,2}|\d{1.1})_(\d{1,2})(.xlsx|.xls)" + ) + elif int(year) == 2002: + table_link_pattern = re.compile( + r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_(\d{1,2})(.xlsx|.xls)" + ) + elif int(year) == 1998: + table_link_pattern = re.compile( + r"(d|e)(\d{2})[a-z](\d{1,2})_(\d{1,2})(.xlsx|.xls)" + ) + elif int(year) == 1994: + table_link_pattern = re.compile(r"(m|)(\d{2})_(\d{2})([a-d]|)(.xlsx|.xls)") + + # Loop through all download links for tables for table_link in await self.get_hyperlinks(year_url, table_link_pattern): table_link = f"{year_url}/{table_link}" logger.info(f"Fetching {table_link}") # Get table major/minor number from links match = table_link_pattern.search(table_link) - major_num, minor_num = match.group(1), match.group(2) + # this is actually always first + is_rse = match.group(1) + is_rse = f"-{str.lower(is_rse)}" if is_rse != "" else "" + major_num = match.group(2) + minor_num = match.group(3) + extension = match.group(4) # Download file - filename = f"eia-mecs-{year}-table-{major_num}-{minor_num}.xlsx" + filename = ( + f"eia-mecs-{year}-table-{major_num}-{minor_num}{is_rse}{extension}" + ) download_path = self.download_directory / filename await self.download_file(table_link, download_path) self.add_to_archive( @@ -57,8 +78,10 
@@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: # Don't want to leave multiple giant CSVs on disk, so delete # immediately after they're safely stored in the ZIP download_path.unlink() - return ResourceInfo( + + resource_info = ResourceInfo( local_path=zip_path, partitions={"year": year}, layout=ZipLayout(file_paths=data_paths_in_archive), ) + return resource_info From 309b1b6947d670fa591a352a187759b41a09b981 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 24 Jan 2025 17:01:50 -0500 Subject: [PATCH 3/7] give up on renaming the old files --- src/pudl_archiver/archivers/eia/eiamecs.py | 47 ++++++++++++++-------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiamecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py index 38df4892..2b279cc5 100644 --- a/src/pudl_archiver/archivers/eia/eiamecs.py +++ b/src/pudl_archiver/archivers/eia/eiamecs.py @@ -26,8 +26,7 @@ async def get_resources(self) -> ArchiveAwaitable: for link in await self.get_hyperlinks(years_url, year_link_pattern): match = year_link_pattern.search(link) year = match.groups()[1] - if int(year) >= 1994: - yield self.get_year_resources(year) + yield self.get_year_resources(year) async def get_year_resources(self, year: int) -> list[ResourceInfo]: """Download all excel tables for a year.""" @@ -41,32 +40,48 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: ) elif int(year) == 2002: table_link_pattern = re.compile( - r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_(\d{1,2})(.xlsx|.xls)" + r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_\d{1,2}(.xlsx|.xls)" ) elif int(year) == 1998: table_link_pattern = re.compile( - r"(d|e)(\d{2})[a-z](\d{1,2})_(\d{1,2})(.xlsx|.xls)" + r"((d|e)\d{2}[a-z](\d{1,2})_(\d{1,2})(.xlsx|.xls))" ) elif int(year) == 1994: - table_link_pattern = re.compile(r"(m|)(\d{2})_(\d{2})([a-d]|)(.xlsx|.xls)") + table_link_pattern = re.compile( + r"((rse|)m\d{2}_(\d{2})([a-d]|)(.xlsx|.xls))" + ) + elif int(year) == 
1991: + table_link_pattern = re.compile(r"((rse|)mecs(\d{2})([a-z])(.xlsx|.xls))") # Loop through all download links for tables for table_link in await self.get_hyperlinks(year_url, table_link_pattern): table_link = f"{year_url}/{table_link}" logger.info(f"Fetching {table_link}") - # Get table major/minor number from links + # From 1998 and before there are a bunch of letters in the file names + # in patterns that are probably parsable somehow, but for now we are + # just going to keep the original file names match = table_link_pattern.search(table_link) - # this is actually always first - is_rse = match.group(1) - is_rse = f"-{str.lower(is_rse)}" if is_rse != "" else "" - major_num = match.group(2) - minor_num = match.group(3) - extension = match.group(4) + filename = match.group(1) + if int(year) > 1998: + # Get table major/minor number from links - # Download file - filename = ( - f"eia-mecs-{year}-table-{major_num}-{minor_num}{is_rse}{extension}" - ) + # this is actually always first + is_rse = match.group(1) + is_rse = match.group(1) + rse_map = { + "": "", + "d": "", + "RSE": "-rse", + "e": "-rse", + } + rse = rse_map[is_rse] + major_num = match.group(2) + minor_num = match.group(3) + extension = match.group(4) + # Download file + filename = ( + f"eia-mecs-{year}-table-{major_num}-{minor_num}{rse}{extension}" + ) download_path = self.download_directory / filename await self.download_file(table_link, download_path) self.add_to_archive( From 56b1b299be1dac89c90b49d8f381a33c3e9bce4a Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 24 Jan 2025 17:23:00 -0500 Subject: [PATCH 4/7] light cleanup --- src/pudl_archiver/archivers/eia/eiamecs.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiamecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py index 2b279cc5..009e9e7f 100644 --- a/src/pudl_archiver/archivers/eia/eiamecs.py +++ b/src/pudl_archiver/archivers/eia/eiamecs.py @@ -44,30 +44,33 
@@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: ) elif int(year) == 1998: table_link_pattern = re.compile( - r"((d|e)\d{2}[a-z](\d{1,2})_(\d{1,2})(.xlsx|.xls))" + r"(d|e)\d{2}([a-z]\d{1,2})_(\d{1,2})(.xlsx|.xls)" ) elif int(year) == 1994: + # These earlier years the pattern is functional but not actually that informative. + # so we will just use the original name by making the whole pattern a match table_link_pattern = re.compile( r"((rse|)m\d{2}_(\d{2})([a-d]|)(.xlsx|.xls))" ) elif int(year) == 1991: - table_link_pattern = re.compile(r"((rse|)mecs(\d{2})([a-z])(.xlsx|.xls))") + table_link_pattern = re.compile(r"((rse|)mecs(\d{2})([a-z]|)(.xlsx|.xls))") # Loop through all download links for tables for table_link in await self.get_hyperlinks(year_url, table_link_pattern): table_link = f"{year_url}/{table_link}" logger.info(f"Fetching {table_link}") + # We are going to rename the files in a standard format by extracting + # patterns from the table_link_pattern # From 1998 and before there are a bunch of letters in the file names # in patterns that are probably parsable somehow, but for now we are # just going to keep the original file names match = table_link_pattern.search(table_link) filename = match.group(1) if int(year) > 1998: - # Get table major/minor number from links - - # this is actually always first - is_rse = match.group(1) is_rse = match.group(1) + # there are several ways the they indicate that the files are + # "data" vs "rse". 
we will add this to the end of the file name + # but only for rse bc for many years data and the rse are together rse_map = { "": "", "d": "", @@ -78,7 +81,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: major_num = match.group(2) minor_num = match.group(3) extension = match.group(4) - # Download file + # Download filename filename = ( f"eia-mecs-{year}-table-{major_num}-{minor_num}{rse}{extension}" ) From 96ead608e49e34f0743fde820fdc85df8544bce3 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Mon, 27 Jan 2025 10:29:20 -0500 Subject: [PATCH 5/7] don't try to rename 1998 and move table patterns to dict --- src/pudl_archiver/archivers/eia/eiamecs.py | 45 +++++++++------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiamecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py index 009e9e7f..f4415cec 100644 --- a/src/pudl_archiver/archivers/eia/eiamecs.py +++ b/src/pudl_archiver/archivers/eia/eiamecs.py @@ -13,6 +13,17 @@ BASE_URL = "https://www.eia.gov/consumption/manufacturing/data" logger = logging.getLogger(f"catalystcoop.{__name__}") +TABLE_LINK_PATTERNS = { + "recent": r"(RSE|)[Tt]able(\d{1,2}|\d{1.1})_(\d{1,2})(.xlsx|.xls)", + 2002: r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_\d{1,2}(.xlsx|.xls)", + # These earlier years the pattern is functional but not actually very informative. 
+ # so we will just use the original name by making the whole pattern a match + 1998: r"((d|e)\d{2}([a-z]\d{1,2})_(\d{1,2})(.xlsx|.xls))", + 1994: r"((rse|)m\d{2}_(\d{2})([a-d]|)(.xlsx|.xls))", + 1991: r"((rse|)mecs(\d{2})([a-z]|)(.xlsx|.xls))", +} +"""Dictionary of """ + class EiaMECSArchiver(AbstractDatasetArchiver): """EIA MECS archiver.""" @@ -34,26 +45,13 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: data_paths_in_archive = set() year_url = f"{BASE_URL}/{year}" zip_path = self.download_directory / f"eiamecs-{year}.zip" - if int(year) >= 2006: - table_link_pattern = re.compile( - r"(RSE|)[Tt]able(\d{1,2}|\d{1.1})_(\d{1,2})(.xlsx|.xls)" - ) - elif int(year) == 2002: - table_link_pattern = re.compile( - r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_\d{1,2}(.xlsx|.xls)" - ) - elif int(year) == 1998: - table_link_pattern = re.compile( - r"(d|e)\d{2}([a-z]\d{1,2})_(\d{1,2})(.xlsx|.xls)" - ) - elif int(year) == 1994: - # These earlier years the pattern is functional but not actually that informative. - # so we will just use the original name by making the whole pattern a match - table_link_pattern = re.compile( - r"((rse|)m\d{2}_(\d{2})([a-d]|)(.xlsx|.xls))" - ) - elif int(year) == 1991: - table_link_pattern = re.compile(r"((rse|)mecs(\d{2})([a-z]|)(.xlsx|.xls))") + max_old_year = max( + [year for year in TABLE_LINK_PATTERNS if isinstance(year, int)] + ) + if int(year) > max_old_year: + table_link_pattern = re.compile(TABLE_LINK_PATTERNS["recent"]) + else: + table_link_pattern = re.compile(TABLE_LINK_PATTERNS[int(year)]) # Loop through all download links for tables for table_link in await self.get_hyperlinks(year_url, table_link_pattern): @@ -71,12 +69,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: # there are several ways the they indicate that the files are # "data" vs "rse". 
we will add this to the end of the file name # but only for rse bc for many years data and the rse are together - rse_map = { - "": "", - "d": "", - "RSE": "-rse", - "e": "-rse", - } + rse_map = {"": "", "d": "", "RSE": "-rse", "e": "-rse"} rse = rse_map[is_rse] major_num = match.group(2) minor_num = match.group(3) From 0db9ab325ccf361d20b53942875edef4a8bb3ad4 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Mon, 27 Jan 2025 10:57:48 -0500 Subject: [PATCH 6/7] table link pattern docs and add concurrency limit bc errors --- src/pudl_archiver/archivers/eia/eiamecs.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/pudl_archiver/archivers/eia/eiamecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py index f4415cec..6fe24c8e 100644 --- a/src/pudl_archiver/archivers/eia/eiamecs.py +++ b/src/pudl_archiver/archivers/eia/eiamecs.py @@ -22,13 +22,31 @@ 1994: r"((rse|)m\d{2}_(\d{2})([a-d]|)(.xlsx|.xls))", 1991: r"((rse|)mecs(\d{2})([a-z]|)(.xlsx|.xls))", } -"""Dictionary of """ +"""Dictionary of years or "latest" as keys and table link patterns as values. + +From 2006 and forward the link pattern is the same but all of the older years +have bespoke table link patterns. The groups to match in the regex patterns +will be used to rename the files for the archives. The order of those match +groups indicate various things: + +* first group: whether the file contains only Relative Standard Errors (RSE) +* second group: the major table number +* third group: the minor table number +* forth group: the file extension + +The years from 1998 and back have table link patterns that could be used in this +same format with 4 match groups, but the major and minor table numbers are not +actually stored in the file name. So for these older years we've turned the whole +pattern into a group and use that (the original file name) as the stored name in +the archive. 
+""" class EiaMECSArchiver(AbstractDatasetArchiver): """EIA MECS archiver.""" name = "eiamecs" + concurrency_limit = 5  # Number of files to concurrently download async def get_resources(self) -> ArchiveAwaitable: """Download EIA-MECS resources.""" From 0116d506495a83e1bdf2139dd6300e4445c4381b Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Wed, 29 Jan 2025 09:07:38 -0500 Subject: [PATCH 7/7] add dois to published archives, fix smol docs error and add valid year check --- src/pudl_archiver/archivers/eia/eiamecs.py | 7 ++++--- src/pudl_archiver/cli.py | 2 +- src/pudl_archiver/package_data/zenodo_doi.yaml | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiamecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py index 6fe24c8e..72220480 100644 --- a/src/pudl_archiver/archivers/eia/eiamecs.py +++ b/src/pudl_archiver/archivers/eia/eiamecs.py @@ -13,7 +13,7 @@ BASE_URL = "https://www.eia.gov/consumption/manufacturing/data" logger = logging.getLogger(f"catalystcoop.{__name__}") -TABLE_LINK_PATTERNS = { +TABLE_LINK_PATTERNS: dict[str | int, str] = { "recent": r"(RSE|)[Tt]able(\d{1,2}|\d{1.1})_(\d{1,2})(.xlsx|.xls)", 2002: r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_\d{1,2}(.xlsx|.xls)", # These earlier years the pattern is functional but not actually very informative. @@ -22,7 +22,7 @@ 1994: r"((rse|)m\d{2}_(\d{2})([a-d]|)(.xlsx|.xls))", 1991: r"((rse|)mecs(\d{2})([a-z]|)(.xlsx|.xls))", } -"""Dictionary of years or "latest" as keys and table link patterns as values. +"""Dictionary of years or "recent" as keys and table link patterns as values. From 2006 and forward the link pattern is the same but all of the older years have bespoke table link patterns.
The groups to match in the regex patterns @@ -55,7 +55,8 @@ async def get_resources(self) -> ArchiveAwaitable: for link in await self.get_hyperlinks(years_url, year_link_pattern): match = year_link_pattern.search(link) year = match.groups()[1] - yield self.get_year_resources(year) + if self.valid_year(year): + yield self.get_year_resources(year) async def get_year_resources(self, year: int) -> list[ResourceInfo]: """Download all excel tables for a year.""" diff --git a/src/pudl_archiver/cli.py b/src/pudl_archiver/cli.py index 352211e1..436c3946 100644 --- a/src/pudl_archiver/cli.py +++ b/src/pudl_archiver/cli.py @@ -29,7 +29,7 @@ def parse_main(args=None): nargs="*", help="Years to download data for. Supported datasets: censusdp1tract, censuspep, " "eia176, eia191, eia757a, eia860, eia860m, eia861, eia923, eia930, eia_bulk_elec, " - "eiaaeo, eiawater, epacamd_eia, epacems, ferc1, ferc2, ferc6, ferc60, ferc714, " + "eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, ferc1, ferc2, ferc6, ferc60, ferc714, " "mshamines, nrelatb, phmsagas", type=int, ) diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml index 3de44abe..4cd9abb9 100644 --- a/src/pudl_archiver/package_data/zenodo_doi.yaml +++ b/src/pudl_archiver/package_data/zenodo_doi.yaml @@ -35,7 +35,8 @@ eia_bulk_elec: production_doi: 10.5281/zenodo.7067366 sandbox_doi: 10.5072/zenodo.2356 eiamecs: - sandbox_doi: 10.5072/zenodo.149504 + production_doi: 10.5281/zenodo.14749820 + sandbox_doi: 10.5072/zenodo.158873 eiawater: production_doi: 10.5281/zenodo.7683135 sandbox_doi: 10.5072/zenodo.3160