From 693714f530f29a30026c0e918cfc13d8b71408ca Mon Sep 17 00:00:00 2001 From: Nilay Kumar Date: Wed, 22 Jan 2025 14:15:42 -0500 Subject: [PATCH 1/9] Initial work on eiarecs --- src/pudl_archiver/archivers/eia/eiarecs.py | 60 ++++++++++++++++++++++ src/pudl_archiver/metadata/sources.py | 13 +++++ 2 files changed, 73 insertions(+) create mode 100644 src/pudl_archiver/archivers/eia/eiarecs.py diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py new file mode 100644 index 00000000..aa62ce15 --- /dev/null +++ b/src/pudl_archiver/archivers/eia/eiarecs.py @@ -0,0 +1,60 @@ +"""Archive EIA Residential Energy Consumption Survey (RECS).""" + +# TODO: +# - is the metadata done correctly? +# - do we want to just grab the zips? +# - do we want to zip everything up? +# - how to partition relative to the other tabs? +# - add in other years of data + +import logging +import re + +from pudl_archiver.archivers.classes import ( + AbstractDatasetArchiver, + ArchiveAwaitable, + ResourceInfo, +) + +BASE_URL = "https://www.eia.gov/consumption/residential/data" +logger = logging.getLogger(f"catalystcoop.{__name__}") + + +class EiaRECSArchiver(AbstractDatasetArchiver): + """EIA RECS archiver.""" + + name = "eiarecs" + + async def get_resources(self) -> ArchiveAwaitable: + """Download EIA-RECS resources.""" + for year in [2020]: + yield self.get_year_resources(year) + + async def get_year_resources(self, year: int) -> list[ResourceInfo]: + """Download all excel tables for a year.""" + table_link_pattern = re.compile(r"HC (\d{1,2}).(\d{1,2}).xlsx") + + # Loop through all download links for tables + tables = [] + year_url = f"{BASE_URL}/{year}" + for table_link in await self.get_hyperlinks(year_url, table_link_pattern): + table_link = f"{year_url}/{table_link}" + logger.info(f"Fetching {table_link}") + # Get table major/minor number from links + match = table_link_pattern.search(table_link) + major_num, minor_num = match.group(1), match.group(2) + + # Download file + download_path = ( + self.download_directory + / f"eia-recs-{year}-hc-{major_num}-{minor_num}.xlsx" + ) + await self.download_zipfile(table_link, download_path) + + tables.append( + ResourceInfo( + local_path=download_path, + partitions={"year": year, "hc": f"{major_num}_{minor_num}"}, + ) + ) + return tables diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index ba63828b..28b8fe31 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -119,6 +119,7 @@ "contributors": [CONTRIBUTORS["catalyst-cooperative"]], }, "eiarecs": { +<<<<<<< HEAD "title": "EIA RECS -- Residential Energy Consumption Survey", "path": "https://www.eia.gov/consumption/residential/", "description": ( @@ -283,6 +284,18 @@ "avoided emissions", } ), +======= + "title": "EIA Residential Energy Consumption Survey", + "path": "https://www.eia.gov/consumption/residential/data/2020/", + "description": ( + "EIA Form 457 is commonly known as the Residential Energy Consumption Survey" + "(RECS). RECS is a national sample survey that collects detailed information" + "on household energy characteristics. The data is tabulated by geography" + "housing unit type, income, etc. RECS is conducted roughly every five years." + ), + "working_partitions": {"years": [2020, 2015, 2009, 2005, 2001, 1997, 1993]}, + "keywords": sorted({"residential", "RECS"}), +>>>>>>> 2a0933e (Initial work on eiarecs) "license_raw": LICENSES["us-govt"], "license_pudl": LICENSES["cc-by-4.0"], "contributors": [CONTRIBUTORS["catalyst-cooperative"]], From de35d0e40a150a816e128d131e18856e6453153c Mon Sep 17 00:00:00 2001 From: Nilay Kumar Date: Wed, 22 Jan 2025 14:25:25 -0500 Subject: [PATCH 2/9] Forgot to save the rebase edit (awkward) --- src/pudl_archiver/metadata/sources.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index 28b8fe31..ba63828b 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -119,7 +119,6 @@ "contributors": [CONTRIBUTORS["catalyst-cooperative"]], }, "eiarecs": { -<<<<<<< HEAD "title": "EIA RECS -- Residential Energy Consumption Survey", "path": "https://www.eia.gov/consumption/residential/", "description": ( @@ -284,18 +283,6 @@ "avoided emissions", } ), -======= - "title": "EIA Residential Energy Consumption Survey", - "path": "https://www.eia.gov/consumption/residential/data/2020/", - "description": ( - "EIA Form 457 is commonly known as the Residential Energy Consumption Survey" - "(RECS). RECS is a national sample survey that collects detailed information" - "on household energy characteristics. The data is tabulated by geography" - "housing unit type, income, etc. RECS is conducted roughly every five years." - ), - "working_partitions": {"years": [2020, 2015, 2009, 2005, 2001, 1997, 1993]}, - "keywords": sorted({"residential", "RECS"}), ->>>>>>> 2a0933e (Initial work on eiarecs) "license_raw": LICENSES["us-govt"], "license_pudl": LICENSES["cc-by-4.0"], "contributors": [CONTRIBUTORS["catalyst-cooperative"]], From 274b7ddf9e944cf8e6c737455748194368309311 Mon Sep 17 00:00:00 2001 From: Nilay Kumar Date: Wed, 22 Jan 2025 17:17:48 -0500 Subject: [PATCH 3/9] Added consumption and state data --- src/pudl_archiver/archivers/eia/eiarecs.py | 85 +++++++++++++++------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py index aa62ce15..597085c7 100644 --- a/src/pudl_archiver/archivers/eia/eiarecs.py +++ b/src/pudl_archiver/archivers/eia/eiarecs.py @@ -1,8 +1,8 @@ """Archive EIA Residential Energy Consumption Survey (RECS).""" # TODO: -# - is the metadata done correctly? -# - do we want to just grab the zips? +# - grab all the data and then zip it up +# - make sure we're not missing anything with like ce1.2a.xlsx # - do we want to zip everything up? # - how to partition relative to the other tabs? # - add in other years of data @@ -16,7 +16,33 @@ ResourceInfo, ) -BASE_URL = "https://www.eia.gov/consumption/residential/data" +LINK_PATTERNS = [ + { + "base_url": "https://www.eia.gov/consumption/residential/data", + "php_extension": "index.php?view=characteristics", + "prefix": "hc", + "pattern": re.compile(r"HC (\d{1,2})\.(\d{1,2})\.xlsx"), + }, + { + "base_url": "https://www.eia.gov/consumption/residential/data", + "php_extension": "index.php?view=consumption", + "prefix": "ce", + "pattern": re.compile(r"ce(\d)\.(\d{1,2})[a-z]?\.xlsx"), + }, + { + "base_url": "https://www.eia.gov/consumption/residential/data", + "php_extension": "index.php?view=state", + "prefix": "state", + "pattern": re.compile(r"State (.*)\.xlsx"), + "no_version": True, + }, + { + "base_url": "https://www.eia.gov/consumption/residential/data", + "php_extension": "index.php?view=state", + "prefix": "state-ce", + "pattern": re.compile(r"ce(\d)\.(\d{1,2})\.(.*)\.xlsx"), + }, +] logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -32,29 +58,38 @@ async def get_resources(self) -> ArchiveAwaitable: async def get_year_resources(self, year: int) -> list[ResourceInfo]: """Download all excel tables for a year.""" - table_link_pattern = re.compile(r"HC (\d{1,2}).(\d{1,2}).xlsx") - # Loop through all download links for tables tables = [] - year_url = f"{BASE_URL}/{year}" - for table_link in await self.get_hyperlinks(year_url, table_link_pattern): - table_link = f"{year_url}/{table_link}" - logger.info(f"Fetching {table_link}") - # Get table major/minor number from links - match = table_link_pattern.search(table_link) - major_num, minor_num = match.group(1), match.group(2) - - # Download file - download_path = ( - self.download_directory - / f"eia-recs-{year}-hc-{major_num}-{minor_num}.xlsx" - ) - await self.download_zipfile(table_link, download_path) - - tables.append( - ResourceInfo( - local_path=download_path, - partitions={"year": year, "hc": f"{major_num}_{minor_num}"}, + for pattern_dict in LINK_PATTERNS: + year_url = f"{pattern_dict['base_url']}/{year}" + url = f"{year_url}/{pattern_dict['php_extension']}" + table_link_pattern = pattern_dict["pattern"] + for table_link in await self.get_hyperlinks(url, table_link_pattern): + table_link = f"{year_url}/{table_link}" + logger.info(f"Fetching {table_link}") + # Get table major/minor number from links + match = table_link_pattern.search(table_link) + output_filename = f"eia-recs-{year}-{pattern_dict['prefix']}" + if "no_version" in pattern_dict and pattern_dict["no_version"]: + output_filename += "-" + match.group(1).lower().replace(" ", "-") + else: + major_num, minor_num = ( + match.group(1), + match.group(2), + ) + output_filename += f"-{major_num}-{minor_num}" + if len(match.groups()) >= 3: + output_filename += "-" + match.group(3) + output_filename += ".xlsx" + + # Download file + download_path = self.download_directory / output_filename + await self.download_zipfile(table_link, download_path) + + tables.append( + ResourceInfo( + local_path=download_path, + partitions={"year": year}, + ) ) - ) return tables From abd35668337284a408e6ad1d1615ebe017942a93 Mon Sep 17 00:00:00 2001 From: Nilay Kumar Date: Wed, 22 Jan 2025 18:43:27 -0500 Subject: [PATCH 4/9] Adding files to zip --- src/pudl_archiver/archivers/eia/eiarecs.py | 48 +++++++++++++++------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py index 597085c7..679a3182 100644 --- a/src/pudl_archiver/archivers/eia/eiarecs.py +++ b/src/pudl_archiver/archivers/eia/eiarecs.py @@ -1,12 +1,5 @@ """Archive EIA Residential Energy Consumption Survey (RECS).""" -# TODO: -# - grab all the data and then zip it up -# - make sure we're not missing anything with like ce1.2a.xlsx -# - do we want to zip everything up? -# - how to partition relative to the other tabs? -# - add in other years of data - import logging import re @@ -15,33 +8,45 @@ ArchiveAwaitable, ResourceInfo, ) +from pudl_archiver.frictionless import ZipLayout LINK_PATTERNS = [ + # housing characteristics { "base_url": "https://www.eia.gov/consumption/residential/data", "php_extension": "index.php?view=characteristics", "prefix": "hc", "pattern": re.compile(r"HC (\d{1,2})\.(\d{1,2})\.xlsx"), }, + # consumption & expenditures { "base_url": "https://www.eia.gov/consumption/residential/data", "php_extension": "index.php?view=consumption", "prefix": "ce", - "pattern": re.compile(r"ce(\d)\.(\d{1,2})[a-z]?\.xlsx"), + "pattern": re.compile(r"ce(\d)\.(\d{1,2})([a-z]?)\.xlsx"), }, + # state data (housing characteristics) { "base_url": "https://www.eia.gov/consumption/residential/data", "php_extension": "index.php?view=state", "prefix": "state", "pattern": re.compile(r"State (.*)\.xlsx"), - "no_version": True, }, + # state data (consumption & expenditures) { "base_url": "https://www.eia.gov/consumption/residential/data", "php_extension": "index.php?view=state", "prefix": "state-ce", "pattern": re.compile(r"ce(\d)\.(\d{1,2})\.(.*)\.xlsx"), }, + # microdata + # adding this in will require major changes+cleanup to the code below + # { + # "base_url": "https://www.eia.gov/consumption/residential/data", + # "php_extension": "index.php?view=microdata", + # "prefix": "udata", + # "pattern": re.compile(r"(recs.*\d{4}.*public.*)\.(?:zip|csv|xlsx)", re.IGNORECASE), + # } ] logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -60,7 +65,11 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: """Download all excel tables for a year.""" # Loop through all download links for tables tables = [] + zip_path = self.download_directory / f"eia-recs-{year}.zip" + data_paths_in_archive = set() + # Loop through different categories of data (all .xlsx) for pattern_dict in LINK_PATTERNS: + # Each category of data has its own url, etc. year_url = f"{pattern_dict['base_url']}/{year}" url = f"{year_url}/{pattern_dict['php_extension']}" table_link_pattern = pattern_dict["pattern"] @@ -69,27 +78,38 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: logger.info(f"Fetching {table_link}") # Get table major/minor number from links match = table_link_pattern.search(table_link) + # We've gotta do a bit of wrangling to get the output filename + # to match the url somewhat + n_groups = len(match.groups()) output_filename = f"eia-recs-{year}-{pattern_dict['prefix']}" - if "no_version" in pattern_dict and pattern_dict["no_version"]: - output_filename += "-" + match.group(1).lower().replace(" ", "-") + if n_groups == 1: + output_filename += "-" + match.group(1).lower().replace(" ", "_") else: major_num, minor_num = ( match.group(1), match.group(2), ) output_filename += f"-{major_num}-{minor_num}" - if len(match.groups()) >= 3: + if n_groups == 3 and match.group(3) != "": output_filename += "-" + match.group(3) output_filename += ".xlsx" # Download file download_path = self.download_directory / output_filename - await self.download_zipfile(table_link, download_path) + await self.download_file(table_link, download_path) + self.add_to_archive( + zip_path=zip_path, + filename=output_filename, + blob=download_path.open("rb"), + ) + data_paths_in_archive.add(output_filename) + download_path.unlink() tables.append( ResourceInfo( - local_path=download_path, + local_path=zip_path, partitions={"year": year}, + layout=ZipLayout(file_paths=data_paths_in_archive), ) ) return tables From 67cbc8ce9affac23bc124387d124c82efc411266 Mon Sep 17 00:00:00 2001 From: Nilay Kumar Date: Wed, 22 Jan 2025 18:54:12 -0500 Subject: [PATCH 5/9] Removing accidental zipping in loop --- src/pudl_archiver/archivers/eia/eiarecs.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py index 679a3182..310f4034 100644 --- a/src/pudl_archiver/archivers/eia/eiarecs.py +++ b/src/pudl_archiver/archivers/eia/eiarecs.py @@ -105,11 +105,11 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: data_paths_in_archive.add(output_filename) download_path.unlink() - tables.append( - ResourceInfo( - local_path=zip_path, - partitions={"year": year}, - layout=ZipLayout(file_paths=data_paths_in_archive), - ) - ) + tables.append( + ResourceInfo( + local_path=zip_path, + partitions={"year": year}, + layout=ZipLayout(file_paths=data_paths_in_archive), + ) + ) return tables From b08b0e1cc333634c9fba0aaa2084dabc689d9cd8 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Wed, 29 Jan 2025 16:12:52 -0500 Subject: [PATCH 6/9] chore: replace output filename munging code --- src/pudl_archiver/archivers/eia/eiarecs.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py index 310f4034..3fa27105 100644 --- a/src/pudl_archiver/archivers/eia/eiarecs.py +++ b/src/pudl_archiver/archivers/eia/eiarecs.py @@ -78,21 +78,12 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: logger.info(f"Fetching {table_link}") # Get table major/minor number from links match = table_link_pattern.search(table_link) - # We've gotta do a bit of wrangling to get the output filename - # to match the url somewhat - n_groups = len(match.groups()) - output_filename = f"eia-recs-{year}-{pattern_dict['prefix']}" - if n_groups == 1: - output_filename += "-" + match.group(1).lower().replace(" ", "_") - else: - major_num, minor_num = ( - match.group(1), - match.group(2), - ) - output_filename += f"-{major_num}-{minor_num}" - if n_groups == 3 and match.group(3) != "": - output_filename += "-" + match.group(3) - output_filename += ".xlsx" + matched_metadata = ( + "-".join(g for g in match.groups() if g).replace(" ", "_").lower() + ) + output_filename = ( + f"eia-recs-{year}-{pattern_dict['prefix']}-{matched_metadata}.xlsx" + ) # Download file download_path = self.download_directory / output_filename From 1b874173074d6fa1d109e59907f12aee79719c58 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Wed, 29 Jan 2025 18:13:31 -0500 Subject: [PATCH 7/9] feat: add 2020 microdata + methodology --- src/pudl_archiver/archivers/eia/eiarecs.py | 134 ++++++++++++--------- 1 file changed, 78 insertions(+), 56 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py index 3fa27105..81fd611d 100644 --- a/src/pudl_archiver/archivers/eia/eiarecs.py +++ b/src/pudl_archiver/archivers/eia/eiarecs.py @@ -2,6 +2,9 @@ import logging import re +from dataclasses import dataclass +from io import BytesIO +from urllib.parse import urljoin from pudl_archiver.archivers.classes import ( AbstractDatasetArchiver, @@ -10,44 +13,60 @@ ) from pudl_archiver.frictionless import ZipLayout -LINK_PATTERNS = [ - # housing characteristics - { - "base_url": "https://www.eia.gov/consumption/residential/data", - "php_extension": "index.php?view=characteristics", - "prefix": "hc", - "pattern": re.compile(r"HC (\d{1,2})\.(\d{1,2})\.xlsx"), - }, - # consumption & expenditures - { - "base_url": "https://www.eia.gov/consumption/residential/data", - "php_extension": "index.php?view=consumption", - "prefix": "ce", - "pattern": re.compile(r"ce(\d)\.(\d{1,2})([a-z]?)\.xlsx"), - }, - # state data (housing characteristics) - { - "base_url": "https://www.eia.gov/consumption/residential/data", - "php_extension": "index.php?view=state", - "prefix": "state", - "pattern": re.compile(r"State (.*)\.xlsx"), - }, - # state data (consumption & expenditures) - { - "base_url": "https://www.eia.gov/consumption/residential/data", - "php_extension": "index.php?view=state", - "prefix": "state-ce", - "pattern": re.compile(r"ce(\d)\.(\d{1,2})\.(.*)\.xlsx"), - }, - # microdata - # adding this in will require major changes+cleanup to the code below - # { - # "base_url": "https://www.eia.gov/consumption/residential/data", - # "php_extension": "index.php?view=microdata", - # "prefix": "udata", - # "pattern": re.compile(r"(recs.*\d{4}.*public.*)\.(?:zip|csv|xlsx)", re.IGNORECASE), - # } -] + +@dataclass +class LinkSet: + """Information a set of links in one tab of the RECS viewer. + + See https://www.eia.gov/consumption/residential/data/2020/. + """ + + url: str + short_name: str + pattern: re.Pattern + + +def _url_for(year: int, view: str): + """Get the URL for a specific RECS year/tab combo.""" + return ( + f"https://www.eia.gov/consumption/residential/data/{year}/index.php?view={view}" + ) + + +YEAR_LINK_SETS = { + 2020: { + "housing_characteristics": LinkSet( + url=_url_for(year=2020, view="characteristics"), + short_name="hc", + pattern=re.compile(r"HC (\d{1,2}\.\d{1,2})\.(xlsx)"), + ), + "consumption & expenditures": LinkSet( + url=_url_for(year=2020, view="consumption"), + short_name="ce", + pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.(xlsx)"), + ), + "state data (housing characteristics)": LinkSet( + url=_url_for(year=2020, view="state"), + short_name="state", + pattern=re.compile(r"State (.*)\.(xlsx)"), + ), + "state data (consumption & expenditures)": LinkSet( + url=_url_for(year=2020, view="state"), + short_name="state-ce", + pattern=re.compile(r"ce(\d\.\d{1,2}\..*)\.(xlsx)"), + ), + "microdata": LinkSet( + url=_url_for(year=2020, view="microdata"), + short_name="microdata", + pattern=re.compile(r"(recs.*public.*)\.(csv)"), + ), + "methodology": LinkSet( + url=_url_for(year=2020, view="methodology"), + short_name="methodology", + pattern=re.compile(r"pdf/(.+)\.(pdf)"), + ), + } +} logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -61,6 +80,11 @@ async def get_resources(self) -> ArchiveAwaitable: for year in [2020]: yield self.get_year_resources(year) + def __is_html_file(self, fileobj: BytesIO) -> bool: + header = fileobj.read(30).lower().strip() + fileobj.seek(0) + return b"" in header + async def get_year_resources(self, year: int) -> list[ResourceInfo]: """Download all excel tables for a year.""" # Loop through all download links for tables @@ -68,31 +92,29 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: zip_path = self.download_directory / f"eia-recs-{year}.zip" data_paths_in_archive = set() # Loop through different categories of data (all .xlsx) - for pattern_dict in LINK_PATTERNS: - # Each category of data has its own url, etc. - year_url = f"{pattern_dict['base_url']}/{year}" - url = f"{year_url}/{pattern_dict['php_extension']}" - table_link_pattern = pattern_dict["pattern"] - for table_link in await self.get_hyperlinks(url, table_link_pattern): - table_link = f"{year_url}/{table_link}" + link_sets = YEAR_LINK_SETS[year] + for link_set in link_sets.values(): + for table_link in await self.get_hyperlinks(link_set.url, link_set.pattern): + table_link = urljoin(link_set.url, table_link) logger.info(f"Fetching {table_link}") - # Get table major/minor number from links - match = table_link_pattern.search(table_link) + match = link_set.pattern.search(table_link) matched_metadata = ( - "-".join(g for g in match.groups() if g).replace(" ", "_").lower() - ) - output_filename = ( - f"eia-recs-{year}-{pattern_dict['prefix']}-{matched_metadata}.xlsx" + match.group(1).replace(".", "-").replace(" ", "_").lower() ) + matched_format = match.group(2) + output_filename = f"eia-recs-{year}-{link_set.short_name}-{matched_metadata}.{matched_format}" # Download file download_path = self.download_directory / output_filename await self.download_file(table_link, download_path) - self.add_to_archive( - zip_path=zip_path, - filename=output_filename, - blob=download_path.open("rb"), - ) + with download_path.open("rb") as f: + if self.__is_html_file(f): + continue + self.add_to_archive( + zip_path=zip_path, + filename=output_filename, + blob=f, + ) data_paths_in_archive.add(output_filename) download_path.unlink() From f06f7e3b938de545a80460cdf0d62329687813ff Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Wed, 29 Jan 2025 18:40:59 -0500 Subject: [PATCH 8/9] feat: add 2015 2015 methodology required some changes to allow for downloading html files. --- src/pudl_archiver/archivers/eia/eiarecs.py | 99 ++++++++++++++++------ 1 file changed, 75 insertions(+), 24 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py index 81fd611d..8f537d85 100644 --- a/src/pudl_archiver/archivers/eia/eiarecs.py +++ b/src/pudl_archiver/archivers/eia/eiarecs.py @@ -21,9 +21,11 @@ class LinkSet: See https://www.eia.gov/consumption/residential/data/2020/. """ - url: str + view: str short_name: str + extension: str pattern: re.Pattern + skip_if_html: bool = True def _url_for(year: int, view: str): @@ -36,36 +38,81 @@ def _url_for(year: int, view: str): YEAR_LINK_SETS = { 2020: { "housing_characteristics": LinkSet( - url=_url_for(year=2020, view="characteristics"), + view="characteristics", short_name="hc", - pattern=re.compile(r"HC (\d{1,2}\.\d{1,2})\.(xlsx)"), + pattern=re.compile(r"HC (\d{1,2}\.\d{1,2})\.xlsx"), + extension="xlsx", ), "consumption & expenditures": LinkSet( - url=_url_for(year=2020, view="consumption"), + view="consumption", short_name="ce", - pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.(xlsx)"), + pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.xlsx"), + extension="xlsx", ), "state data (housing characteristics)": LinkSet( - url=_url_for(year=2020, view="state"), - short_name="state", - pattern=re.compile(r"State (.*)\.(xlsx)"), + view="state", + short_name="state-hc", + pattern=re.compile(r"State (.*)\.xlsx"), + extension="xlsx", ), "state data (consumption & expenditures)": LinkSet( - url=_url_for(year=2020, view="state"), + view="state", short_name="state-ce", - pattern=re.compile(r"ce(\d\.\d{1,2}\..*)\.(xlsx)"), + pattern=re.compile(r"ce(\d\.\d{1,2}\..*)\.xlsx"), + extension="xlsx", ), "microdata": LinkSet( - url=_url_for(year=2020, view="microdata"), + view="microdata", short_name="microdata", - pattern=re.compile(r"(recs.*public.*)\.(csv)"), + pattern=re.compile(r"(recs.*public.*)\.csv"), + extension="csv", + ), + "microdata-codebook": LinkSet( + view="microdata", + short_name="microdata", + pattern=re.compile(r"(RECS 2020 Codebook.*v.)\.xlsx"), + extension="xlsx", + ), + "methodology": LinkSet( + view="methodology", + short_name="methodology", + pattern=re.compile(r"pdf/(.+)\.pdf"), + extension="pdf", + ), + }, + 2015: { + "housing_characteristics": LinkSet( + view="characteristics", + short_name="hc", + pattern=re.compile(r"hc(\d{1,2}\.\d{1,2})\.xlsx"), + extension="xlsx", + ), + "consumption & expenditures": LinkSet( + view="consumption", + short_name="ce", + pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.xlsx"), + extension="xlsx", + ), + "microdata": LinkSet( + view="microdata", + short_name="microdata", + pattern=re.compile(r"(recs.*public.*)\.csv"), + extension="csv", + ), + "microdata-codebook": LinkSet( + view="microdata", + short_name="microdata", + pattern=re.compile(r"(codebook.*)\.xlsx"), + extension="xlsx", ), "methodology": LinkSet( - url=_url_for(year=2020, view="methodology"), + view="methodology", short_name="methodology", - pattern=re.compile(r"pdf/(.+)\.(pdf)"), + pattern=re.compile(r"/consumption/residential/reports/2015/(.+)(\.php)?"), + extension="html", + skip_if_html=False, ), - } + }, } logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -77,13 +124,13 @@ class EiaRECSArchiver(AbstractDatasetArchiver): async def get_resources(self) -> ArchiveAwaitable: """Download EIA-RECS resources.""" - for year in [2020]: + for year in [2020, 2015]: yield self.get_year_resources(year) def __is_html_file(self, fileobj: BytesIO) -> bool: header = fileobj.read(30).lower().strip() fileobj.seek(0) - return b"" in header + return b" list[ResourceInfo]: """Download all excel tables for a year.""" @@ -94,21 +141,25 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: # Loop through different categories of data (all .xlsx) link_sets = YEAR_LINK_SETS[year] for link_set in link_sets.values(): - for table_link in await self.get_hyperlinks(link_set.url, link_set.pattern): - table_link = urljoin(link_set.url, table_link) + url = _url_for(year, link_set.view) + for table_link in await self.get_hyperlinks(url, link_set.pattern): + table_link = urljoin(url, table_link).strip("/") logger.info(f"Fetching {table_link}") match = link_set.pattern.search(table_link) - matched_metadata = ( - match.group(1).replace(".", "-").replace(" ", "_").lower() + matched_filename = ( + match.group(1) + .replace(".", "-") + .replace(" ", "_") + .replace("/", "-") + .lower() ) - matched_format = match.group(2) - output_filename = f"eia-recs-{year}-{link_set.short_name}-{matched_metadata}.{matched_format}" + output_filename = f"eia-recs-{year}-{link_set.short_name}-{matched_filename}.{link_set.extension}" # Download file download_path = self.download_directory / output_filename await self.download_file(table_link, download_path) with download_path.open("rb") as f: - if self.__is_html_file(f): + if link_set.skip_if_html and self.__is_html_file(f): continue self.add_to_archive( zip_path=zip_path, From ed70331dd835564211fb1533707932355aabcf82 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Wed, 29 Jan 2025 19:14:20 -0500 Subject: [PATCH 9/9] feat: add 2009 and historical 457 forms --- src/pudl_archiver/archivers/eia/eiarecs.py | 88 +++++++++++++++++----- 1 file changed, 70 insertions(+), 18 deletions(-) diff --git a/src/pudl_archiver/archivers/eia/eiarecs.py b/src/pudl_archiver/archivers/eia/eiarecs.py index 8f537d85..c57fb267 100644 --- a/src/pudl_archiver/archivers/eia/eiarecs.py +++ b/src/pudl_archiver/archivers/eia/eiarecs.py @@ -13,6 +13,8 @@ ) from pudl_archiver.frictionless import ZipLayout +logger = logging.getLogger(f"catalystcoop.{__name__}") + @dataclass class LinkSet: @@ -21,7 +23,7 @@ class LinkSet: See https://www.eia.gov/consumption/residential/data/2020/. """ - view: str + url: str short_name: str extension: str pattern: re.Pattern @@ -35,86 +37,137 @@ def _url_for(year: int, view: str): ) +# Each year, each tab's format changes. Rather than have complicated regexes that capture everything, just have lots of simple regexes YEAR_LINK_SETS = { 2020: { "housing_characteristics": LinkSet( - view="characteristics", + url=_url_for(year=2020, view="characteristics"), short_name="hc", pattern=re.compile(r"HC (\d{1,2}\.\d{1,2})\.xlsx"), extension="xlsx", ), "consumption & expenditures": LinkSet( - view="consumption", + url=_url_for(year=2020, view="consumption"), short_name="ce", pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.xlsx"), extension="xlsx", ), "state data (housing characteristics)": LinkSet( - view="state", + url=_url_for(year=2020, view="state"), short_name="state-hc", pattern=re.compile(r"State (.*)\.xlsx"), extension="xlsx", ), "state data (consumption & expenditures)": LinkSet( - view="state", + url=_url_for(year=2020, view="state"), short_name="state-ce", pattern=re.compile(r"ce(\d\.\d{1,2}\..*)\.xlsx"), extension="xlsx", ), "microdata": LinkSet( - view="microdata", + url=_url_for(year=2020, view="microdata"), short_name="microdata", pattern=re.compile(r"(recs.*public.*)\.csv"), extension="csv", ), "microdata-codebook": LinkSet( - view="microdata", + url=_url_for(year=2020, view="microdata"), short_name="microdata", pattern=re.compile(r"(RECS 2020 Codebook.*v.)\.xlsx"), extension="xlsx", ), "methodology": LinkSet( - view="methodology", + url=_url_for(year=2020, view="methodology"), short_name="methodology", pattern=re.compile(r"pdf/(.+)\.pdf"), extension="pdf", ), + "methodology-forms": LinkSet( + url="https://www.eia.gov/survey/#eia-457", + short_name="methodology", + pattern=re.compile(r"eia_457/archive/2020_(.+)\.pdf"), + extension="pdf", + ), }, 2015: { "housing_characteristics": LinkSet( - view="characteristics", + url=_url_for(year=2015, view="characteristics"), short_name="hc", pattern=re.compile(r"hc(\d{1,2}\.\d{1,2})\.xlsx"), extension="xlsx", ), "consumption & expenditures": LinkSet( - view="consumption", + url=_url_for(year=2015, view="consumption"), short_name="ce", pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.xlsx"), extension="xlsx", ), "microdata": LinkSet( - view="microdata", + url=_url_for(year=2015, view="microdata"), short_name="microdata", pattern=re.compile(r"(recs.*public.*)\.csv"), extension="csv", ), "microdata-codebook": LinkSet( - view="microdata", + url=_url_for(year=2015, view="microdata"), + short_name="microdata", + pattern=re.compile(r"(codebook.*)\.xlsx"), + extension="xlsx", + ), + "methodology": LinkSet( + url=_url_for(year=2015, view="methodology"), + short_name="methodology", + pattern=re.compile(r"/consumption/residential/reports/2015/(.+)(\.php)?"), + extension="html", + skip_if_html=False, + ), + "methodology-forms": LinkSet( + url="https://www.eia.gov/survey/#eia-457", + short_name="methodology", + pattern=re.compile(r"eia_457/archive/2015_(.+)\.pdf"), + extension="pdf", + ), + }, + 2009: { + "housing_characteristics": LinkSet( + url=_url_for(year=2009, view="characteristics"), + short_name="hc", + pattern=re.compile(r"hc(\d{1,2}\.\d{1,2})\.xlsx"), + extension="xlsx", + ), + "consumption & expenditures": LinkSet( + url=_url_for(year=2009, view="consumption"), + short_name="ce", + pattern=re.compile(r"ce(\d\.\d{1,2}[a-z]?)\.xlsx"), + extension="xlsx", + ), + "microdata": LinkSet( + url=_url_for(year=2009, view="microdata"), + short_name="microdata", + pattern=re.compile(r"csv/(.*)\.csv"), + extension="csv", + ), + "microdata-codebook": LinkSet( + url=_url_for(year=2009, view="microdata"), short_name="microdata", pattern=re.compile(r"(codebook.*)\.xlsx"), extension="xlsx", ), "methodology": LinkSet( - view="methodology", + url=_url_for(year=2009, view="methodology"), short_name="methodology", pattern=re.compile(r"/consumption/residential/reports/2015/(.+)(\.php)?"), extension="html", skip_if_html=False, ), + "methodology-forms": LinkSet( + url="https://www.eia.gov/survey/#eia-457", + short_name="methodology", + pattern=re.compile(r"eia_457/archive/2009 (.+)\.pdf"), + extension="pdf", + ), }, } -logger = logging.getLogger(f"catalystcoop.{__name__}") class EiaRECSArchiver(AbstractDatasetArchiver): @@ -124,7 +177,7 @@ class EiaRECSArchiver(AbstractDatasetArchiver): async def get_resources(self) -> ArchiveAwaitable: """Download EIA-RECS resources.""" - for year in [2020, 2015]: + for year in [2020, 2015, 2009]: yield self.get_year_resources(year) def __is_html_file(self, fileobj: BytesIO) -> bool: @@ -141,9 +194,8 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]: # Loop through different categories of data (all .xlsx) link_sets = YEAR_LINK_SETS[year] for link_set in link_sets.values(): - url = _url_for(year, link_set.view) - for table_link in await self.get_hyperlinks(url, link_set.pattern): - table_link = urljoin(url, table_link).strip("/") + for table_link in await self.get_hyperlinks(link_set.url, link_set.pattern): + table_link = urljoin(link_set.url, table_link).strip("/") logger.info(f"Fetching {table_link}") match = link_set.pattern.search(table_link) matched_filename = (