From 9504debede7224eb35c313b6ac07b8d5c4a889cd Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis <1158666+krivard@users.noreply.github.com>
Date: Wed, 22 Jan 2025 16:18:06 -0500
Subject: [PATCH 01/19] [wip] feat: permit get_hyperlinks to accept a
 'headers' argument that gets passed thru to session.get

---
 src/pudl_archiver/archivers/classes.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/pudl_archiver/archivers/classes.py b/src/pudl_archiver/archivers/classes.py
index 64006546..299e8c96 100644
--- a/src/pudl_archiver/archivers/classes.py
+++ b/src/pudl_archiver/archivers/classes.py
@@ -223,6 +223,7 @@ async def get_hyperlinks(
         url: str,
         filter_pattern: typing.Pattern | None = None,
         verify: bool = True,
+        headers: dict | None = None,
     ) -> list[str]:
         """Return all hyperlinks from a specific web page.
 
@@ -240,14 +241,16 @@ async def get_hyperlinks(
         parser = _HyperlinkExtractor()
 
         response = await retry_async(
-            self.session.get, args=[url], kwargs={"ssl": verify}
+            self.session.get, args=[url], kwargs={"ssl": verify,
+                **({"headers":headers} if headers is not None else {})}
         )
         text = await retry_async(response.text)
         parser.feed(text)
 
         # Filter to those that match filter_pattern
         hyperlinks = parser.hyperlinks
-        if filter_pattern:
+        if filter_pattern is not None:
+            self.logger.info(f"Filtering using {filter_pattern}")
             hyperlinks = {link for link in hyperlinks if filter_pattern.search(link)}
 
         # Warn if no links are found

From 7fe9ce47ea8050fc5d4e5e745fb67827a872426b Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis <1158666+krivard@users.noreply.github.com>
Date: Wed, 22 Jan 2025 16:20:11 -0500
Subject: [PATCH 02/19] [wip] feat: add new archiver for DOE LEAD

Todo:
- Crawl resulting oie link to fetch state-year links
- Download state-year links and zip into year partitions
- Return ResourceInfo
---
 src/pudl_archiver/archivers/doelead.py | 47 ++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 src/pudl_archiver/archivers/doelead.py

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
new file mode 100644
index 00000000..65d01f69
--- /dev/null
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -0,0 +1,47 @@
+"""Download DOE LEAD data."""
+
+import re
+
+from pudl_archiver.archivers.classes import (
+    AbstractDatasetArchiver,
+    ArchiveAwaitable,
+    ResourceInfo,
+)
+
+BASE_URL = "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool"
+
+# verified working 2025-01-22 via
+# $ wget "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool" -O foo.html -U "Mozilla/5.0 Catalyst/2025 Cooperative/2025"
+HEADERS = {"User-Agent":"Mozilla/5.0 Catalyst/2025 Cooperative/2025"}
+
+class DoeLeadArchiver(AbstractDatasetArchiver):
+    """DOE LEAD archiver."""
+
+    name = "doelead"
+
+    async def get_resources(self) -> ArchiveAwaitable:
+        """Download DOE LEAD resources."""
+        # https://data.openei.org/submissions/6219
+        link_pattern = re.compile(r"data.openei.org")
+        for link in await self.get_hyperlinks(BASE_URL, link_pattern, headers=HEADERS):
+            matches = link_pattern.search(link)
+            if not matches:
+                continue
+            self.logger.info(f"LINK: {link}")
+            if False:
+                yield self.get_year_resource()
+        self.logger.info("ALL DONE")
+#            yield self.get_year_resource()
+#            year = int(matches.group(1))
+#            if self.valid_year(year):
+#                yield self.get_year_resource(link, year)
+
+    async def get_year_resource(self) -> ResourceInfo:
+        """Download zip file."""
+        # Append hyperlink to base URL to get URL of file
+        return ResourceInfo(local_path=self.download_directory / "foo", partitions={})
+#        url = f"{BASE_URL}/{link}"
+#        download_path = self.download_directory / f"eia860-{year}.zip"
+#        await self.download_zipfile(url, download_path)
+#
+#        return ResourceInfo(local_path=download_path, partitions={"year": year})

From c5ba88ef64a9a0886e7ba02f3904186fa427fb91 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 22 Jan 2025 21:35:14 +0000
Subject: [PATCH 03/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci
---
 src/pudl_archiver/archivers/classes.py |  8 ++++++--
 src/pudl_archiver/archivers/doelead.py | 18 +++++++++++-------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/pudl_archiver/archivers/classes.py b/src/pudl_archiver/archivers/classes.py
index 299e8c96..c2346b28 100644
--- a/src/pudl_archiver/archivers/classes.py
+++ b/src/pudl_archiver/archivers/classes.py
@@ -241,8 +241,12 @@ async def get_hyperlinks(
         parser = _HyperlinkExtractor()
 
         response = await retry_async(
-            self.session.get, args=[url], kwargs={"ssl": verify,
-                **({"headers":headers} if headers is not None else {})}
+            self.session.get,
+            args=[url],
+            kwargs={
+                "ssl": verify,
+                **({"headers": headers} if headers is not None else {}),
+            },
         )
         text = await retry_async(response.text)
         parser.feed(text)
diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index 65d01f69..bc2cf15e 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -12,7 +12,8 @@
 
 # verified working 2025-01-22 via
 # $ wget "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool" -O foo.html -U "Mozilla/5.0 Catalyst/2025 Cooperative/2025"
-HEADERS = {"User-Agent":"Mozilla/5.0 Catalyst/2025 Cooperative/2025"}
+HEADERS = {"User-Agent": "Mozilla/5.0 Catalyst/2025 Cooperative/2025"}
+
 
 class DoeLeadArchiver(AbstractDatasetArchiver):
     """DOE LEAD archiver."""
@@ -29,19 +30,22 @@ async def get_resources(self) -> ArchiveAwaitable:
             continue
             self.logger.info(f"LINK: {link}")
             if False:
-                yield self.get_year_resource()
+                yield self.get_year_resource()
         self.logger.info("ALL DONE")
-#            yield self.get_year_resource()
-#            year = int(matches.group(1))
-#            if self.valid_year(year):
-#                yield self.get_year_resource(link, year)
+
+    #            yield self.get_year_resource()
+    #            year = int(matches.group(1))
+    #            if self.valid_year(year):
+    #                yield self.get_year_resource(link, year)
 
     async def get_year_resource(self) -> ResourceInfo:
         """Download zip file."""
         # Append hyperlink to base URL to get URL of file
         return ResourceInfo(local_path=self.download_directory / "foo", partitions={})
+
+
 # url = f"{BASE_URL}/{link}"
 # download_path = self.download_directory / f"eia860-{year}.zip"
 # await self.download_zipfile(url, download_path)
-#
+#
 # return ResourceInfo(local_path=download_path, partitions={"year": year})

From fe0d44e2e63712ebc2eaa5f5e97dc60f7c8e7fb2 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis <1158666+krivard@users.noreply.github.com>
Date: Wed, 22 Jan 2025 17:36:23 -0500
Subject: [PATCH 04/19] the rest of the owl

---
 src/pudl_archiver/archivers/classes.py |  8 +--
 src/pudl_archiver/archivers/doelead.py | 68 ++++++++++++++++++--------
 2 files changed, 51 insertions(+), 25 deletions(-)

diff --git a/src/pudl_archiver/archivers/classes.py b/src/pudl_archiver/archivers/classes.py
index 299e8c96..a93e5bbe 100644
--- a/src/pudl_archiver/archivers/classes.py
+++ b/src/pudl_archiver/archivers/classes.py
@@ -236,6 +236,7 @@ async def get_hyperlinks(
             url: URL of web page.
             filter_pattern: If present, only return links that contain pattern.
             verify: Verify ssl certificate (EPACEMS https source has bad certificate).
+            headers: Additional headers to send in the GET request.
         """
         # Parse web page to get all hyperlinks
         parser = _HyperlinkExtractor()
@@ -249,15 +250,14 @@ async def get_hyperlinks(
 
         # Filter to those that match filter_pattern
         hyperlinks = parser.hyperlinks
-        if filter_pattern is not None:
-            self.logger.info(f"Filtering using {filter_pattern}")
+        if filter_pattern:
             hyperlinks = {link for link in hyperlinks if filter_pattern.search(link)}
 
         # Warn if no links are found
         if not hyperlinks:
             self.logger.warning(
-                f"The archiver couldn't find any hyperlinks that match {filter_pattern}."
-                f"Make sure your filter_pattern is correct or if the structure of the {url} page changed."
+                f"The archiver couldn't find any hyperlinks{('that match' + filter_pattern) if filter_pattern else ''}."
+                f"Make sure your filter_pattern is correct, check if the structure of the {url} page changed, or if you are missing HTTP headers."
             )
 
         return hyperlinks
diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index 65d01f69..40605cba 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -21,27 +21,53 @@ class DoeLeadArchiver(AbstractDatasetArchiver):
 
     async def get_resources(self) -> ArchiveAwaitable:
         """Download DOE LEAD resources."""
-        # https://data.openei.org/submissions/6219
-        link_pattern = re.compile(r"data.openei.org")
-        for link in await self.get_hyperlinks(BASE_URL, link_pattern, headers=HEADERS):
-            matches = link_pattern.search(link)
-            if not matches:
-                continue
-            self.logger.info(f"LINK: {link}")
-            if False:
-                yield self.get_year_resource()
+        # e.g.: https://data.openei.org/submissions/6219
+        oei_link_pattern = re.compile(r"data\.openei\.org/submissions")
+        # e.g.: https://data.openei.org/files/6219/DC-2022-LEAD-data.zip
+        #       https://data.openei.org/files/6219/Data%20Dictionary%202022.xlsx
+        #       https://data.openei.org/files/6219/LEAD%20Tool%20States%20List%202022.xlsx
+        data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data.zip|.xlsx))")
+        for oei_link in await self.get_hyperlinks(BASE_URL, oei_link_pattern, headers=HEADERS):
+            self.logger.info(f"LEAD tool raw dataset: {oei_link}")
+            year_links = {}
+            oei_year = -1
+            for data_link in await self.get_hyperlinks(oei_link, data_link_pattern):
+                matches = data_link_pattern.search(data_link)
+                if not matches:
+                    continue
+                link_year = int(matches.group(2))
+                if oei_year < 0:
+                    oei_year = link_year
+                else:
+                    if oei_year != link_year:
+                        self.logger.warning(f"Mixed years found at {oei_link}: {oei_year}, {link_year} from {data_link}")
+                self.logger.debug(f"OEI data: {data_link}")
+                year_links[matches.group(1)] = data_link
+            if year_links:
+                self.logger.info(f"Downloading: {oei_year}, {len(year_links)} items")
+                yield self.get_year_resource(year_links, oei_year)
         self.logger.info("ALL DONE")
 
-    async def get_year_resource(self) -> ResourceInfo:
+    async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo:
         """Download zip file."""
-        # Append hyperlink to base URL to get URL of file
-        return ResourceInfo(local_path=self.download_directory / "foo", partitions={})
-#        url = f"{BASE_URL}/{link}"
-#        download_path = self.download_directory / f"eia860-{year}.zip"
-#        await self.download_zipfile(url, download_path)
-#
-#        return ResourceInfo(local_path=download_path, partitions={"year": year})
+        host = "https://data.openei.org"
+        zip_path = self.download_directory / f"doelead-{year}.zip"
+        data_paths_in_archive = set()
+        for filename, link in sorted(links.items()):
+            self.logger.info(f"Downloading {link}")
+            download_path = self.download_directory / filename
+            await self.download_file(f"{host}{link}", download_path)
+            self.add_to_archive(
+                zip_path=zip_path,
+                filename=filename,
+                blob=download_path.open("rb"),
+            )
+            data_paths_in_archive.add(filename)
+            # Don't want to leave multiple giant files on disk, so delete
+            # immediately after they're safely stored in the ZIP
+            download_path.unlink()
+        return ResourceInfo(
+            local_path=zip_path,
+            partitions={"year": year},
+            layout=ZipLayout(file_paths=data_paths_in_archive)
+        )

From 11704765caa72b4efbf723b465f4c0c4e28442c6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 22 Jan 2025 22:42:16 +0000
Subject: [PATCH 05/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci
---
 src/pudl_archiver/archivers/doelead.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index c1b47847..4bcb7359 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -28,7 +28,9 @@ async def get_resources(self) -> ArchiveAwaitable:
         #       https://data.openei.org/files/6219/Data%20Dictionary%202022.xlsx
         #       https://data.openei.org/files/6219/LEAD%20Tool%20States%20List%202022.xlsx
         data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data.zip|.xlsx))")
-        for oei_link in await self.get_hyperlinks(BASE_URL, oei_link_pattern, headers=HEADERS):
+        for oei_link in await self.get_hyperlinks(
+            BASE_URL, oei_link_pattern, headers=HEADERS
+        ):
             self.logger.info(f"LEAD tool raw dataset: {oei_link}")
             year_links = {}
             oei_year = -1
@@ -41,7 +43,9 @@ async def get_resources(self) -> ArchiveAwaitable:
                     oei_year = link_year
                 else:
                     if oei_year != link_year:
-                        self.logger.warning(f"Mixed years found at {oei_link}: {oei_year}, {link_year} from {data_link}")
+                        self.logger.warning(
+                            f"Mixed years found at {oei_link}: {oei_year}, {link_year} from {data_link}"
+                        )
                 self.logger.debug(f"OEI data: {data_link}")
                 year_links[matches.group(1)] = data_link
             if year_links:
@@ -70,5 +74,5 @@ async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceI
     return ResourceInfo(
         local_path=zip_path,
         partitions={"year": year},
-        layout=ZipLayout(file_paths=data_paths_in_archive)
+        layout=ZipLayout(file_paths=data_paths_in_archive),
     )

From 51667b24114d2fdaeb44d7f7911c4b1e26766a40 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis <1158666+krivard@users.noreply.github.com>
Date: Thu, 23 Jan 2025 16:10:55 -0500
Subject: [PATCH 06/19] [fix] Add missing import

---
 src/pudl_archiver/archivers/doelead.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index c1b47847..7f1b93a8 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -7,6 +7,7 @@
     ArchiveAwaitable,
     ResourceInfo,
 )
+from pudl_archiver.frictionless import ZipLayout
 
 BASE_URL = "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool"
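Note on the 'headers' passthrough added in PATCH 01 and reformatted by pre-commit in PATCH 03: the conditional dict unpacking only includes a "headers" key in the kwargs when the caller actually supplied one, so session.get keeps its session-level default headers otherwise. A minimal standalone sketch of the same idiom (the helper name is illustrative, not part of the codebase):

    def build_get_kwargs(verify: bool, headers: dict | None = None) -> dict:
        # "headers" only appears when explicitly provided; the empty dict
        # unpacks to nothing, leaving aiohttp's defaults untouched.
        return {"ssl": verify, **({"headers": headers} if headers is not None else {})}

    assert build_get_kwargs(True) == {"ssl": True}
    assert build_get_kwargs(True, {"User-Agent": "Mozilla/5.0"}) == {
        "ssl": True,
        "headers": {"User-Agent": "Mozilla/5.0"},
    }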
From 020b3cdd4eb0557a61af926ccbc00ac02bc5f17c Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis <1158666+krivard@users.noreply.github.com>
Date: Thu, 23 Jan 2025 16:22:39 -0500
Subject: [PATCH 07/19] [docs] Add more detail to doelead docstring

---
 src/pudl_archiver/archivers/doelead.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index b1b854b4..76dbb79b 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -1,4 +1,25 @@
-"""Download DOE LEAD data."""
+"""Download DOE LEAD data.
+
+Each partition includes:
+- Data Dictionary
+- Census Tracts List
+- Cities List
+- Counties List
+- States List
+- Tribal Areas List
+- Cities Census Tract Overlaps
+- Tribal Areas Tract Overlaps
+- One .zip file per state, each of which includes:
+    - AMI Census Tracts
+    - SMI Census Tracts
+    - LLSI Census Tracts
+    - FPL Census Tracts
+    - LLSI Counties
+    - SMI Counties
+    - FPL Counties
+    - AMI Counties
+"""
+
 
 import re
 

From 3583b4e587bae4ed9288785d58eb1332b25e29f7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 23 Jan 2025 21:23:48 +0000
Subject: [PATCH 08/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci
---
 src/pudl_archiver/archivers/doelead.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index 76dbb79b..88467682 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -20,7 +20,6 @@
 - AMI Counties
 """
 
-
 import re
 
 from pudl_archiver.archivers.classes import (

From de0ba16da3406b3493633843908e748eac434c7a Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis <1158666+krivard@users.noreply.github.com>
Date: Fri, 24 Jan 2025 16:58:20 -0500
Subject: [PATCH 09/19] [fix] switch to hard-coded DOIs for known releases,
 check LEAD Tool page for new releases, add better docstrings

---
 src/pudl_archiver/archivers/doelead.py | 83 ++++++++++++++++++--------
 1 file changed, 59 insertions(+), 24 deletions(-)

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index 88467682..a1270d7a 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -29,7 +29,11 @@
 )
 from pudl_archiver.frictionless import ZipLayout
 
-BASE_URL = "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool"
+TOOL_URL = "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool"
+YEARS_DOIS = {
+    2022: "https://doi.org/10.25984/2504170",
+    2018: "https://doi.org/10.25984/1784729",
+}
 
 # verified working 2025-01-22 via
 # $ wget "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool" -O foo.html -U "Mozilla/5.0 Catalyst/2025 Cooperative/2025"
@@ -42,40 +46,71 @@ class DoeLeadArchiver(AbstractDatasetArchiver):
     name = "doelead"
 
     async def get_resources(self) -> ArchiveAwaitable:
-        """Download DOE LEAD resources."""
+        """Download DOE LEAD resources.
+
+        The DOE LEAD Tool doesn't provide direct access to the raw data, but instead links to the current raw data release hosted on OEDI. It does not provide links to past data releases. So, we hard-code the DOIs for all known releases, archive those, but also check the DOE LEAD Tool page to see if there's a new release we don't know about yet.
+        """
         # e.g.: https://data.openei.org/submissions/6219
-        oei_link_pattern = re.compile(r"data\.openei\.org/submissions")
+        currentrelease_link_pattern = re.compile(r"data\.openei\.org/submissions")
+        """Regex for matching the current raw data release on the DOE LEAD Tool page"""
+
+        doi_link_pattern = re.compile(r"https://doi.org")
+        """Regex for matching the DOI of the OEDI submission"""
+
         # e.g.: https://data.openei.org/files/6219/DC-2022-LEAD-data.zip
         #       https://data.openei.org/files/6219/Data%20Dictionary%202022.xlsx
         #       https://data.openei.org/files/6219/LEAD%20Tool%20States%20List%202022.xlsx
         data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data.zip|.xlsx))")
-        for oei_link in await self.get_hyperlinks(
-            BASE_URL, oei_link_pattern, headers=HEADERS
-        ):
-            self.logger.info(f"LEAD tool raw dataset: {oei_link}")
-            year_links = {}
-            oei_year = -1
-            for data_link in await self.get_hyperlinks(oei_link, data_link_pattern):
+        """Regex for matching the data files in a release on the OEDI page. Captures the year, and supports both .zip and .xlsx file names."""
+
+        currentrelease_link = await self.get_hyperlinks(
+            TOOL_URL, currentrelease_link_pattern, headers=HEADERS
+        )
+        if len(currentrelease_link) != 1:
+            raise AssertionError(
+                f"We expect exactly one outgoing link to data.openei.org/submissions at {BASE_URL}, but we found: {currentrelease_link}"
+            )
+        currentrelease_link = currentrelease_link.pop()
+        currentrelease_doi = await self.get_hyperlinks(currentrelease_link, doi_link_pattern)
+        if len(currentrelease_doi) != 1:
+            raise AssertionError(
+                f"We expect exactly one DOI link at {currentrelease_link}, but we found: {currentrelease_doi}"
+            )
+        currentrelease_doi = currentrelease_doi.pop()
+
+        currentrelease_found = False
+        for year, doi in YEARS_DOIS.items():
+            self.logger.info(f"Processing DOE LEAD raw data release for {year}: {doi}")
+            if doi == currentrelease_doi:
+                currentrelease_found = True
+            filenames_links = {}
+            for data_link in await self.get_hyperlinks(doi, data_link_pattern):
                 matches = data_link_pattern.search(data_link)
                 if not matches:
                     continue
                 link_year = int(matches.group(2))
-                if oei_year < 0:
-                    oei_year = link_year
-                else:
-                    if oei_year != link_year:
-                        self.logger.warning(
-                            f"Mixed years found at {oei_link}: {oei_year}, {link_year} from {data_link}"
-                        )
-                self.logger.debug(f"OEI data: {data_link}")
-                year_links[matches.group(1)] = data_link
-            if year_links:
-                self.logger.info(f"Downloading: {oei_year}, {len(year_links)} items")
-                yield self.get_year_resource(year_links, oei_year)
-        self.logger.info("ALL DONE")
+                if link_year != year:
+                    raise AssertionError(
+                        f"We expect all files at {doi} to be for {year}, but we found: {link_year} from {data_link}"
+                    )
+                filenames_links[matches.group(1)] = data_link
+            if filenames_links:
+                self.logger.info(f"Downloading: {year}, {len(filenames_links)} items")
+                yield self.get_year_resource(filenames_links, year)
+        if not currentrelease_found:
+            raise AssertionError(
+                f"New DOE LEAD raw data release detected at {currentrelease_doi}. Update the archiver to process it."
+            )
 
     async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo:
-        """Download zip file."""
+        """Download all available data for a year.
+
+        Resulting resource contains one zip file of CSVs per state/territory, plus a handful of .xlsx dictionary and geocoding files.
+
+        Args:
+            links: filename->URL mapping for files to download
+            year: the year we're downloading data for
+        """
         host = "https://data.openei.org"
         zip_path = self.download_directory / f"doelead-{year}.zip"
         data_paths_in_archive = set()

From 96b064d2977768bf12b10cdc667ac293a4ae47e7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 Jan 2025 21:58:45 +0000
Subject: [PATCH 10/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci
---
 src/pudl_archiver/archivers/doelead.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index a1270d7a..eada09f8 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -47,22 +47,22 @@ class DoeLeadArchiver(AbstractDatasetArchiver):
 
     async def get_resources(self) -> ArchiveAwaitable:
         """Download DOE LEAD resources.
-
+
         The DOE LEAD Tool doesn't provide direct access to the raw data, but instead links to the current raw data release hosted on OEDI. It does not provide links to past data releases. So, we hard-code the DOIs for all known releases, archive those, but also check the DOE LEAD Tool page to see if there's a new release we don't know about yet.
         """
         # e.g.: https://data.openei.org/submissions/6219
         currentrelease_link_pattern = re.compile(r"data\.openei\.org/submissions")
         """Regex for matching the current raw data release on the DOE LEAD Tool page"""
-
+
         doi_link_pattern = re.compile(r"https://doi.org")
         """Regex for matching the DOI of the OEDI submission"""
-
+
         # e.g.: https://data.openei.org/files/6219/DC-2022-LEAD-data.zip
         #       https://data.openei.org/files/6219/Data%20Dictionary%202022.xlsx
         #       https://data.openei.org/files/6219/LEAD%20Tool%20States%20List%202022.xlsx
         data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data.zip|.xlsx))")
         """Regex for matching the data files in a release on the OEDI page. Captures the year, and supports both .zip and .xlsx file names."""
-
+
         currentrelease_link = await self.get_hyperlinks(
             TOOL_URL, currentrelease_link_pattern, headers=HEADERS
         )
@@ -71,13 +71,15 @@ async def get_resources(self) -> ArchiveAwaitable:
         currentrelease_link = currentrelease_link.pop()
-        currentrelease_doi = await self.get_hyperlinks(currentrelease_link, doi_link_pattern)
+        currentrelease_doi = await self.get_hyperlinks(
+            currentrelease_link, doi_link_pattern
+        )
         if len(currentrelease_doi) != 1:
             raise AssertionError(
                 f"We expect exactly one DOI link at {currentrelease_link}, but we found: {currentrelease_doi}"
             )
         currentrelease_doi = currentrelease_doi.pop()
-
+
         currentrelease_found = False
@@ -104,9 +106,9 @@ async def get_resources(self) -> ArchiveAwaitable:
 
     async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo:
         """Download all available data for a year.
-
+
         Resulting resource contains one zip file of CSVs per state/territory, plus a handful of .xlsx dictionary and geocoding files.
-
+
         Args:
             links: filename->URL mapping for files to download
             year: the year we're downloading data for
         """
         host = "https://data.openei.org"
         zip_path = self.download_directory / f"doelead-{year}.zip"
         data_paths_in_archive = set()

From e46f6d9321580da5034adee0b401c91346a951e5 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis <1158666+krivard@users.noreply.github.com>
Date: Fri, 24 Jan 2025 17:03:58 -0500
Subject: [PATCH 11/19] [fix] missing refactor in fstring

---
 src/pudl_archiver/archivers/doelead.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index a1270d7a..53c8a895 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -68,7 +68,7 @@ async def get_resources(self) -> ArchiveAwaitable:
         )
         if len(currentrelease_link) != 1:
             raise AssertionError(
-                f"We expect exactly one outgoing link to data.openei.org/submissions at {BASE_URL}, but we found: {currentrelease_link}"
+                f"We expect exactly one outgoing link to data.openei.org/submissions at {TOOL_URL}, but we found: {currentrelease_link}"
             )
         currentrelease_link = currentrelease_link.pop()
         currentrelease_doi = await self.get_hyperlinks(currentrelease_link, doi_link_pattern)

From ac73e65879324af0270d4259c9c857e4cf2bc8fc Mon Sep 17 00:00:00 2001
From: e-belfer
Date: Tue, 28 Jan 2025 15:51:44 -0500
Subject: [PATCH 12/19] Drop site that no longer exists, fix class

---
 src/pudl_archiver/archivers/classes.py |  2 +-
 src/pudl_archiver/archivers/doelead.py | 42 ++++++--------------
 2 files changed, 10 insertions(+), 34 deletions(-)

diff --git a/src/pudl_archiver/archivers/classes.py b/src/pudl_archiver/archivers/classes.py
index 9f20d2d7..4d309ea1 100644
--- a/src/pudl_archiver/archivers/classes.py
+++ b/src/pudl_archiver/archivers/classes.py
@@ -260,7 +260,7 @@ async def get_hyperlinks(
         # Warn if no links are found
         if not hyperlinks:
             self.logger.warning(
-                f"The archiver couldn't find any hyperlinks{('that match' + filter_pattern) if filter_pattern else ''}."
+                f"The archiver couldn't find any hyperlinks{(' that match: ' + filter_pattern.pattern) if filter_pattern else ''}. "
                 f"Make sure your filter_pattern is correct, check if the structure of the {url} page changed, or if you are missing HTTP headers."
             )
 
diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index 4f86d37c..3bfcb8eb 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -29,7 +29,9 @@
 )
 from pudl_archiver.frictionless import ZipLayout
 
-TOOL_URL = "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool"
+# This site is no longer online as of 01/28/2025.
+# TOOL_URL = "https://www.energy.gov/scep/low-income-energy-affordability-data-lead-tool"
+
 YEARS_DOIS = {
     2022: "https://doi.org/10.25984/2504170",
     2018: "https://doi.org/10.25984/1784729",
@@ -48,43 +50,21 @@ class DoeLeadArchiver(AbstractDatasetArchiver):
     async def get_resources(self) -> ArchiveAwaitable:
         """Download DOE LEAD resources.
 
-        The DOE LEAD Tool doesn't provide direct access to the raw data, but instead links to the current raw data release hosted on OEDI. It does not provide links to past data releases. So, we hard-code the DOIs for all known releases, archive those, but also check the DOE LEAD Tool page to see if there's a new release we don't know about yet.
+        The DOE LEAD Tool is down as of 01/28/2025. It didn't provide direct access
+        to the raw data, but instead linked to the current raw data release hosted on
+        OEDI. It did not provide links to past data releases. So, we hard-code the
+        DOIs for all known releases and archive those. Based on the removal of the main
+        page, it's safe to assume this won't be updated any time soon. If it is, we'll
+        need to manually update the DOIs.
         """
-        # e.g.: https://data.openei.org/submissions/6219
-        currentrelease_link_pattern = re.compile(r"data\.openei\.org/submissions")
-        """Regex for matching the current raw data release on the DOE LEAD Tool page"""
-
-        doi_link_pattern = re.compile(r"https://doi.org")
-        """Regex for matching the DOI of the OEDI submission"""
-
         # e.g.: https://data.openei.org/files/6219/DC-2022-LEAD-data.zip
         #       https://data.openei.org/files/6219/Data%20Dictionary%202022.xlsx
         #       https://data.openei.org/files/6219/LEAD%20Tool%20States%20List%202022.xlsx
         data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data.zip|.xlsx))")
         """Regex for matching the data files in a release on the OEDI page. Captures the year, and supports both .zip and .xlsx file names."""
 
-        currentrelease_link = await self.get_hyperlinks(
-            TOOL_URL, currentrelease_link_pattern, headers=HEADERS
-        )
-        if len(currentrelease_link) != 1:
-            raise AssertionError(
-                f"We expect exactly one outgoing link to data.openei.org/submissions at {TOOL_URL}, but we found: {currentrelease_link}"
-            )
-        currentrelease_link = currentrelease_link.pop()
-        currentrelease_doi = await self.get_hyperlinks(
-            currentrelease_link, doi_link_pattern
-        )
-        if len(currentrelease_doi) != 1:
-            raise AssertionError(
-                f"We expect exactly one DOI link at {currentrelease_link}, but we found: {currentrelease_doi}"
-            )
-        currentrelease_doi = currentrelease_doi.pop()
-
-        currentrelease_found = False
         for year, doi in YEARS_DOIS.items():
             self.logger.info(f"Processing DOE LEAD raw data release for {year}: {doi}")
-            if doi == currentrelease_doi:
-                currentrelease_found = True
             filenames_links = {}
             for data_link in await self.get_hyperlinks(doi, data_link_pattern):
                 matches = data_link_pattern.search(data_link)
                 if not matches:
                     continue
                 link_year = int(matches.group(2))
                 if link_year != year:
                     raise AssertionError(
                         f"We expect all files at {doi} to be for {year}, but we found: {link_year} from {data_link}"
                     )
                 filenames_links[matches.group(1)] = data_link
             if filenames_links:
                 self.logger.info(f"Downloading: {year}, {len(filenames_links)} items")
                 yield self.get_year_resource(filenames_links, year)
-        if not currentrelease_found:
-            raise AssertionError(
-                f"New DOE LEAD raw data release detected at {currentrelease_doi}. Update the archiver to process it."
-            )
 
     async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo:
         """Download all available data for a year.
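The data_link_pattern regex that survives unchanged through PATCH 12 can be sanity-checked against the example URLs quoted in the comments; a standalone sketch (not part of the diff):

    import re

    data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data.zip|.xlsx))")
    examples = [
        "https://data.openei.org/files/6219/DC-2022-LEAD-data.zip",
        "https://data.openei.org/files/6219/Data%20Dictionary%202022.xlsx",
        "https://data.openei.org/files/6219/LEAD%20Tool%20States%20List%202022.xlsx",
    ]
    for url in examples:
        m = data_link_pattern.search(url)
        print(m.group(1), m.group(2))  # filename, four-digit year
    # DC-2022-LEAD-data.zip 2022
    # Data%20Dictionary%202022.xlsx 2022
    # LEAD%20Tool%20States%20List%202022.xlsx 2022

Note the dots in "-LEAD-data.zip" and ".xlsx" are unescaped, so the pattern is slightly looser than the literal extensions; harmless for the URLs OEDI actually serves.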
From 9a2a14903df27d15352520dfef39dd043d49f178 Mon Sep 17 00:00:00 2001
From: e-belfer
Date: Tue, 28 Jan 2025 17:20:36 -0500
Subject: [PATCH 13/19] Download methodology PDFs

---
 src/pudl_archiver/archivers/doelead.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index 3bfcb8eb..caa9901c 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -80,6 +80,14 @@ async def get_resources(self) -> ArchiveAwaitable:
                 self.logger.info(f"Downloading: {year}, {len(filenames_links)} items")
                 yield self.get_year_resource(filenames_links, year)
 
+        # Download LEAD methodology PDF and other metadata separately
+        metadata_links = {
+            "lead-methodology-122024.pdf": "https://www.energy.gov/sites/default/files/2024-12/lead-methodology_122024.pdf",
+            "lead-tool-factsheet-072624.pdf": "https://www.energy.gov/sites/default/files/2024-07/lead-tool-factsheet_072624.pdf",
+        }
+        for filename, link in metadata_links.items():
+            yield self.get_metadata_resource(filename=filename, link=link)
+
     async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo:
         """Download all available data for a year.
 
@@ -110,3 +118,21 @@ async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceI
             partitions={"year": year},
             layout=ZipLayout(file_paths=data_paths_in_archive),
         )
+
+    async def get_metadata_resource(self, filename: str, link: str) -> ResourceInfo:
+        """Download metadata resource.
+
+        Resulting resource contains one PDF file with metadata about the LEAD dataset.
+
+        Args:
+            filename: name to save the downloaded PDF under
+            link: URL of the PDF to download
+        """
+        self.logger.info(f"Downloading {link}")
+        download_path = self.download_directory / filename
+        await self.download_file(link, download_path)
+
+        return ResourceInfo(
+            local_path=download_path,
+            partitions={},
+        )

From 99ca4f4fe3d22e7a22432d1b9552e27a460c9c96 Mon Sep 17 00:00:00 2001
From: e-belfer
Date: Tue, 28 Jan 2025 17:47:09 -0500
Subject: [PATCH 14/19] Add PDF metadata to archive, add placeholder DOI

---
 src/pudl_archiver/archivers/doelead.py | 42 +++++++++----------
 .../package_data/zenodo_doi.yaml       |  2 +
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index caa9901c..dd95542b 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -20,8 +20,6 @@
 - AMI Counties
 """
 
-import re
-
 from pudl_archiver.archivers.classes import (
     AbstractDatasetArchiver,
     ArchiveAwaitable,
@@ -60,25 +58,25 @@ async def get_resources(self) -> ArchiveAwaitable:
         # e.g.: https://data.openei.org/files/6219/DC-2022-LEAD-data.zip
         #       https://data.openei.org/files/6219/Data%20Dictionary%202022.xlsx
         #       https://data.openei.org/files/6219/LEAD%20Tool%20States%20List%202022.xlsx
-        data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data.zip|.xlsx))")
-        """Regex for matching the data files in a release on the OEDI page. Captures the year, and supports both .zip and .xlsx file names."""
-
-        for year, doi in YEARS_DOIS.items():
-            self.logger.info(f"Processing DOE LEAD raw data release for {year}: {doi}")
-            filenames_links = {}
-            for data_link in await self.get_hyperlinks(doi, data_link_pattern):
-                matches = data_link_pattern.search(data_link)
-                if not matches:
-                    continue
-                link_year = int(matches.group(2))
-                if link_year != year:
-                    raise AssertionError(
-                        f"We expect all files at {doi} to be for {year}, but we found: {link_year} from {data_link}"
-                    )
-                filenames_links[matches.group(1)] = data_link
-            if filenames_links:
-                self.logger.info(f"Downloading: {year}, {len(filenames_links)} items")
-                yield self.get_year_resource(filenames_links, year)
+        # data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data.zip|.xlsx))")
+        # """Regex for matching the data files in a release on the OEDI page. Captures the year, and supports both .zip and .xlsx file names."""
+
+        # for year, doi in YEARS_DOIS.items():
+        #     self.logger.info(f"Processing DOE LEAD raw data release for {year}: {doi}")
+        #     filenames_links = {}
+        #     for data_link in await self.get_hyperlinks(doi, data_link_pattern):
+        #         matches = data_link_pattern.search(data_link)
+        #         if not matches:
+        #             continue
+        #         link_year = int(matches.group(2))
+        #         if link_year != year:
+        #             raise AssertionError(
+        #                 f"We expect all files at {doi} to be for {year}, but we found: {link_year} from {data_link}"
+        #             )
+        #         filenames_links[matches.group(1)] = data_link
+        #     if filenames_links:
+        #         self.logger.info(f"Downloading: {year}, {len(filenames_links)} items")
+        #         yield self.get_year_resource(filenames_links, year)
 
         # Download LEAD methodology PDF and other metadata separately
         metadata_links = {
@@ -130,7 +128,7 @@ async def get_metadata_resource(self, filename: str, link: str) -> ResourceInfo:
         """
         self.logger.info(f"Downloading {link}")
         download_path = self.download_directory / filename
-        await self.download_file(link, download_path)
+        await self.download_file(url=link, file_path=download_path, headers=HEADERS)
 
         return ResourceInfo(
             local_path=download_path,
diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml
index 4eb77c28..6ad1645c 100644
--- a/src/pudl_archiver/package_data/zenodo_doi.yaml
+++ b/src/pudl_archiver/package_data/zenodo_doi.yaml
@@ -7,6 +7,8 @@ censuspep:
 doeiraec:
   production_doi: 10.5281/zenodo.14757121
   sandbox_doi: 10.5072/zenodo.157934
+doelead:
+  #production_doi: 10.5281/zenodo.14757121
 eia176:
   production_doi: 10.5281/zenodo.7682357
   sandbox_doi: 10.5072/zenodo.3158

From 283db91632e7cf8cf54b6d5cd2182e25d625ed93 Mon Sep 17 00:00:00 2001
From: e-belfer
Date: Tue, 28 Jan 2025 17:48:48 -0500
Subject: [PATCH 15/19] Restore entire archiving workflow

---
 src/pudl_archiver/archivers/doelead.py | 40 ++++++++++++++------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/src/pudl_archiver/archivers/doelead.py b/src/pudl_archiver/archivers/doelead.py
index dd95542b..d639c198 100644
--- a/src/pudl_archiver/archivers/doelead.py
+++ b/src/pudl_archiver/archivers/doelead.py
@@ -20,6 +20,8 @@
 - AMI Counties
 """
 
+import re
+
 from pudl_archiver.archivers.classes import (
     AbstractDatasetArchiver,
     ArchiveAwaitable,
@@ -58,25 +60,25 @@ async def get_resources(self) -> ArchiveAwaitable:
         # e.g.: https://data.openei.org/files/6219/DC-2022-LEAD-data.zip
         #       https://data.openei.org/files/6219/Data%20Dictionary%202022.xlsx
         #       https://data.openei.org/files/6219/LEAD%20Tool%20States%20List%202022.xlsx
-        # data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data.zip|.xlsx))")
-        # """Regex for matching the data files in a release on the OEDI page. Captures the year, and supports both .zip and .xlsx file names."""
-
-        # for year, doi in YEARS_DOIS.items():
-        #     self.logger.info(f"Processing DOE LEAD raw data release for {year}: {doi}")
-        #     filenames_links = {}
-        #     for data_link in await self.get_hyperlinks(doi, data_link_pattern):
-        #         matches = data_link_pattern.search(data_link)
-        #         if not matches:
-        #             continue
-        #         link_year = int(matches.group(2))
-        #         if link_year != year:
-        #             raise AssertionError(
-        #                 f"We expect all files at {doi} to be for {year}, but we found: {link_year} from {data_link}"
-        #             )
-        #         filenames_links[matches.group(1)] = data_link
-        #     if filenames_links:
-        #         self.logger.info(f"Downloading: {year}, {len(filenames_links)} items")
-        #         yield self.get_year_resource(filenames_links, year)
+        data_link_pattern = re.compile(r"([^/]+(\d{4})(?:-LEAD-data.zip|.xlsx))")
+        """Regex for matching the data files in a release on the OEDI page. Captures the year, and supports both .zip and .xlsx file names."""
+
+        for year, doi in YEARS_DOIS.items():
+            self.logger.info(f"Processing DOE LEAD raw data release for {year}: {doi}")
+            filenames_links = {}
+            for data_link in await self.get_hyperlinks(doi, data_link_pattern):
+                matches = data_link_pattern.search(data_link)
+                if not matches:
+                    continue
+                link_year = int(matches.group(2))
+                if link_year != year:
+                    raise AssertionError(
+                        f"We expect all files at {doi} to be for {year}, but we found: {link_year} from {data_link}"
+                    )
+                filenames_links[matches.group(1)] = data_link
+            if filenames_links:
+                self.logger.info(f"Downloading: {year}, {len(filenames_links)} items")
+                yield self.get_year_resource(filenames_links, year)
 
         # Download LEAD methodology PDF and other metadata separately
         metadata_links = {

From 1f73afb824a158e5cc37ac78f708796fd00deb8e Mon Sep 17 00:00:00 2001
From: e-belfer
Date: Tue, 28 Jan 2025 18:20:18 -0500
Subject: [PATCH 16/19] Update production DOI

---
 src/pudl_archiver/package_data/zenodo_doi.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pudl_archiver/package_data/zenodo_doi.yaml b/src/pudl_archiver/package_data/zenodo_doi.yaml
index 6ad1645c..a8a85a64 100644
--- a/src/pudl_archiver/package_data/zenodo_doi.yaml
+++ b/src/pudl_archiver/package_data/zenodo_doi.yaml
@@ -8,7 +8,8 @@ doeiraec:
   production_doi: 10.5281/zenodo.14757121
   sandbox_doi: 10.5072/zenodo.157934
 doelead:
-  #production_doi: 10.5281/zenodo.14757121
+  production_doi: 10.5281/zenodo.14758684
+  # sandbox_doi: TODO once server 413 error resolves
 eia176:
   production_doi: 10.5281/zenodo.7682357
   sandbox_doi: 10.5072/zenodo.3158

From f87e9f75599ede2b004dbcb7e453681106b24be9 Mon Sep 17 00:00:00 2001
From: e-belfer
Date: Wed, 29 Jan 2025 11:59:15 -0500
Subject: [PATCH 17/19] Add to GHA

---
 .github/workflows/run-archiver.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/run-archiver.yml b/.github/workflows/run-archiver.yml
index a201046a..c0616e78 100644
--- a/.github/workflows/run-archiver.yml
+++ b/.github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
         required: true
         type: string
       create_github_issue:
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
         fail-fast: false
     runs-on: ubuntu-latest
     permissions:

From 285479d82f2e1b9050046bce42be34bc922ba3a7 Mon Sep 17 00:00:00 2001
From: e-belfer
Date: Wed, 29 Jan 2025 12:00:06 -0500
Subject: [PATCH 18/19] Oops also MECS

---
 .github/workflows/run-archiver.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/run-archiver.yml b/.github/workflows/run-archiver.yml
index c0616e78..25cf1e84 100644
--- a/.github/workflows/run-archiver.yml
+++ b/.github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
         required: true
         type: string
       create_github_issue:
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
         fail-fast: false
     runs-on: ubuntu-latest
     permissions:

From b9cbbe5e220b7765986b7792fff5de75fa9db521 Mon Sep 17 00:00:00 2001
From: e-belfer
Date: Wed, 29 Jan 2025 12:01:45 -0500
Subject: [PATCH 19/19] Fix bad merge resolution

---
 .github/workflows/run-archiver.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/run-archiver.yml b/.github/workflows/run-archiver.yml
index 25cf1e84..c1b36888 100644
--- a/.github/workflows/run-archiver.yml
+++ b/.github/workflows/run-archiver.yml
@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"'
         required: true
         type: string
       create_github_issue:
@@ -26,7 +26,7 @@ jobs:
     strategy:
      matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiamecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","vcerare"' )) }}
         fail-fast: false
     runs-on: ubuntu-latest
     permissions:
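The last three patches all edit the same two workflow lines: the datasets input is a comma-separated string of quoted names, and the matrix expands it with fromJSON(format('[{0}]', ...)). The same transformation, sketched in Python for clarity (illustrative only, not part of the workflow):

    import json

    datasets = '"doeiraec","doelead","eiamecs","epapcap"'
    # format('[{0}]', inputs.datasets) wraps the raw string in brackets;
    # fromJSON then parses it as a JSON array -> one matrix job per dataset.
    matrix = json.loads(f"[{datasets}]")
    assert matrix == ["doeiraec", "doelead", "eiamecs", "epapcap"]

This is why each dataset name in the default string carries its own double quotes: the workflow string must already be valid JSON array contents before fromJSON sees it.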