Skip to content

Commit

Permalink
Download methodology PDFs
Browse files Browse the repository at this point in the history
  • Loading branch information
e-belfer committed Jan 28, 2025
1 parent ac73e65 commit 9a2a149
Showing 1 changed file with 26 additions and 0 deletions.
26 changes: 26 additions & 0 deletions src/pudl_archiver/archivers/doelead.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ async def get_resources(self) -> ArchiveAwaitable:
self.logger.info(f"Downloading: {year}, {len(filenames_links)} items")
yield self.get_year_resource(filenames_links, year)

# Download LEAD methodology PDF and other metadata separately
metadata_links = {
"lead-methodology-122024.pdf": "https://www.energy.gov/sites/default/files/2024-12/lead-methodology_122024.pdf",
"lead-tool-factsheet-072624.pdf": "https://www.energy.gov/sites/default/files/2024-07/lead-tool-factsheet_072624.pdf",
}
for filename, link in metadata_links.items():
yield self.get_metadata_resource(filename=filename, link=link)

async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceInfo:
"""Download all available data for a year.
Expand Down Expand Up @@ -110,3 +118,21 @@ async def get_year_resource(self, links: dict[str, str], year: int) -> ResourceI
partitions={"year": year},
layout=ZipLayout(file_paths=data_paths_in_archive),
)

async def get_metadata_resource(self, filename: str, link: str) -> ResourceInfo:
    """Download metadata resource.

    Resulting resource contains one PDF file with metadata about the LEAD dataset.

    Args:
        filename: name the downloaded file is saved under in the download directory
        link: URL of the file to download

    Returns:
        A ResourceInfo pointing at the downloaded file, with empty partitions.
    """
    self.logger.info(f"Downloading {link}")
    download_path = self.download_directory / filename
    await self.download_file(link, download_path)

    # Metadata documents are not tied to a data year, so no partitions are set.
    return ResourceInfo(
        local_path=download_path,
        partitions={},
    )

0 comments on commit 9a2a149

Please sign in to comment.