Extract VCE renewable generation profiles and remove deprecated gsutil from workflows #3893

Merged
26 commits merged on Oct 17, 2024
Changes from 8 commits
Commits (26)
9fadfff
Add source metadata for vceregen
aesharpe Sep 30, 2024
4036479
Add profiles to vceregen dataset name
aesharpe Sep 30, 2024
d8b992e
Remove blank line in description
aesharpe Sep 30, 2024
82be4d4
Stash WIP of extraction
e-belfer Oct 1, 2024
e044e93
Extract VCE tables to raw dask dfs
e-belfer Oct 2, 2024
57ad9a7
Clean up warnings and restore EIA 176
e-belfer Oct 2, 2024
b922328
Revert to pandas concatenation
e-belfer Oct 2, 2024
53934f2
Add latlonfips
e-belfer Oct 2, 2024
3677f78
Remove dask, coerce dtypes on read-in
e-belfer Oct 8, 2024
c870b63
override load_column_maps behavior
e-belfer Oct 8, 2024
4a4511d
Merge branch 'main' into extract-vceregen
e-belfer Oct 8, 2024
b6b5e6c
Merge branch 'main' into extract-vceregen
e-belfer Oct 11, 2024
291ba7d
Update DOI to sandbox and temporarily xfail DOI test
e-belfer Oct 11, 2024
3eaebe6
Update regen to rare
e-belfer Oct 16, 2024
b324123
Merge branch 'main' into extract-vceregen
e-belfer Oct 16, 2024
120451d
Merge branch 'extract-vceregen' of https://github.com/catalyst-cooper…
e-belfer Oct 16, 2024
5b98e60
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 16, 2024
9f6204e
Merge branch 'main' into extract-vceregen
e-belfer Oct 16, 2024
77d47a4
Update gsutil in zenodo-cache-sync
e-belfer Oct 16, 2024
adfff81
Merge branch 'extract-vceregen' of https://github.com/catalyst-cooper…
e-belfer Oct 16, 2024
6dea332
Add back user project
e-belfer Oct 16, 2024
7554a36
Update project path
e-belfer Oct 16, 2024
9ada9f5
Update project to billing project
e-belfer Oct 16, 2024
4158afd
Update dockerfile to replace gsutil with gcloud storage
e-belfer Oct 16, 2024
7f45bff
Update dataset name, use GCP_BILLING_PROJECT var
e-belfer Oct 16, 2024
e16a4df
Rename fips csv and update init
e-belfer Oct 17, 2024
1 change: 1 addition & 0 deletions src/pudl/etl/__init__.py
@@ -59,6 +59,7 @@
"raw_gridpathratoolkit": [pudl.extract.gridpathratoolkit],
"raw_phmsagas": [pudl.extract.phmsagas],
"raw_nrelatb": [pudl.extract.nrelatb],
"raw_vceregen": [pudl.extract.vceregen],
}


1 change: 1 addition & 0 deletions src/pudl/extract/__init__.py
@@ -26,5 +26,6 @@
gridpathratoolkit,
nrelatb,
phmsagas,
vceregen,
xbrl,
)
106 changes: 81 additions & 25 deletions src/pudl/extract/extractor.py
@@ -5,6 +5,7 @@
from collections import defaultdict
from typing import Any

import dask.dataframe as dd
import pandas as pd
from dagster import (
AssetsDefinition,
@@ -13,6 +14,7 @@
DynamicOutput,
In,
OpDefinition,
Out,
TypeCheckContext,
graph_asset,
op,
@@ -22,9 +24,63 @@

StrInt = str | int
PartitionSelection = list[StrInt] | tuple[StrInt] | StrInt
DataframeType = pd.DataFrame | dd.DataFrame

logger = pudl.logging_helpers.get_logger(__name__)

# Define some custom dagster data types
# 2024-03-27: Dagster can't automatically convert union types within
# parametrized types; we have to write our own custom DagsterType for now.


def _is_dict_str_strint(_context: TypeCheckContext, x: Any) -> bool:
if not isinstance(x, dict):
return False
for key, value in x.items():
if not isinstance(key, str):
return False
if not isinstance(value, str | int):
return False
return True


dagster_dict_str_strint = DagsterType(
name="dict[str, str | int]", type_check_fn=_is_dict_str_strint
)


def _is_dict_str_dataframe(_context: TypeCheckContext, x: Any) -> bool:
if not isinstance(x, dict):
return False
for key, value in x.items():
if not isinstance(key, str):
return False
if not isinstance(value, DataframeType):
return False
return True


dataframe_dagster_type = DagsterType(
name="DataFrame Type Check", type_check_fn=_is_dict_str_dataframe
)


def _is_list_dict_str_dataframe(_context: TypeCheckContext, x: Any) -> bool:
if not isinstance(x, list):
return False
for item in x:
for key, value in item.items():
if not isinstance(key, str):
return False
if not isinstance(value, DataframeType):
return False
return True


list_dataframe_dagster_type = DagsterType(
name="List DataFrame Type Check", type_check_fn=_is_list_dict_str_dataframe
)
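
As a brief aside, the sketch below illustrates how one of these custom Dagster types is attached to an op via In and Out. It is illustrative only and not part of this diff; the op name is hypothetical, and the real usage appears in concat_pages and partition_extractor_factory further down.

from dagster import In, Out, op

@op(
    ins={"dfs": In(dagster_type=dataframe_dagster_type)},
    out=Out(dagster_type=dataframe_dagster_type),
)
def example_passthrough(dfs: dict[str, DataframeType]) -> dict[str, DataframeType]:
    # The custom type_check_fn runs against both the input and the output value.
    return dfs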


class GenericMetadata:
"""Load generic metadata from Python package data.
@@ -197,7 +253,7 @@ def validate(self, df: pd.DataFrame, page: str, **partition: PartitionSelection)
f"\n{missing_raw_cols}"
)

def process_final_page(self, df: pd.DataFrame, page: str) -> pd.DataFrame:
def process_final_page(self, df: DataframeType, page: str) -> DataframeType:
"""Final processing stage applied to a page DataFrame."""
return df

@@ -214,7 +270,7 @@ def combine(self, dfs: list[pd.DataFrame], page: str) -> pd.DataFrame:

return self.process_final_page(df, page)

def extract(self, **partitions: PartitionSelection) -> dict[str, pd.DataFrame]:
def extract(self, **partitions: PartitionSelection) -> dict[str, DataframeType]:
"""Extracts dataframes.

Returns dict where keys are page names and values are
@@ -243,6 +299,7 @@ def extract(self, **partitions: PartitionSelection) -> dict[str, pd.DataFrame]:
current_page_dfs = [
pd.DataFrame(),
]

for partition in pudl.helpers.iterate_multivalue_dict(**partitions):
# we are going to skip
if self.source_filename(page, **partition) == "-1":
@@ -262,8 +319,12 @@ def extract(self, **partitions: PartitionSelection) -> dict[str, pd.DataFrame]:
return all_page_dfs


@op(tags={"memory-use": "high"})
def concat_pages(paged_dfs: list[dict[str, pd.DataFrame]]) -> dict[str, pd.DataFrame]:
@op(
tags={"memory-use": "high"},
ins={"paged_dfs": In(dagster_type=list[dataframe_dagster_type])},
out=Out(dagster_type=dataframe_dagster_type),
)
def concat_pages(paged_dfs: list[dict[str, DataframeType]]) -> dict[str, DataframeType]:
"""Concatenate similar pages of data from different years into single dataframes.

Transform a list of dictionaries of dataframes into a single dictionary of
Expand All @@ -284,39 +345,33 @@ def concat_pages(paged_dfs: list[dict[str, pd.DataFrame]]) -> dict[str, pd.DataF
A dictionary of DataFrames keyed by page name, where the DataFrame contains that
page's data from all extracted years concatenated together.
"""
# Figure out what's in each dataframe.
dtypes = [type(item) for dictionary in paged_dfs for item in dictionary.values()]

# Transform the list of dictionaries of dataframes into a dictionary of lists of
# dataframes, in which all dataframes in each list represent different instances of
# the same page of data from different years

all_data = defaultdict(list)
for dfs in paged_dfs:
for page in dfs:
all_data[page].append(dfs[page])

# concatenate the dataframes in each list in the dictionary into a single dataframe
for page in all_data:
all_data[page] = pd.concat(all_data[page]).reset_index(drop=True)
if all(x == pd.DataFrame for x in dtypes): # If all dfs are pandas dfs
logger.warn("Concatenating pandas dataframes.")
for page in all_data:
all_data[page] = pd.concat(all_data[page]).reset_index(drop=True)
elif all(x == dd.DataFrame for x in dtypes): # If all dfs are dask dfs
logger.warn("Concatenating pandas dataframes.")
for page in all_data:
all_data[page] = dd.concat(all_data[page])
else:
raise AssertionError(f"Concatenation not supported for dtypes: {dtypes}")

return all_data
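
A tiny worked example of the list-of-dicts to dict-of-concatenated-frames transform performed above; the page name and values are hypothetical and the snippet is not part of this module.

from collections import defaultdict

import pandas as pd

paged = [
    {"page_a": pd.DataFrame({"x": [1]})},  # one extracted partition (e.g. a year)
    {"page_a": pd.DataFrame({"x": [2]})},  # another partition
]
by_page = defaultdict(list)
for dfs in paged:
    for page, df in dfs.items():
        by_page[page].append(df)
combined = {
    page: pd.concat(frames, ignore_index=True) for page, frames in by_page.items()
}
# combined["page_a"] now holds the rows from both partitions in one DataFrame.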


def _is_dict_str_strint(_context: TypeCheckContext, x: Any) -> bool:
if not isinstance(x, dict):
return False
for key, value in x.items():
if not isinstance(key, str):
return False
if not isinstance(value, str | int):
return False
return True


# 2024-03-27: Dagster can't automatically convert union types within
# parametrized types; we have to write our own custom DagsterType for now.
dagster_dict_str_strint = DagsterType(
name="dict[str, str | int]", type_check_fn=_is_dict_str_strint
)


def partition_extractor_factory(
extractor_cls: type[GenericExtractor], name: str
) -> OpDefinition:
@@ -331,10 +386,11 @@ def partition_extractor_factory(
required_resource_keys={"datastore"},
name=f"extract_single_{name}_partition",
ins={"part_dict": In(dagster_type=dagster_dict_str_strint)},
out=Out(dagster_type=dataframe_dagster_type),
)
def extract_single_partition(
context, part_dict: dict[str, str | int]
) -> dict[str, pd.DataFrame]:
) -> dict[str, DataframeType]:
"""A function that extracts a year of spreadsheet data from an Excel file.

This function will be decorated with a Dagster op and returned.
183 changes: 183 additions & 0 deletions src/pudl/extract/vceregen.py
@@ -0,0 +1,183 @@
"""Extract VCE renewable generation profile data from CSVs.

This dataset has thousands of columns, so we don't want to manually specify a rename on
import; these columns will later be pivoted into a single column. We adapt the standard
extraction infrastructure to simply read in the data.

Each annual zip archive contains a folder with three files:
Wind_Power_140m_Offshore_county.csv
Wind_Power_100m_Onshore_county.csv
Fixed_SolarPV_Lat_UPV_county.csv

The drive also contains one more file: RA_county_lat_long_FIPS_table.csv. This file is
not partitioned, so we always read it in regardless of the partitions configured for the
run.
"""

from io import BytesIO

import pandas as pd
from dagster import AssetsDefinition, Output, asset

from pudl import logging_helpers
from pudl.extract.csv import CsvExtractor
from pudl.extract.extractor import GenericMetadata, PartitionSelection, raw_df_factory

logger = logging_helpers.get_logger(__name__)

VCEREGEN_PAGES = [
"offshore_wind_power_140m",
"onshore_wind_power_100m",
"fixed_solar_pv_lat_upv",
]


class VCEMetadata(GenericMetadata):
"""Special metadata class for VCE renewable generation profiles."""

def __init__(self, *args, **kwargs):
"""Initialize the module.

Args:
ds (:class:datastore.Datastore): Initialized datastore.
"""
super().__init__(*args, **kwargs)
self._file_name = self._load_csv(self._pkg, "file_map.csv")

def get_all_pages(self) -> list[str]:
"""Hard code the page names, which usually are pulled from column rename spreadsheets."""
return VCEREGEN_PAGES

def get_file_name(self, page, **partition):
"""Returns file name of given partition and page."""
return self._file_name.loc[page, str(self._get_partition_selection(partition))]


class Extractor(CsvExtractor):
"""Extractor for VCE renewable generation profiles."""

def __init__(self, *args, **kwargs):
"""Initialize the module.

Args:
ds (:class:datastore.Datastore): Initialized datastore.
"""
self.METADATA = VCEMetadata("vceregen")
super().__init__(*args, **kwargs)

def get_column_map(self, page, **partition):
"""Return empty dictionary, we don't rename these files."""
return {}

def source_filename(self, page: str, **partition: PartitionSelection) -> str:
"""Produce the CSV file name as it will appear in the archive.

The files are nested in an additional folder with the year name inside of the
zipfile, so we add a prefix folder based on the yearly partition to the source
filename.

Args:
page: pudl name for the dataset contents, eg "boiler_generator_assn" or
"coal_stocks"
partition: partition to load. Examples:
{'year': 2009}
{'year_month': '2020-08'}

Returns:
string name of the CSV file
"""
return f"{partition['year']}/{self._metadata.get_file_name(page, **partition)}"

def load_source(self, page: str, **partition: PartitionSelection) -> pd.DataFrame:
"""Produce the dataframe object for the given partition.

Args:
page: pudl name for the dataset contents, eg "boiler_generator_assn" or
"data"
partition: partition to load. Examples:
{'year': 2009}
{'year_month': '2020-08'}

Returns:
pd.DataFrame instance containing CSV data
"""
with self.ds.get_zipfile_resource(self._dataset_name, **partition) as zf:
# Get list of file names in the zipfile
files = zf.namelist()
# Get the particular file of interest
file = next(
(x for x in files if self.source_filename(page, **partition) in x), None
)
# Read it in using pandas
df = pd.read_csv(BytesIO(zf.read(file)), **self.READ_CSV_KWARGS)

return df

def process_raw(
self, df: pd.DataFrame, page: str, **partition: PartitionSelection
) -> pd.DataFrame:
"""Append report year to df to distinguish data from other years."""
self.cols_added.append("report_year")
selection = self._metadata._get_partition_selection(partition)
return df.assign(report_year=selection)

def validate(
self, df: pd.DataFrame, page: str, **partition: PartitionSelection
) -> pd.DataFrame:
"""Skip this step, as we aren't renaming any columns."""
return df

def combine(self, dfs: list[pd.DataFrame], page: str) -> pd.DataFrame:
"""Concatenate dataframes into one, take any special steps for processing final page."""
# dfs = [dd.from_pandas(df, npartitions=2) for df in dfs]
# df = dd.concat(dfs)
# # TODO: Confirm that using pandas is preferable. Otherwise revert to this code.
df = pd.concat(dfs, sort=True, ignore_index=True)

return self.process_final_page(df, page)


raw_vceregen__all_dfs = raw_df_factory(Extractor, name="vceregen")


def raw_vceregen_asset_factory(part: str) -> AssetsDefinition:
"""An asset factory for VCE hourly renewable generation profiles."""
asset_kwargs = {
"name": f"raw_vceregen__{part}",
"required_resource_keys": {"datastore", "dataset_settings"},
"compute_kind": "Python",
}

@asset(**asset_kwargs)
def _extract(context, raw_vceregen__all_dfs):
"""Extract raw GridPath RA Toolkit renewable energy generation profiles.

Args:
context: dagster keyword that provides access to resources and config.
"""
return Output(value=raw_vceregen__all_dfs[part])

return _extract


raw_vceregen_assets = [raw_vceregen_asset_factory(part) for part in VCEREGEN_PAGES]


@asset(required_resource_keys={"datastore", "dataset_settings"})
def raw_vceregen__lat_lon_fips(context) -> pd.DataFrame:
"""Extract lat/lon to FIPS and county mapping CSV.

This dataframe is static, so it has a distinct partition from the other datasets and
its extraction is controlled by a boolean in the ETL run.
"""
ds = context.resources.datastore
partition_settings = context.resources.dataset_settings.vceregen
if partition_settings.fips:
return pd.read_csv(
BytesIO(ds.get_unique_resource("vceregen", fips=partition_settings.fips))
)
return pd.DataFrame() # TODO: What makes sense here?