Skip to content

Commit

Permalink
Merge pull request #369 from deployment-gap-model-education-fund/more…
Browse files Browse the repository at this point in the history
…_docs

More docs
  • Loading branch information
bendnorman authored Oct 30, 2024
2 parents 5d2b3cf + 3307e35 commit a4ec1a5
Show file tree
Hide file tree
Showing 10 changed files with 60 additions and 29 deletions.
8 changes: 7 additions & 1 deletion src/dbcp/extract/eip_infrastructure.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
"""Retrieve data from EIP Infrastructure spreadsheets for analysis."""
"""Retrieve data from EIP Infrastructure spreadsheets for analysis.
This data was previously obtained by contacting EIP directly for the latest version, but now they
host an Excel file at oilandgaswatch.org -> Resources -> Downloads, which points to:
https://drive.google.com/drive/folders/1udtw3XeezA5Lkb8Mfc_cntNTcV4oPuKi
Note that this new data version has changed structure from the one extracted below.
"""
from pathlib import Path
from typing import Dict

Expand Down
2 changes: 1 addition & 1 deletion src/dbcp/extract/epa_avert.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Retrieve data from EIP Infrastructure spreadsheets for analysis."""
"""Retrieve data from EPA AVERT avoided carbon modeling."""
from pathlib import Path

import pandas as pd
Expand Down
9 changes: 3 additions & 6 deletions src/dbcp/extract/fips_tables.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Extract canonical state and county FIPS tables from the addfips library."""
"""Extract canonical state and county FIPS tables."""
from functools import lru_cache
from importlib.resources import files
from typing import Dict
Expand Down Expand Up @@ -41,10 +41,7 @@ def extract_census_tribal_land(archive_uri: str) -> pd.DataFrame:


def _extract_state_fips() -> pd.DataFrame:
"""Extract canonical state and county FIPS tables from census data and the addfips library.
Args:
vintage (int, optional): which Census year to use. Defaults to FIPS_CODE_VINTAGE.
"""Extract canonical state FIPS tables from the addfips library.
Returns:
Dict[str, pd.DataFrame]: output dictionary of dataframes
Expand All @@ -56,7 +53,7 @@ def _extract_state_fips() -> pd.DataFrame:


def extract_fips(census_uri: str) -> Dict[str, pd.DataFrame]:
"""Extract canonical state and county FIPS tables from census data and the addfips library.
"""Extract canonical FIPS tables from census data and the addfips library.
Returns:
Dict[str, pd.DataFrame]: output dictionary of dataframes
Expand Down
4 changes: 2 additions & 2 deletions src/dbcp/extract/lbnl_iso_queue.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Retrieve data from the 20201 LBNL ISO Queue spreadsheet for analysis."""
"""Retrieve data from the LBNL ISO Queue spreadsheet."""
from typing import Dict

import pandas as pd
Expand All @@ -10,7 +10,7 @@ def extract(uri: str) -> Dict[str, pd.DataFrame]:
"""Read Excel file with LBNL ISO Queue dataset.
Args:
uri: uri of data in GCS relatives to the root.
uri: uri of data in GCS relative to the root.
Returns:
dfs: dictionary of dataframe name to raw dataframe.
Expand Down
7 changes: 6 additions & 1 deletion src/dbcp/extract/local_opposition.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
"""Extraction logic for Columbia Local Opposition dataset."""
"""Extraction logic for Columbia Local Opposition dataset.
This dataset is a .docx file with a hierarchical structure. The hierarchy is denoted by
formatting details (paragraph level, font, etc), but is surprisingly consistent. It is
infrequently updated by a research group at Columbia University.
"""
from pathlib import Path
from typing import Dict, List, Optional

Expand Down
9 changes: 8 additions & 1 deletion src/dbcp/extract/protected_area_by_county.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
"""Extract data from USGS PAD-US intersected with TIGER county shapefiles."""
"""Extract data from USGS PAD-US intersected with TIGER county shapefiles.
This data is derived from the Protected Areas Database of the United States (PAD-US) and
intersected with TIGER county shapefiles. It was prototyped in a notebook but was never
moved into a standalone module. The data loaded here is created in
notebooks/23-tpb-check_federal_lands.ipynb. Ideally this data would be re-created in
a module and loaded here, with a disk cache if necessary for performance.
"""
from pathlib import Path

import pandas as pd
Expand Down
5 changes: 4 additions & 1 deletion src/dbcp/extract/rmi_energy_communities.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
"""Extract data from RMI's energy communities analysis."""
"""Extract data from RMI/Catalyst energy communities analysis.
Source repo: https://github.com/catalyst-cooperative/rmi-energy-communities
"""
from pathlib import Path

import pandas as pd
Expand Down
41 changes: 27 additions & 14 deletions src/dbcp/transform/eip_infrastructure.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ def _format_column_names(cols: Sequence[str]) -> List[str]:


def _fix_erroneous_array_items(ser: pd.Series, split_on=",", regex=False) -> pd.Series:
"""Split on commas, preserve only the first value, and cast to numeric.
"""Split on a delimiter and preserve only the first value.
Several columns in EIP data should be numeric types but a small number of erroneous
values forces them to object dtype. The erroneous pattern is for the number to simply
be duplicated as a CSV string. For example, 0.2 appears as '0.2, 0.2'.
values forces them to object dtype. The erroneous pattern is for the value to be
duplicated as a CSV string. For example, 0.2 appears as '0.2, 0.2'.
Args:
ser (pd.Series): values to fix
Expand Down Expand Up @@ -120,9 +120,8 @@ def facilities_transform(raw_fac_df: pd.DataFrame) -> pd.DataFrame:
"raw_wastewater_discharge_indicator",
]
for col in should_be_numeric:
if not pd.api.types.is_numeric_dtype(fac[col]):
new = _fix_erroneous_array_items(fac[col])
fac[col] = pd.to_numeric(new, errors="raise")
new = _fix_erroneous_array_items(fac[col])
fac[col] = pd.to_numeric(new, errors="raise")

fac.loc[:, "is_ccs"] = _convert_string_to_boolean(fac.loc[:, "raw_is_ccs"])

Expand Down Expand Up @@ -161,8 +160,8 @@ def facilities_transform(raw_fac_df: pd.DataFrame) -> pd.DataFrame:
)

duplicative_columns = [ # these are raw names
# These columns are just a concatenation of the names and IDs corresponding to the ID columns
# They add no information and invite inconsistency
# These columns are just a concatenation of the names and IDs corresponding to
# the ID columns. They add no information and invite inconsistency
"Company",
"Project",
"Associated Facilities",
Expand Down Expand Up @@ -236,10 +235,9 @@ def projects_transform(raw_proj_df: pd.DataFrame) -> pd.DataFrame:
]
for col in should_be_numeric:
# these columns suffer from occasional duplicate values as CSV for some reason.
# Like "1.0, 1.0". The second number is never different.
if not pd.api.types.is_numeric_dtype(proj[col]):
new = _fix_erroneous_array_items(proj[col])
proj[col] = pd.to_numeric(new, errors="raise")
    # Like "1.0, 1.0". The second number is never different. TODO: validate this assumption.
new = _fix_erroneous_array_items(proj[col])
proj[col] = pd.to_numeric(new, errors="raise")

proj.loc[:, "is_ccs"] = _convert_string_to_boolean(proj.loc[:, "raw_is_ccs"])
proj.loc[:, "is_ally_target"] = _convert_string_to_boolean(
Expand All @@ -248,10 +246,13 @@ def projects_transform(raw_proj_df: pd.DataFrame) -> pd.DataFrame:

# manual correction for project with 92 Billion dollar cost (lol). Googled it and
# it was supposed to be 9.2 Billion
proj.loc[
to_correct = proj.loc[
proj["name"].eq("Gron Fuels' Renewable Fuels Plant - Initial Construction"),
"cost_millions",
] *= 0.1
]
assert len(to_correct) == 1, "Expected one project to correct."
assert to_correct.ge(9000).all(), "Expected erroneous cost over 9 billion."
to_correct *= 0.1
# manual fix. One project's facility id doesn't exist. The project is the Oil part
# of the willow Project. The next project ID belongs to the gas part, and its
# facility ID does exist. So I assign the oil facility ID to the gas facility ID.
Expand Down Expand Up @@ -442,3 +443,15 @@ def transform(raw_eip_dfs: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
}

return out


if __name__ == "__main__":
    # Debugging entry point: run the EIP extract + transform pipeline end to end
    # against a local copy of the raw spreadsheet.
    from pathlib import Path

    from dbcp.extract.eip_infrastructure import extract

    # Hard-coded raw data location used only for interactive debugging.
    raw_excel_path = Path("/app/data/raw/2023.05.24 OGW database.xlsx")
    raw_dataframes = extract(raw_excel_path)
    transformed_dataframes = transform(raw_dataframes)
    print("yay")
2 changes: 1 addition & 1 deletion src/dbcp/transform/fips_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def _add_tribal_land_frac(
dissolved_tribal = tribal_land.dissolve()
dissolved_tribal_geometry = dissolved_tribal.geometry.iloc[0]

# Calculate intersection, convert to km
# Calculate intersection, convert m^2 to km^2
counties["tribal_land_intersection"] = (
counties.intersection(dissolved_tribal_geometry).area / 1e6
)
Expand Down
2 changes: 1 addition & 1 deletion src/dbcp/transform/justice40.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def transform(raw_j40: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
out_df.drop(columns="", inplace=True)
out_df.loc[:, "tract_id_fips"] = _fips_int_to_string(out_df.loc[:, "tract_id_fips"])

# Correct percents
# Correct percentage errors and convert to fractions
percent_cols = list(filter(lambda col: col.endswith("_percent"), list(out_df)))
for col in percent_cols:
col_max = out_df[col].max()
Expand Down

0 comments on commit a4ec1a5

Please sign in to comment.