Skip to content

Commit

Permalink
Merge pull request #369 from deployment-gap-model-education-fund/more…
Browse files Browse the repository at this point in the history
…_docs

More docs
  • Loading branch information
bendnorman authored Oct 30, 2024
2 parents 5d2b3cf + 3307e35 commit a4ec1a5
Show file tree
Hide file tree
Showing 10 changed files with 60 additions and 29 deletions.
8 changes: 7 additions & 1 deletion src/dbcp/extract/eip_infrastructure.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
"""Retrieve data from EIP Infrastructure spreadsheets for analysis."""
"""Retrieve data from EIP Infrastructure spreadsheets for analysis.
This data was previously obtained by contacting EIP directly for the latest version, but now they
host an Excel file at oilandgaswatch.org -> Resources -> Downloads, which points to:
https://drive.google.com/drive/folders/1udtw3XeezA5Lkb8Mfc_cntNTcV4oPuKi
Note that this new data version has changed structure from the one extracted below.
"""
from pathlib import Path
from typing import Dict

Expand Down
2 changes: 1 addition & 1 deletion src/dbcp/extract/epa_avert.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Retrieve data from EIP Infrastructure spreadsheets for analysis."""
"""Retrieve data from EPA AVERT avoided carbon modeling."""
from pathlib import Path

import pandas as pd
Expand Down
9 changes: 3 additions & 6 deletions src/dbcp/extract/fips_tables.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Extract canonical state and county FIPS tables from the addfips library."""
"""Extract canonical state and county FIPS tables."""
from functools import lru_cache
from importlib.resources import files
from typing import Dict
Expand Down Expand Up @@ -41,10 +41,7 @@ def extract_census_tribal_land(archive_uri: str) -> pd.DataFrame:


def _extract_state_fips() -> pd.DataFrame:
"""Extract canonical state and county FIPS tables from census data and the addfips library.
Args:
vintage (int, optional): which Census year to use. Defaults to FIPS_CODE_VINTAGE.
"""Extract canonical state FIPS tables from the addfips library.
Returns:
Dict[str, pd.DataFrame]: output dictionary of dataframes
Expand All @@ -56,7 +53,7 @@ def _extract_state_fips() -> pd.DataFrame:


def extract_fips(census_uri: str) -> Dict[str, pd.DataFrame]:
"""Extract canonical state and county FIPS tables from census data and the addfips library.
"""Extract canonical FIPS tables from census data and the addfips library.
Returns:
Dict[str, pd.DataFrame]: output dictionary of dataframes
Expand Down
4 changes: 2 additions & 2 deletions src/dbcp/extract/lbnl_iso_queue.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Retrieve data from the 20201 LBNL ISO Queue spreadsheet for analysis."""
"""Retrieve data from the LBNL ISO Queue spreadsheet."""
from typing import Dict

import pandas as pd
Expand All @@ -10,7 +10,7 @@ def extract(uri: str) -> Dict[str, pd.DataFrame]:
"""Read Excel file with LBNL ISO Queue dataset.
Args:
uri: uri of data in GCS relatives to the root.
uri: uri of data in GCS relative to the root.
Returns:
dfs: dictionary of dataframe name to raw dataframe.
Expand Down
7 changes: 6 additions & 1 deletion src/dbcp/extract/local_opposition.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
"""Extraction logic for Columbia Local Opposition dataset."""
"""Extraction logic for Columbia Local Opposition dataset.
This dataset is a .docx file with a hierarchical structure. The hierarchy is denoted by
formatting details (paragraph level, font, etc), but is surprisingly consistent. It is
infrequently updated by a research group at Columbia University.
"""
from pathlib import Path
from typing import Dict, List, Optional

Expand Down
9 changes: 8 additions & 1 deletion src/dbcp/extract/protected_area_by_county.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
"""Extract data from USGS PAD-US intersected with TIGER county shapefiles."""
"""Extract data from USGS PAD-US intersected with TIGER county shapefiles.
This data is derived from the Protected Areas Database of the United States (PAD-US) and
intersected with TIGER county shapefiles. It was prototyped in a notebook but was never
moved into a standalone module. The data loaded here is created in
notebooks/23-tpb-check_federal_lands.ipynb. Ideally this data would be re-created in
a module and loaded here, with a disk cache if necessary for performance.
"""
from pathlib import Path

import pandas as pd
Expand Down
5 changes: 4 additions & 1 deletion src/dbcp/extract/rmi_energy_communities.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
"""Extract data from RMI's energy communities analysis."""
"""Extract data from RMI/Catalyst energy communities analysis.
Source repo: https://github.com/catalyst-cooperative/rmi-energy-communities
"""
from pathlib import Path

import pandas as pd
Expand Down
41 changes: 27 additions & 14 deletions src/dbcp/transform/eip_infrastructure.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ def _format_column_names(cols: Sequence[str]) -> List[str]:


def _fix_erroneous_array_items(ser: pd.Series, split_on=",", regex=False) -> pd.Series:
"""Split on commas, preserve only the first value, and cast to numeric.
"""Split on a delimiter and preserve only the first value.
Several columns in EIP data should be numeric types but a small number of erroneous
values forces them to object dtype. The erroneous pattern is for the number to simply
be duplicated as a CSV string. For example, 0.2 appears as '0.2, 0.2'.
values forces them to object dtype. The erroneous pattern is for the value to be
duplicated as a CSV string. For example, 0.2 appears as '0.2, 0.2'.
Args:
ser (pd.Series): values to fix
Expand Down Expand Up @@ -120,9 +120,8 @@ def facilities_transform(raw_fac_df: pd.DataFrame) -> pd.DataFrame:
"raw_wastewater_discharge_indicator",
]
for col in should_be_numeric:
if not pd.api.types.is_numeric_dtype(fac[col]):
new = _fix_erroneous_array_items(fac[col])
fac[col] = pd.to_numeric(new, errors="raise")
new = _fix_erroneous_array_items(fac[col])
fac[col] = pd.to_numeric(new, errors="raise")

fac.loc[:, "is_ccs"] = _convert_string_to_boolean(fac.loc[:, "raw_is_ccs"])

Expand Down Expand Up @@ -161,8 +160,8 @@ def facilities_transform(raw_fac_df: pd.DataFrame) -> pd.DataFrame:
)

duplicative_columns = [ # these are raw names
# These columns are just a concatenation of the names and IDs corresponding to the ID columns
# They add no information and invite inconsistency
# These columns are just a concatenation of the names and IDs corresponding to
# the ID columns. They add no information and invite inconsistency
"Company",
"Project",
"Associated Facilities",
Expand Down Expand Up @@ -236,10 +235,9 @@ def projects_transform(raw_proj_df: pd.DataFrame) -> pd.DataFrame:
]
for col in should_be_numeric:
# these columns suffer from occasional duplicate values as CSV for some reason.
# Like "1.0, 1.0". The second number is never different.
if not pd.api.types.is_numeric_dtype(proj[col]):
new = _fix_erroneous_array_items(proj[col])
proj[col] = pd.to_numeric(new, errors="raise")
    # Like "1.0, 1.0". The second number is never different. TODO: validate this assumption.
new = _fix_erroneous_array_items(proj[col])
proj[col] = pd.to_numeric(new, errors="raise")

proj.loc[:, "is_ccs"] = _convert_string_to_boolean(proj.loc[:, "raw_is_ccs"])
proj.loc[:, "is_ally_target"] = _convert_string_to_boolean(
Expand All @@ -248,10 +246,13 @@ def projects_transform(raw_proj_df: pd.DataFrame) -> pd.DataFrame:

# manual correction for project with 92 Billion dollar cost (lol). Googled it and
# it was supposed to be 9.2 Billion
proj.loc[
to_correct = proj.loc[
proj["name"].eq("Gron Fuels' Renewable Fuels Plant - Initial Construction"),
"cost_millions",
] *= 0.1
]
assert len(to_correct) == 1, "Expected one project to correct."
assert to_correct.ge(9000).all(), "Expected erroneous cost over 9 billion."
to_correct *= 0.1
# manual fix. One project's facility id doesn't exist. The project is the Oil part
# of the willow Project. The next project ID belongs to the gas part, and its
# facility ID does exist. So I assign the oil facility ID to the gas facility ID.
Expand Down Expand Up @@ -442,3 +443,15 @@ def transform(raw_eip_dfs: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
}

return out


if __name__ == "__main__":
    # Debugging entry point: run the EIP extract + transform pipeline end to end
    # against a local copy of the raw spreadsheet.
    from pathlib import Path

    from dbcp.extract.eip_infrastructure import extract

    # Hard-coded raw data location used only for interactive debugging.
    raw_excel_path = Path("/app/data/raw/2023.05.24 OGW database.xlsx")
    raw_dataframes = extract(raw_excel_path)
    transformed_dataframes = transform(raw_dataframes)
    print("yay")
2 changes: 1 addition & 1 deletion src/dbcp/transform/fips_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def _add_tribal_land_frac(
dissolved_tribal = tribal_land.dissolve()
dissolved_tribal_geometry = dissolved_tribal.geometry.iloc[0]

# Calculate intersection, convert to km
# Calculate intersection, convert m^2 to km^2
counties["tribal_land_intersection"] = (
counties.intersection(dissolved_tribal_geometry).area / 1e6
)
Expand Down
2 changes: 1 addition & 1 deletion src/dbcp/transform/justice40.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def transform(raw_j40: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
out_df.drop(columns="", inplace=True)
out_df.loc[:, "tract_id_fips"] = _fips_int_to_string(out_df.loc[:, "tract_id_fips"])

# Correct percents
# Correct percentage errors and convert to fractions
percent_cols = list(filter(lambda col: col.endswith("_percent"), list(out_df)))
for col in percent_cols:
col_max = out_df[col].max()
Expand Down

0 comments on commit a4ec1a5

Please sign in to comment.