Skip to content

Commit

Permalink
Merge branch 'issue-3770-transform-phmsagas-data' of https://github.c…
Browse files Browse the repository at this point in the history
…om/seeess1/pudl into issue-3770-transform-phmsagas-data
  • Loading branch information
seeess1 committed Dec 6, 2024
2 parents 9f57d77 + 05cc8ce commit 94d4d5d
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 15 deletions.
4 changes: 2 additions & 2 deletions src/pudl/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2261,8 +2261,8 @@ def analyze_missing_values(
PLEASE NOTE: No calls to this method should be included in any final
transformation scripts. This is purely for analysis and does not perform
any data transformation or cleaning.
any data transformation or cleaning.
This function checks each column for missing or custom missing values
and logs a summary of the findings for string (object), numeric, and
datetime columns.
Expand Down
34 changes: 21 additions & 13 deletions src/pudl/transform/phmsagas.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Classes & functions to process PHMSA natural gas data before loading into the PUDL DB."""

from dataclasses import dataclass

import pandas as pd
from dagster import AssetCheckResult, AssetChecksDefinition, AssetIn, asset, asset_check
from dataclasses import dataclass

import pudl.logging_helpers
from pudl.helpers import (
Expand Down Expand Up @@ -70,13 +71,13 @@
"services_efv_installed",
"services_shutoff_valve_in_system",
"services_shutoff_valve_installed",
"federal_land_leaks_repaired_or_scheduled"
"federal_land_leaks_repaired_or_scheduled",
],
"capitalization_exclusion": [
"headquarters_address_state",
"headquarters_address_state",
"office_address_state",
"preparer_email"
]
"preparer_email",
],
}

##############################################################################
Expand All @@ -92,8 +93,8 @@
def core_phmsagas__yearly_distribution_operators(
raw_data: pd.DataFrame,
) -> pd.DataFrame:
"""Pull and transform the yearly distribution PHMSA data into operator-level data.
"""Pull and transform the yearly distribution PHMSA data into operator-level data.
Transformations include:
* Standardize NAs.
Expand Down Expand Up @@ -178,11 +179,12 @@ def core_phmsagas__yearly_distribution_operators(

# Standardize telephone and fax number format and drop (000)-000-0000
df = standardize_phone_column(df, ["preparer_phone", "preparer_fax"])

pdb.set_trace()

return df


@dataclass
class PhmsagasCheckSpec:
"""Define some simple checks that can run on PHMSA gas assets."""
Expand All @@ -191,27 +193,33 @@ class PhmsagasCheckSpec:
asset: str
percent_unaccounted_for_gas_negative_threshold: float


check_specs = [
PhmsagasCheckSpec(
name="phmsagas__yearly_distribution_check_spec",
asset="raw_phmsagas__yearly_distribution",
# Threshold to use when making sure we aren't seeing tons of negative values in percent_unaccounted_for_gas
percent_unaccounted_for_gas_negative_threshold = .05
percent_unaccounted_for_gas_negative_threshold=0.05,
)
]

def make_check_phmsagas_yearly_distribution(spec: PhmsagasCheckSpec) -> AssetChecksDefinition:

def make_check_phmsagas_yearly_distribution(
spec: PhmsagasCheckSpec,
) -> AssetChecksDefinition:
"""Turn the PhmsagasCheckSpec into an actual Dagster asset check."""

@asset_check(asset=spec.asset, blocking=True)
def _check(df):

# Count the rows where percent_unaccounted_for_gas is negative
negative_count = (df["percent_unaccounted_for_gas"] < 0).sum()

# Calculate the percentage
negative_percentage = (negative_count / len(df))
if negative_percentage > PhmsagasCheckSpec.percent_unaccounted_for_gas_negative_threshold:
negative_percentage = negative_count / len(df)
if (
negative_percentage
> PhmsagasCheckSpec.percent_unaccounted_for_gas_negative_threshold
):
error = "Percentage of rows with negative percent_unaccounted_for_gas values: {negative_percentage:.2f}"
logger.info(error)

Expand Down

0 comments on commit 94d4d5d

Please sign in to comment.