From e4e4095f587c7b2e91daad9fc815a78918c82068 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 3 Oct 2023 10:36:52 -0600 Subject: [PATCH 01/37] Split calculation checks into 3 steps; make calculation_tolerance TransformParam --- src/pudl/output/ferc1.py | 51 ++++----- src/pudl/transform/ferc1.py | 161 ++++++++++++++++++++--------- src/pudl/transform/params/ferc1.py | 12 ++- 3 files changed, 140 insertions(+), 84 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 51d0c0bfb0..e86de67b00 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -12,42 +12,23 @@ from matplotlib import pyplot as plt from networkx.drawing.nx_agraph import graphviz_layout from pandas._libs.missing import NAType as pandas_NAType -from pydantic import BaseModel, confloat, validator +from pydantic import BaseModel, validator import pudl +from pudl.transform.ferc1 import CalculationTolerance logger = pudl.logging_helpers.get_logger(__name__) -class CalculationToleranceFerc1(BaseModel): - """Data quality expectations related to FERC 1 calculations. - - We are doing a lot of comparisons between calculated and reported values to identify - reporting errors in the data, errors in FERC's metadata, and bugs in our own code. - This class provides a structure for encoding our expectations about the level of - acceptable (or at least expected) errors, and allows us to pass them around. - - In the future we might also want to specify much more granular expectations, - pertaining to individual tables, years, utilities, or facts to ensure that we don't - have low overall error rates, but a problem with the way the data or metadata is - reported in a particular year. We could also define per-filing and per-table error - tolerances to help us identify individual utilities that have e.g. used an outdated - version of Form 1 when filing. - """ - - intertable_calculation_errors: confloat(ge=0.0, le=1.0) = 0.05 - """Fraction of interatble calculations that are allowed to not match exactly.""" - - -EXPLOSION_CALCULATION_TOLERANCES: dict[str, CalculationToleranceFerc1] = { - "income_statement_ferc1": CalculationToleranceFerc1( - intertable_calculation_errors=0.20, +EXPLOSION_CALCULATION_TOLERANCES: dict[str, CalculationTolerance] = { + "income_statement_ferc1": CalculationTolerance( + bulk_error_rate=0.20, ), - "balance_sheet_assets_ferc1": CalculationToleranceFerc1( - intertable_calculation_errors=0.65, + "balance_sheet_assets_ferc1": CalculationTolerance( + bulk_error_rate=0.65, ), - "balance_sheet_liabilities_ferc1": CalculationToleranceFerc1( - intertable_calculation_errors=0.07, + "balance_sheet_liabilities_ferc1": CalculationTolerance( + bulk_error_rate=0.07, ), } @@ -980,7 +961,7 @@ def exploded_table_asset_factory( root_table: str, table_names_to_explode: list[str], seed_nodes: list[NodeId], - calculation_tolerance: CalculationToleranceFerc1, + calculation_tolerance: CalculationTolerance, io_manager_key: str | None = None, ) -> AssetsDefinition: """Create an exploded table based on a set of related input tables.""" @@ -1112,7 +1093,7 @@ def __init__( calculation_components_xbrl_ferc1: pd.DataFrame, seed_nodes: list[NodeId], tags: pd.DataFrame = pd.DataFrame(), - calculation_tolerance: CalculationToleranceFerc1 = CalculationToleranceFerc1(), + calculation_tolerance: CalculationTolerance = CalculationTolerance(), ): """Instantiate an Exploder class. 
@@ -1475,9 +1456,13 @@ def generate_intertable_calculations(
         calculated_df = pudl.transform.ferc1.check_calculation_metrics(
             calculated_df=calculated_df,
             value_col=self.value_col,
-            calculation_tolerance=self.calculation_tolerance.intertable_calculation_errors,
+            calculation_tolerance=self.calculation_tolerance.bulk_error_rate,
+            table_name=self.root_table,
+        )
+        calculated_df = pudl.transform.ferc1.add_corrections(
+            calculated_df=calculated_df,
+            value_col=self.value_col,
             table_name=self.root_table,
-            add_corrections=True,
         )
         return calculated_df
 
@@ -1529,7 +1514,7 @@ class XbrlCalculationForestFerc1(BaseModel):
     exploded_calcs: pd.DataFrame = pd.DataFrame()
     seeds: list[NodeId] = []
     tags: pd.DataFrame = pd.DataFrame()
-    calculation_tolerance: CalculationToleranceFerc1 = CalculationToleranceFerc1()
+    calculation_tolerance: CalculationTolerance = CalculationTolerance()
 
     class Config:
         """Allow the class to store a dataframe."""
diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py
index a0dde28010..b5fc3c9cff 100644
--- a/src/pudl/transform/ferc1.py
+++ b/src/pudl/transform/ferc1.py
@@ -21,7 +21,7 @@
 import sqlalchemy as sa
 from dagster import AssetIn, AssetsDefinition, asset
 from pandas.core.groupby import DataFrameGroupBy
-from pydantic import validator
+from pydantic import confloat, validator
 
 import pudl
 from pudl.analysis.classify_plants_ferc1 import (
@@ -726,6 +726,35 @@ def combine_axis_columns_xbrl(
     return df
 
 
+class CalculationTolerance(TransformParams):
+    """Data quality expectations related to FERC 1 calculations.
+
+    We are doing a lot of comparisons between calculated and reported values to identify
+    reporting errors in the data, errors in FERC's metadata, and bugs in our own code.
+    This class provides a structure for encoding our expectations about the level of
+    acceptable (or at least expected) errors, and allows us to pass them around.
+
+    In the future we might also want to specify much more granular expectations,
+    pertaining to individual tables, years, utilities, or facts to ensure that we don't
+    have low overall error rates, but a problem with the way the data or metadata is
+    reported in a particular year. We could also define per-filing and per-table error
+    tolerances to help us identify individual utilities that have e.g. used an outdated
+    version of Form 1 when filing.
+
+    NOTE: It may make sense to consolidate with :class:`ReconcileTableCalculations`
+    once the refactoring of the subtotals and the calculation corrections is done.
+    """
+
+    isclose_rtol: confloat(ge=0.0, le=1e-2) = 1e-5
+    """Relative tolerance to use in :func:`np.isclose` for determining equality."""
+
+    isclose_atol: confloat(ge=0.0) = 1e-8
+    """Absolute tolerance to use in :func:`np.isclose` for determining equality."""
+
+    bulk_error_rate: confloat(ge=0.0, le=1.0) = 0.05
+    """Fraction of intertable calculations that are allowed to not match exactly."""
+
+
 class ReconcileTableCalculations(TransformParams):
     """Parameters for reconciling xbrl-metadata based calculations within a table."""
 
@@ -735,7 +764,7 @@ class ReconcileTableCalculations(TransformParams):
 
     This will typically be ``dollar_value`` or ``ending_balance`` column for the
     income statement and the balance sheet tables.
""" - calculation_tolerance: float = 0.05 + calculation_tolerance: CalculationTolerance = CalculationTolerance() """Fraction of calculated values which we allow not to match reported values.""" subtotal_column: str | None = None @@ -752,7 +781,6 @@ def reconcile_table_calculations( xbrl_factoid_name: str, table_name: str, params: ReconcileTableCalculations, - add_corrections: bool = True, ) -> pd.DataFrame: """Ensure intra-table calculated values match reported values within a tolerance. @@ -769,13 +797,11 @@ def reconcile_table_calculations( do not fail the :func:`numpy.isclose()` test and so are not corrected. Args: - df: processed table. + df: processed table containing data values to check. calculation_components: processed calculation component metadata. xbrl_factoid_name: column name of the XBRL factoid in the processed table. table_name: name of the PUDL table. params: :class:`ReconcileTableCalculations` parameters. - add_corrections: Whether or not to create _correction records that force all - calculations to add up correctly. """ # If we don't have this value, we aren't doing any calculation checking: if params.column_to_check is None or calculation_components.empty: @@ -851,7 +877,11 @@ def reconcile_table_calculations( value_col=params.column_to_check, calculation_tolerance=params.calculation_tolerance, table_name=table_name, - add_corrections=add_corrections, + ) + calculated_df = add_corrections( + calculated_df=calculated_df, + value_col=params.column_to_check, + table_name=table_name, ).rename(columns={"xbrl_factoid": xbrl_factoid_name}) # Check that sub-total calculations sum to total. @@ -878,13 +908,13 @@ def reconcile_table_calculations( ) if sub_total_errors.ngroups > 0: logger.warning( - f"{table_name}: has {sub_total_errors.ngroups} ({off_ratio_sub:.02%}) sub-total calculations that don't " - "sum to the equivalent total column." + f"{table_name}: has {sub_total_errors.ngroups} ({off_ratio_sub:.02%}) " + "sub-total calculations that don't sum to the equivalent total column." ) if off_ratio_sub > params.subtotal_calculation_tolerance: raise AssertionError( - f"Sub-total calculations in {table_name} are off by {off_ratio_sub}. Expected tolerance " - f"of {params.subtotal_calculation_tolerance}." + f"Sub-total calculations in {table_name} are off by {off_ratio_sub}. " + f"Expected tolerance of {params.subtotal_calculation_tolerance}." ) return calculated_df @@ -906,8 +936,6 @@ def calculate_values_from_components( data: exploded FERC data to apply the calculations to. Primary key should be ``report_year``, ``utility_id_ferc1``, ``table_name``, ``xbrl_factoid``, and whatever additional dimensions are relevant to the data. - validate: type of merge validation to apply when initially merging the calculation - components (left) and the data (right). calc_idx: primary key columns that uniquely identify a calculation component (not including the ``_parent`` columns). value_col: label of the column in ``data`` that contains the values to apply the @@ -959,7 +987,7 @@ def calculate_values_from_components( ].empty calculated_df = calculated_df.drop(columns=["_merge"]) - # # Force value_col to be a float to prevent any hijinks with calculating differences. + # Force value_col to be a float to prevent any hijinks with calculating differences. 
calculated_df[value_col] = calculated_df[value_col].astype(float)
     return calculated_df
 
@@ -967,9 +995,8 @@
 def check_calculation_metrics(
     calculated_df: pd.DataFrame,
     value_col: str,
-    calculation_tolerance: float,
+    calculation_tolerance: CalculationTolerance,
     table_name: str,
-    add_corrections: bool = True,
 ) -> pd.DataFrame:
     """Run the calculation metrics and determine if calculations are within tolerance."""
     # Data types were very messy here, including pandas Float64 for the
@@ -987,49 +1014,88 @@ def check_calculation_metrics(
         ),
     )
 
-    off_df = calculated_df[
+    # DO ERROR CHECKS
+    # off_df is pretty specific to the one check that we're doing now, but is also
+    # useful in creating the corrections later on.
+    # Find a simpler way to check this particular metric, and defer the creation of
+    # this metric until the new add_corrections() function.
+
+    # Identify records where the calculated and reported values are not equal within
+    # some tolerance (defined here by the np.isclose defaults). Ignore records for
+    # which the absolute difference "abs_diff" is NA since that indicates one or the
+    # other of the calculated or reported values was NA.
+    all_errors = calculated_df[
         ~np.isclose(calculated_df.calculated_amount, calculated_df[value_col])
         & (calculated_df["abs_diff"].notnull())
     ]
-    calculated_values = calculated_df[(calculated_df.abs_diff.notnull())]
-    if calculated_values.empty:
+    # Why does it make sense to compare against records where abs_diff is non-null?
+    # Why wouldn't we want to look at records where calculated_amount is non-null?
+    non_null_calculated_df = calculated_df.dropna(subset="abs_diff")
+    if non_null_calculated_df.empty:
         # Will only occur if all reported values are NaN when calculated values
         # exist, or vice versa.
         logger.warning(
-            "Warning: No calculated values have a corresponding reported value in the table."
+            "Calculated values have no corresponding reported values in this table."
         )
-        off_ratio = np.nan
+        bulk_error_rate = np.nan
     else:
-        off_ratio = len(off_df) / len(calculated_values)
-        if off_ratio > calculation_tolerance:
-            raise AssertionError(
-                f"Calculations in {table_name} are off by {off_ratio:.2%}. Expected tolerance "
-                f"of {calculation_tolerance:.1%}."
-            )
+        bulk_error_rate = len(all_errors) / len(non_null_calculated_df)
 
-    # We'll only get here if the proportion of calculations that are off is acceptable
-    if (off_ratio > 0 or np.isnan(off_ratio)) and add_corrections:
-        logger.info(
-            f"{table_name}: has {len(off_df)} ({off_ratio:.02%}) records whose "
-            "calculations don't match. Adding correction records to make calculations "
-            "match reported values."
-        )
-        corrections = off_df.copy()
-        corrections[value_col] = (
-            corrections[value_col].fillna(0.0) - corrections["calculated_amount"]
-        )
-        corrections["original_factoid"] = corrections["xbrl_factoid"]
-        corrections["xbrl_factoid"] = corrections["xbrl_factoid"] + "_correction"
-        corrections["row_type_xbrl"] = "correction"
-        corrections["is_within_table_calc"] = False
-        corrections["record_id"] = pd.NA
-
-        calculated_df = pd.concat(
-            [calculated_df, corrections], axis="index"
-        ).reset_index()
+    if bulk_error_rate > calculation_tolerance.bulk_error_rate:
+        raise AssertionError(
+            f"Bulk error rate of calculations in {table_name} is {bulk_error_rate:.2%}. "
+            f"Accepted tolerance is {calculation_tolerance.bulk_error_rate:.1%}."
+ ) + + # Given a particular set of groupby() columns, calculate the fraction of records in + # each group whose calculations do not match reported values based on np.isclose() + # E.g. within each `report_year`. return calculated_df +def add_corrections( + calculated_df: pd.DataFrame, + value_col: str, + table_name: str, +) -> pd.DataFrame: + """Add corrections to discrepancies between reported & calculated values. + + To isolate the sources of error, and ensure that all totals add up as expected in + later phases of the transformation, we add correction records to the dataframe + which compensate for any difference between the calculated and reported values. The + ``_correction`` factoids that are added here have already been added to the + calculation components during the metadata processing. + + TODO: Pass in :class:`CalculationTolerance` and use rtol & atol in np.isclose(). + + Args: + calculated_df: DataFrame containing the data to correct. Must already have + ``abs_diff`` column that was added by :func:`check_calculation_metrics` + value_col: Label of the column whose values are being calculated. + table_name: Name of the table whose data we are working with. For logging. + """ + corrections = calculated_df[ + ~np.isclose(calculated_df["calculated_amount"], calculated_df[value_col]) + & (calculated_df["abs_diff"].notnull()) + ] + logger.info(f"{len(corrections)=}") + + corrections = corrections.assign( + value_col=lambda x: x[value_col].fillna(0.0) - x["calculated_amount"], + original_factoid=lambda x: x["xbrl_factoid"], + xbrl_factoid=lambda x: x["xbrl_factoid"] + "_correction", + row_type_xbrl="correction", + is_within_table_calc=False, + record_id=pd.NA, + ) + logger.info( + f"{table_name}: adding {len(corrections)} corrections to " + f"{len(calculated_df)} original records." + ) + + return pd.concat([calculated_df, corrections], axis="index").reset_index() + + class Ferc1TableTransformParams(TableTransformParams): """A model defining what TransformParams are allowed for FERC Form 1. @@ -2394,7 +2460,6 @@ def reconcile_table_calculations( xbrl_factoid_name=self.params.xbrl_factoid_name, table_name=self.table_id.value, params=params, - add_corrections=True, ) return df diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py index b58a0e2b6f..a389878898 100644 --- a/src/pudl/transform/params/ferc1.py +++ b/src/pudl/transform/params/ferc1.py @@ -2727,7 +2727,9 @@ # Known issue with reporting of construction in progress not classified in classified fields of table. "reconcile_table_calculations": { "column_to_check": "ending_balance", - "calculation_tolerance": 0.08, + "calculation_tolerance": { + "bulk_error_rate": 0.08, + }, }, }, "plants_pumped_storage_ferc1": { @@ -3715,7 +3717,9 @@ "strip_non_numeric_values": {"amount": {"strip_non_numeric_values": True}}, "reconcile_table_calculations": { "column_to_check": "ending_balance", - "calculation_tolerance": 0.08, + "calculation_tolerance": { + "bulk_error_rate": 0.08, + }, }, }, "income_statement_ferc1": { @@ -3982,7 +3986,9 @@ "column_to_check": "dollar_value", # Note: this table does not currently get exploded. It will require # additional debugging at a later date. - "calculation_tolerance": 0.4, + "calculation_tolerance": { + "bulk_error_rate": 0.4, + }, }, }, "electric_plant_depreciation_functional_ferc1": { From ec7bf0e5e7a872e0216c27b8c21184d0d703c6d4 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Fri, 6 Oct 2023 10:05:56 -0600 Subject: [PATCH 02/37] Add error checking functions. 
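
Each metric shares one call signature -- take a dataframe that has uniform
`reported_value`, `calculated_amount`, `abs_diff`, and `is_error` columns and
return a float -- so the same function can run on a whole table or inside a
GroupBy.apply(). A rough sketch of the intended call pattern (the data values
below are invented purely for illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "report_year": [2020, 2020, 2021, 2021],
        "reported_value": [100.0, 50.0, 10.0, np.nan],
        "calculated_amount": [100.0, 45.0, 10.0, 12.0],
    })
    df["abs_diff"] = (df["reported_value"] - df["calculated_amount"]).abs()
    df["is_error"] = (
        ~np.isclose(df["reported_value"], df["calculated_amount"])
        & df["abs_diff"].notnull()
    )

    error_frequency(df)  # 1 error out of 4 records -> 0.25
    df.groupby("report_year").apply(error_frequency)  # 2020 -> 0.5, 2021 -> 0.0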
---
 src/pudl/output/ferc1.py           |   9 +-
 src/pudl/transform/ferc1.py        | 238 +++++++++++++++++++++++++----
 src/pudl/transform/params/ferc1.py |   6 +-
 3 files changed, 216 insertions(+), 37 deletions(-)

diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py
index e86de67b00..56af38e3d1 100644
--- a/src/pudl/output/ferc1.py
+++ b/src/pudl/output/ferc1.py
@@ -22,13 +22,13 @@
 
 EXPLOSION_CALCULATION_TOLERANCES: dict[str, CalculationTolerance] = {
     "income_statement_ferc1": CalculationTolerance(
-        bulk_error_rate=0.20,
+        bulk_error_frequency=0.20,
     ),
     "balance_sheet_assets_ferc1": CalculationTolerance(
-        bulk_error_rate=0.65,
+        bulk_error_frequency=0.65,
     ),
     "balance_sheet_liabilities_ferc1": CalculationTolerance(
-        bulk_error_rate=0.07,
+        bulk_error_frequency=0.07,
     ),
 }
 
@@ -1456,12 +1456,13 @@ def generate_intertable_calculations(
         calculated_df = pudl.transform.ferc1.check_calculation_metrics(
             calculated_df=calculated_df,
             value_col=self.value_col,
-            calculation_tolerance=self.calculation_tolerance.bulk_error_rate,
+            calculation_tolerance=self.calculation_tolerance,
             table_name=self.root_table,
         )
         calculated_df = pudl.transform.ferc1.add_corrections(
             calculated_df=calculated_df,
             value_col=self.value_col,
+            calculation_tolerance=self.calculation_tolerance,
             table_name=self.root_table,
         )
         return calculated_df
diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py
index b5fc3c9cff..624fe6eb8d 100644
--- a/src/pudl/transform/ferc1.py
+++ b/src/pudl/transform/ferc1.py
@@ -13,7 +13,7 @@
 import json
 import re
 from collections import namedtuple
-from collections.abc import Mapping
+from collections.abc import Callable, Mapping
 from typing import Any, Literal, Self
 
 import numpy as np
@@ -743,16 +743,53 @@ class CalculationTolerance(TransformParams):
 
     NOTE: It may make sense to consolidate with :class:`ReconcileTableCalculations`
     once the refactoring of the subtotals and the calculation corrections is done.
+
+    NOTE: atol is currently one part in 100 million, could it be much larger? Like
+    one part in 1000? If numbers are off by 0.1 cents, do we care?
+
+    NOTE: There should probably be some absolute magnitude checks in here. If we have an
+    error of $10 billion but it's less than 1% of the value in a table we probably still
+    want to know about it!
     """
 
-    isclose_rtol: confloat(ge=0.0, le=1e-2) = 1e-5
+    isclose_rtol: confloat(ge=0.0) = 1e-5
     """Relative tolerance to use in :func:`np.isclose` for determining equality."""
 
-    isclose_atol: confloat(ge=0.0) = 1e-8
-    """Absolute tolerance to use in :func:`np.isclose` for determining equality."""
+    isclose_atol: confloat(ge=0.0, le=0.01) = 1e-8
+    """Absolute tolerance to use in :func:`np.isclose` for determining equality.
+
+    Since we are comparing financial values, and we want them to be equal only insofar
+    as they would be financially indistinguishable, they should never differ by more
+    than a penny, hence the upper bound of 0.01.
+ """ + + bulk_error_frequency: confloat(ge=0.0, le=1.0) = 0.05 + """Fraction of all calculations that are allowed to not match exactly.""" + + bulk_error_relative_magnitude: confloat(ge=0.0) = 0.01 + """Maximum allowed sum of all errors, relative to the sum of all reported values.""" + + bulk_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.05 + """Fraction of records with non-null reported values and null calculated values.""" + + bulk_null_reported_value_frequency: confloat(ge=0.0, le=1.0) = 0.50 + """Fraction of records with non-null reported values and null calculated values.""" - bulk_error_rate: confloat(ge=0.0, le=1.0) = 0.05 - """Fraction of interatble calculations that are allowed to not match exactly.""" + utility_id_ferc1_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 + utility_id_ferc1_error_relative_magnitude: confloat(ge=0.0) = 0.001 + utility_id_ferc1_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 + + report_year_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 + report_year_error_relative_magnitude: confloat(ge=0.0) = 0.001 + report_year_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 + + xbrl_factoid_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 + xbrl_factoid_error_relative_magnitude: confloat(ge=0.0) = 0.001 + xbrl_factoid_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 + + table_name_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 + table_name_error_relative_magnitude: confloat(ge=0.0) = 0.001 + table_name_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 class ReconcileTableCalculations(TransformParams): @@ -878,11 +915,13 @@ def reconcile_table_calculations( calculation_tolerance=params.calculation_tolerance, table_name=table_name, ) - calculated_df = add_corrections( - calculated_df=calculated_df, - value_col=params.column_to_check, - table_name=table_name, - ).rename(columns={"xbrl_factoid": xbrl_factoid_name}) + # calculated_df = add_corrections( + # calculated_df=calculated_df, + # value_col=params.column_to_check, + # calculation_tolerance=params.calculation_tolerance, + # table_name=table_name, + # ) + # calculated_df = calculated_df.rename(columns={"xbrl_factoid": xbrl_factoid_name}) # Check that sub-total calculations sum to total. if params.subtotal_column is not None: @@ -936,8 +975,8 @@ def calculate_values_from_components( data: exploded FERC data to apply the calculations to. Primary key should be ``report_year``, ``utility_id_ferc1``, ``table_name``, ``xbrl_factoid``, and whatever additional dimensions are relevant to the data. - calc_idx: primary key columns that uniquely identify a calculation component (not - including the ``_parent`` columns). + calc_idx: primary key columns that uniquely identify a calculation component + (not including the ``_parent`` columns). value_col: label of the column in ``data`` that contains the values to apply the calculations to (typically ``dollar_value`` or ``ending_balance``). """ @@ -1013,6 +1052,15 @@ def check_calculation_metrics( np.nan, ), ) + calculated_df["is_error"] = ( + ~np.isclose( + calculated_df["calculated_amount"], + calculated_df[value_col], + rtol=calculation_tolerance.isclose_rtol, + atol=calculation_tolerance.isclose_atol, + ) + & calculated_df["abs_diff"].notnull() + ) # DO ERROR CHECKS # off_df is pretty specific to the one check that we're doing now, but is also @@ -1024,10 +1072,8 @@ def check_calculation_metrics( # some tolerance (defined here by the np.isclose defaults). 
Ignore records for # which the absolute difference "abs_dfiff" is NA since that indicates one or the # other of the calculated or reported values was NA. - all_errors = calculated_df[ - ~np.isclose(calculated_df.calculated_amount, calculated_df[value_col]) - & (calculated_df["abs_diff"].notnull()) - ] + all_errors = calculated_df[calculated_df["is_error"]] + # Why does it make sense to compare against records where abs_diff is non-null? # Why wouldn't we want to look at records where calculated_amount is non-null? non_null_calculated_df = calculated_df.dropna(subset="abs_diff") @@ -1037,14 +1083,14 @@ def check_calculation_metrics( logger.warning( "Calculated values have no corresponding reported values in this table." ) - bulk_error_rate = np.nan + bulk_error_frequency = np.nan else: - bulk_error_rate = len(all_errors) / len(non_null_calculated_df) + bulk_error_frequency = len(all_errors) / len(non_null_calculated_df) - if bulk_error_rate > calculation_tolerance.bulk_error_rate: + if bulk_error_frequency > calculation_tolerance.bulk_error_frequency: raise AssertionError( - f"Bulk error rate of calculations in {table_name} is {bulk_error_rate:.2%}. " - f"Accepted tolerance is {calculation_tolerance.bulk_error_rate:.1%}." + f"Bulk error rate of calculations in {table_name} is {bulk_error_frequency:.2%}. " + f"Accepted tolerance is {calculation_tolerance.bulk_error_frequency:.1%}." ) # Given a particular set of groupby() columns, calculate the fraction of records in @@ -1053,9 +1099,131 @@ def check_calculation_metrics( return calculated_df +######################################################################################## +# Calculation Error Checking Functions +# - These functions all take a dataframe and return a float. +# - They are intended to be used in GroupBy.apply() (or on a whole dataframe). +# - They require a uniform `reported_value` column so that they can all have the same +# call signature, which allows us to iterate over all of them in a matrix. +######################################################################################## +def _is_valid_error_df(df: pd.DataFrame) -> bool: + """Helper function that verifies a dataframe is ready for error checking.""" + required_cols = [ + "is_error", + "reported_value", + "calculated_amount", + "abs_diff", + ] + return all(col in df.columns for col in required_cols) + + +def error_frequency(df: pd.DataFrame) -> float: + """Calculate the frequency with which records are tagged as errors.""" + try: + return df[df.is_error].shape[0] / df.shape[0] + except ZeroDivisionError: + # Will only occur if all reported values are NaN when calculated values + # exist, or vice versa. + logger.warning( + "Calculated values have no corresponding reported values in this table." + ) + return np.nan + + +def error_relative_magnitude(df: pd.DataFrame) -> float: + """Calculate the mangnitude of the errors relative to the reported value.""" + try: + # Should we be taking the absolute value of the reported column? 
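+        # NOTE: both sums below are floats, and float division by zero yields
+        # inf or NaN with a warning rather than raising, so in practice this
+        # except clause may never fire.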
+ return ( + df[df.is_error].abs_diff.sum() + / df[df.is_error]["reported_value"].abs().sum() + ) + except ZeroDivisionError: + return np.nan + + +def error_absolute_magnitude(df: pd.DataFrame) -> float: + """Calculate the absolute magnitude of the errors in aggregate.""" + return df.abs_diff.sum() + + +def null_calculation_frequency(df: pd.DataFrame) -> float: + """Frequency with which calculated values are null when reported values are not.""" + non_null_reported = df["reported_value"].notnull() + logger.info(f"{non_null_reported.sum()=}") + null_calculated = df["calculated_amount"].isnull() + logger.info(f"{null_calculated.sum()=}") + return (non_null_reported & null_calculated).sum() / non_null_reported.sum() + + +def null_reported_value_frequency(df: pd.DataFrame) -> float: + """Frequency with which the reported values are Null.""" + return df["reported_value"].isnull().sum() / df.shape[0] + + +def aggregate_errors( + df: pd.DataFrame, gb_col: str | None, metric: Callable[[pd.DataFrame], float] +) -> pd.Series: + """Calculate aggregate error metrics for a dataframe or record groups within it. + + Args: + df: The dataframe to calculate the error metric from. + gb_col: The column to group by, if calculating errors by group, e.g. + ``report_year``. If None, calculate the error metric for full dataframe. + metric: The function that calculates the error metric. Takes a dataframe and + returns a scalar (float). + + Returns: + A Series. If gb_col is not None and it is not in the columns of the input + dataframe, an empty series is returned. + """ + if gb_col is None: + return pd.Series(metric(df)) + if gb_col in df.columns: + return df.groupby(gb_col).apply(metric) + return pd.Series() + + +def aggregate_error_matrix( + df: pd.DataFrame, + metrics: list[Callable[[pd.DataFrame], float]], + gb_cols: list[str | None], +) -> dict[str, dict[str, pd.Series]]: + """Calculate a suite of error metrics on various data groupings.""" + assert _is_valid_error_df(df) + if metrics is None: + metrics = [ + error_frequency, + error_relative_magnitude, + error_absolute_magnitude, + null_calculation_frequency, + null_reported_value_frequency, + # Still need to write these + # duplicate_value_frequency, + # off_by_one_dollar_frequency, + ] + if gb_cols is None: + gb_cols = [ + None, + "report_year", + "utility_id_ferc1", + "table_name", + "xbrl_factoid", + ] + agg_errors = {} + for metric in metrics: + agg_errors[metric.__name__] = {} + logger.info(f"Calculating {metric.__name__} by:") + for gb_col in gb_cols: + logger.info(f" {gb_col}") + agg_errors[metric.__name__][gb_col] = aggregate_errors(df, gb_col, metric) + return agg_errors + + def add_corrections( calculated_df: pd.DataFrame, value_col: str, + calculation_tolerance: CalculationTolerance, table_name: str, ) -> pd.DataFrame: """Add corrections to discrepancies between reported & calculated values. @@ -1066,31 +1234,41 @@ def add_corrections( ``_correction`` factoids that are added here have already been added to the calculation components during the metadata processing. - TODO: Pass in :class:`CalculationTolerance` and use rtol & atol in np.isclose(). - Args: calculated_df: DataFrame containing the data to correct. Must already have ``abs_diff`` column that was added by :func:`check_calculation_metrics` value_col: Label of the column whose values are being calculated. + calculation_tolerance: Data structure containing various calculation tolerances. table_name: Name of the table whose data we are working with. For logging. 
""" corrections = calculated_df[ - ~np.isclose(calculated_df["calculated_amount"], calculated_df[value_col]) + ~np.isclose( + calculated_df["calculated_amount"], + calculated_df[value_col], + rtol=calculation_tolerance.isclose_rtol, + atol=calculation_tolerance.isclose_atol, + ) & (calculated_df["abs_diff"].notnull()) ] - logger.info(f"{len(corrections)=}") + corrections[value_col] = ( + corrections[value_col].fillna(0.0) - corrections["calculated_amount"] + ) corrections = corrections.assign( - value_col=lambda x: x[value_col].fillna(0.0) - x["calculated_amount"], - original_factoid=lambda x: x["xbrl_factoid"], + xbrl_factoid_corrected=lambda x: x["xbrl_factoid"], xbrl_factoid=lambda x: x["xbrl_factoid"] + "_correction", row_type_xbrl="correction", is_within_table_calc=False, record_id=pd.NA, ) + num_notnull_calcs = sum(calculated_df["abs_diff"].notnull()) + num_corrections = corrections.shape[0] + num_records = calculated_df.shape[0] + corrected_fraction = num_corrections / num_notnull_calcs logger.info( - f"{table_name}: adding {len(corrections)} corrections to " - f"{len(calculated_df)} original records." + f"{table_name}: Correcting {corrected_fraction:.2%} of all non-null reported " + f"values ({num_corrections}/{num_notnull_calcs}) out of a total of " + f"{num_records} original records." ) return pd.concat([calculated_df, corrections], axis="index").reset_index() diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py index a389878898..d454d2c518 100644 --- a/src/pudl/transform/params/ferc1.py +++ b/src/pudl/transform/params/ferc1.py @@ -2728,7 +2728,7 @@ "reconcile_table_calculations": { "column_to_check": "ending_balance", "calculation_tolerance": { - "bulk_error_rate": 0.08, + "bulk_error_frequency": 0.08, }, }, }, @@ -3718,7 +3718,7 @@ "reconcile_table_calculations": { "column_to_check": "ending_balance", "calculation_tolerance": { - "bulk_error_rate": 0.08, + "bulk_error_frequency": 0.08, }, }, }, @@ -3987,7 +3987,7 @@ # Note: this table does not currently get exploded. It will require # additional debugging at a later date. "calculation_tolerance": { - "bulk_error_rate": 0.4, + "bulk_error_frequency": 0.4, }, }, }, From fd40b83d24f4de8c986624fc44ab8a1ad3df7092 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Fri, 6 Oct 2023 11:03:51 -0600 Subject: [PATCH 03/37] Fix relative error magnitude calculation & default aggregate_error_matrix args. --- src/pudl/transform/ferc1.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 624fe6eb8d..33440329f7 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -1131,13 +1131,10 @@ def error_frequency(df: pd.DataFrame) -> float: def error_relative_magnitude(df: pd.DataFrame) -> float: - """Calculate the mangnitude of the errors relative to the reported value.""" + """Calculate the mangnitude of the errors relative to total reported value.""" try: # Should we be taking the absolute value of the reported column? 
- return ( - df[df.is_error].abs_diff.sum() - / df[df.is_error]["reported_value"].abs().sum() - ) + return df[df.is_error].abs_diff.sum() / df["reported_value"].abs().sum() except ZeroDivisionError: return np.nan @@ -1150,9 +1147,7 @@ def error_absolute_magnitude(df: pd.DataFrame) -> float: def null_calculation_frequency(df: pd.DataFrame) -> float: """Frequency with which calculated values are null when reported values are not.""" non_null_reported = df["reported_value"].notnull() - logger.info(f"{non_null_reported.sum()=}") null_calculated = df["calculated_amount"].isnull() - logger.info(f"{null_calculated.sum()=}") return (non_null_reported & null_calculated).sum() / non_null_reported.sum() @@ -1162,13 +1157,13 @@ def null_reported_value_frequency(df: pd.DataFrame) -> float: def aggregate_errors( - df: pd.DataFrame, gb_col: str | None, metric: Callable[[pd.DataFrame], float] + df: pd.DataFrame, by: str | None, metric: Callable[[pd.DataFrame], float] ) -> pd.Series: """Calculate aggregate error metrics for a dataframe or record groups within it. Args: df: The dataframe to calculate the error metric from. - gb_col: The column to group by, if calculating errors by group, e.g. + by: The column to group by, if calculating errors by group, e.g. ``report_year``. If None, calculate the error metric for full dataframe. metric: The function that calculates the error metric. Takes a dataframe and returns a scalar (float). @@ -1177,17 +1172,17 @@ def aggregate_errors( A Series. If gb_col is not None and it is not in the columns of the input dataframe, an empty series is returned. """ - if gb_col is None: + if by is None: return pd.Series(metric(df)) - if gb_col in df.columns: - return df.groupby(gb_col).apply(metric) + if by in df.columns: + return df.groupby(by=by).apply(metric) return pd.Series() def aggregate_error_matrix( df: pd.DataFrame, - metrics: list[Callable[[pd.DataFrame], float]], - gb_cols: list[str | None], + metrics: list[Callable[[pd.DataFrame], float]] | None = None, + by_cols: list[str | None] | None = None, ) -> dict[str, dict[str, pd.Series]]: """Calculate a suite of error metrics on various data groupings.""" assert _is_valid_error_df(df) @@ -1202,8 +1197,8 @@ def aggregate_error_matrix( # duplicate_value_frequency, # off_by_one_dollar_frequency, ] - if gb_cols is None: - gb_cols = [ + if by_cols is None: + by_cols = [ None, "report_year", "utility_id_ferc1", @@ -1214,9 +1209,9 @@ def aggregate_error_matrix( for metric in metrics: agg_errors[metric.__name__] = {} logger.info(f"Calculating {metric.__name__} by:") - for gb_col in gb_cols: - logger.info(f" {gb_col}") - agg_errors[metric.__name__][gb_col] = aggregate_errors(df, gb_col, metric) + for by in by_cols: + logger.info(f" {by}") + agg_errors[metric.__name__][by] = aggregate_errors(df, by, metric) return agg_errors From ac93f1e58d2fda4f19250e287eaef9a883d57140 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Fri, 6 Oct 2023 12:35:34 -0600 Subject: [PATCH 04/37] Uncomment stuff in reconcile_table_calculations() --- src/pudl/transform/ferc1.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 33440329f7..ebf5e3967f 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -915,13 +915,13 @@ def reconcile_table_calculations( calculation_tolerance=params.calculation_tolerance, table_name=table_name, ) - # calculated_df = add_corrections( - # calculated_df=calculated_df, - # value_col=params.column_to_check, - # 
calculation_tolerance=params.calculation_tolerance, - # table_name=table_name, - # ) - # calculated_df = calculated_df.rename(columns={"xbrl_factoid": xbrl_factoid_name}) + calculated_df = add_corrections( + calculated_df=calculated_df, + value_col=params.column_to_check, + calculation_tolerance=params.calculation_tolerance, + table_name=table_name, + ) + calculated_df = calculated_df.rename(columns={"xbrl_factoid": xbrl_factoid_name}) # Check that sub-total calculations sum to total. if params.subtotal_column is not None: @@ -1061,6 +1061,8 @@ def check_calculation_metrics( ) & calculated_df["abs_diff"].notnull() ) + # Uniformity here helps keep the error checking functions simpler: + calculated_df["reported_value"] = calculated_df[value_col] # DO ERROR CHECKS # off_df is pretty specific to the one check that we're doing now, but is also From e1e62c7b88cd06a625e782676c2a8237d2161dcf Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 10 Oct 2023 11:49:21 -0600 Subject: [PATCH 05/37] Update intertable calc checks to use CalculationTolerance class. --- src/pudl/output/ferc1.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 2a03acf401..23c7d3f43d 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -1348,7 +1348,7 @@ def boom(self: Self, tables_to_explode: dict[str, pd.DataFrame]) -> pd.DataFrame Args: tables_to_explode: dictionary of table name (key) to transfomed table (value). calculation_tolerance: What proportion (0-1) of calculated values are - allowed to be incorrect without raising an AssertionError. + allowed to be incorrect without raising an AssertionError. """ exploded = ( self.initial_explosion_concatenation(tables_to_explode) @@ -1483,9 +1483,8 @@ def reconcile_intertable_calculations( subtotal_calcs = pudl.transform.ferc1.check_calculation_metrics( calculated_df=subtotal_calcs, value_col=self.value_col, - calculation_tolerance=self.calculation_tolerance.intertable_calculation_errors, + calculation_tolerance=self.calculation_tolerance, table_name=self.root_table, - add_corrections=True, ) return calculated_df From bd4b90249859fbd30e2efa0098affc031c309fcf Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 10 Oct 2023 15:41:10 -0600 Subject: [PATCH 06/37] Remove cruft and break up reconcile_table_calculations() --- src/pudl/transform/ferc1.py | 274 +++++++++++++++++++++--------------- 1 file changed, 164 insertions(+), 110 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 220d21dc08..48b0adb3c0 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -766,31 +766,30 @@ class CalculationTolerance(TransformParams): bulk_error_frequency: confloat(ge=0.0, le=1.0) = 0.05 """Fraction of all calculations that are allowed to not match exactly.""" + utility_id_ferc1_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 + report_year_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 + xbrl_factoid_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 + table_name_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 + bulk_error_relative_magnitude: confloat(ge=0.0) = 0.01 """Maximum allowed sum of all errors, relative to the sum of all reported values.""" - bulk_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.05 - """Fraction of records with non-null reported values and null calculated values.""" + utility_id_ferc1_error_relative_magnitude: confloat(ge=0.0) = 0.001 + report_year_error_relative_magnitude: confloat(ge=0.0) = 0.001 + 
xbrl_factoid_error_relative_magnitude: confloat(ge=0.0) = 0.001 + table_name_error_relative_magnitude: confloat(ge=0.0) = 0.001 - bulk_null_reported_value_frequency: confloat(ge=0.0, le=1.0) = 0.50 + bulk_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.05 """Fraction of records with non-null reported values and null calculated values.""" - utility_id_ferc1_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 - utility_id_ferc1_error_relative_magnitude: confloat(ge=0.0) = 0.001 utility_id_ferc1_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - - report_year_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 - report_year_error_relative_magnitude: confloat(ge=0.0) = 0.001 report_year_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - - xbrl_factoid_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 - xbrl_factoid_error_relative_magnitude: confloat(ge=0.0) = 0.001 xbrl_factoid_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - - table_name_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 - table_name_error_relative_magnitude: confloat(ge=0.0) = 0.001 table_name_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 + bulk_null_reported_value_frequency: confloat(ge=0.0, le=1.0) = 0.50 + """Fraction of records with non-null reported values and null calculated values.""" + class ReconcileTableCalculations(TransformParams): """Parameters for reconciling xbrl-metadata based calculations within a table.""" @@ -837,130 +836,185 @@ def reconcile_table_calculations( Args: df: processed table containing data values to check. calculation_components: processed calculation component metadata. - xbrl_factoid_name: column name of the XBRL factoid in the processed table. - table_name: name of the PUDL table. + xbrl_metadata: A dataframe of fact-level metadata, required for inferring the + sub-dimension total calculations. + xbrl_factoid_name: The name of the column which contains XBRL factoid values in + the processed table. + table_name: name of the PUDL table whose data and metadata is being processed. + This is necessary so we can ensure the metadata has the same structure as + the calculation components, which at a minimum need both ``table_name`` and + ``xbrl_factoid`` to identify them. params: :class:`ReconcileTableCalculations` parameters. + + Returns: + A dataframe that includes new *_correction records with values that ensure the + calculations all match to within the required tolerance. It will also contain + columns created by the calculation checking process like ``abs_diff`` and + ``rel_diff``. """ # If we don't have this value, we aren't doing any calculation checking: if params.column_to_check is None or calculation_components.empty: return df - # we only want to check calucations that are fully within this table - intra_tbl_calcs = calculation_components[ + + # Use the calculation components which reference ONLY values within the table + intra_table_calcs = calculation_components[ calculation_components.is_within_table_calc - & calculation_components.xbrl_factoid.notnull() # no nulls bc we have all parents ] + # To interact with the calculation components, we need uniformly named columns + # for xbrl_factoid, and table_name df = df.rename(columns={xbrl_factoid_name: "xbrl_factoid"}).assign( table_name=table_name ) - # !!! Add dimensions into the calculation components!!! - # First determine what dimensions matter in this table: - # usually you can rely on params.subtotal_column to get THE ONE dimension in the - # table... 
BUT some tables have more than one dimension so we grab from all of the - # the dims in the transformers. AAAND occasionally the factoid_name is in the dims - # wild. i know. so we are grabbing all of the non-factoid dimensions that show up - # in the data. dim_cols = [ - d - for d in other_dimensions(table_names=list(FERC1_TFR_CLASSES)) - if d in df.columns and d != xbrl_factoid_name + dim + for dim in other_dimensions(table_names=list(FERC1_TFR_CLASSES)) + if dim in df.columns ] calc_idx = ["xbrl_factoid", "table_name"] + dim_cols if dim_cols: - table_dims = ( - df[calc_idx].drop_duplicates(keep="first").assign(table_name=table_name) - ) - # need to add in the correction dimensions. they don't show up in the data at - # this point so we don't have the dimensions yet. NOTE: this could have been - # done by adding the dims into table_dims..... maybe would have been more - # straightforward - correction_mask = intra_tbl_calcs.xbrl_factoid.str.contains("_correction") - intra_tbl_calcs = pd.concat( - [ - intra_tbl_calcs[~correction_mask], - pd.merge( - intra_tbl_calcs[correction_mask].drop(columns=dim_cols), - table_dims[["table_name"] + dim_cols].drop_duplicates(), - on=["table_name"], - ), - ] + table_dims = df[calc_idx].drop_duplicates(keep="first") + intra_table_calcs = _add_intra_table_calculation_dimensions( + intra_table_calcs=intra_table_calcs, + table_dims=table_dims, + dim_cols=dim_cols, + ) + # Check the subdimension totals, but don't add correction records for these + # intra-fact calculations: + _check_subtotal_calculations( + df=df, + intra_table_calcs=intra_table_calcs, + xbrl_metadata=xbrl_metadata, + params=params, + dim_cols=dim_cols, + table_name=table_name, + table_dims=table_dims, + calc_idx=calc_idx, ) - intra_tbl_calcs = make_calculation_dimensions_explicit( - intra_tbl_calcs, - table_dimensions_ferc1=table_dims, - dimensions=dim_cols, - ).pipe( - assign_parent_dimensions, - table_dimensions=table_dims, - dimensions=dim_cols, - ) - # this is for the income statement table specifically, but is general: - # remove all the bits where we have a child dim but not a parent dim - # sometimes there are child dimensions that have utility_type == "other2" etc - # where the parent dimension has nothing - for dim in dim_cols: - intra_tbl_calcs = intra_tbl_calcs[ - ~( - intra_tbl_calcs[dim].notnull() - & intra_tbl_calcs[f"{dim}_parent"].isnull() - ) - ] - calculated_df = calculate_values_from_components( - data=df, - calculation_components=intra_tbl_calcs, - calc_idx=calc_idx, - value_col=params.column_to_check, - ) - calculated_df = check_calculation_metrics( - calculated_df=calculated_df, - value_col=params.column_to_check, - calculation_tolerance=params.calculation_tolerance, - table_name=table_name, - ) - calculated_df = add_corrections( - calculated_df=calculated_df, - value_col=params.column_to_check, - calculation_tolerance=params.calculation_tolerance, - table_name=table_name, - ) - calculated_df = calculated_df.rename(columns={"xbrl_factoid": xbrl_factoid_name}) - # Check that sub-total calculations sum to total. 
- if params.subtotal_column is not None: - logger.info( - f"Checking total-to-subtotal calculations within {params.subtotal_column}" - ) - meta_w_dims = xbrl_metadata.assign( - **{dim: pd.NA for dim in dim_cols} | {"table_name": table_name} - ).pipe( - make_calculation_dimensions_explicit, - table_dimensions_ferc1=table_dims, - dimensions=dim_cols, - ) - calc_comps_w_totals = infer_intra_factoid_totals( - intra_tbl_calcs, - meta_w_dims=meta_w_dims, - table_dimensions=table_dims, - dimensions=dim_cols, - ) - subtotal_calcs = calculate_values_from_components( + calculated_df = ( + calculate_values_from_components( data=df, - calculation_components=calc_comps_w_totals[ - calc_comps_w_totals.is_total_to_subdimensions_calc - ], + calculation_components=intra_table_calcs, calc_idx=calc_idx, value_col=params.column_to_check, ) - subtotal_calcs = check_calculation_metrics( - calculated_df=subtotal_calcs, + .pipe( + check_calculation_metrics, + value_col=params.column_to_check, + calculation_tolerance=params.calculation_tolerance, + table_name=table_name, + ) + .pipe( + add_corrections, value_col=params.column_to_check, calculation_tolerance=params.calculation_tolerance, table_name=table_name, - ).rename(columns={"xbrl_factoid": xbrl_factoid_name}) + ) + # Rename back to the original xbrl_factoid column name before returning: + .rename(columns={"xbrl_factoid": xbrl_factoid_name}) + ) return calculated_df +def _check_subtotal_calculations( + df: pd.DataFrame, + intra_table_calcs: pd.DataFrame, + xbrl_metadata: pd.DataFrame, + params: "Ferc1TableTransformParams", + dim_cols: list[str], + table_name: str, + table_dims: pd.DataFrame, + calc_idx: list[str], +) -> None: + """Check that sub-dimension calculations sum to the reported totals. + + No correction records are added to the sub-dimensions calculations. This is only an + error check, and returns nothing. + """ + if params.subtotal_column is None: + return + logger.info(f"Checking total-to-subtotal calculations in {params.subtotal_column}") + meta_w_dims = xbrl_metadata.assign( + **{dim: pd.NA for dim in dim_cols} | {"table_name": table_name} + ).pipe( + make_calculation_dimensions_explicit, + table_dimensions_ferc1=table_dims, + dimensions=dim_cols, + ) + calc_comps_w_totals = infer_intra_factoid_totals( + intra_table_calcs, + meta_w_dims=meta_w_dims, + table_dimensions=table_dims, + dimensions=dim_cols, + ) + subtotal_calcs = calculate_values_from_components( + data=df, + calculation_components=calc_comps_w_totals[ + calc_comps_w_totals.is_total_to_subdimensions_calc + ], + calc_idx=calc_idx, + value_col=params.column_to_check, + ) + subtotal_calcs = check_calculation_metrics( + calculated_df=subtotal_calcs, + value_col=params.column_to_check, + calculation_tolerance=params.calculation_tolerance, + table_name=table_name, + ) + + +def _add_intra_table_calculation_dimensions( + intra_table_calcs: pd.DataFrame, + table_dims: pd.DataFrame, + dim_cols: list[str], +) -> pd.DataFrame: + """Add all observed subdimensions into the calculation components.""" + ######## Add all observed subdimensions into the calculation components!!! + # First determine what dimensions matter in this table: + # - usually params.subtotal_column has THE ONE dimension in the table... + # - BUT some tables have more than one dimension so we grab from all of the + # the dims in the transformers. + + # need to add in the correction dimensions. they don't show up in the data at + # this point so we don't have the dimensions yet. 
NOTE: this could have been + # done by adding the dims into table_dims..... maybe would have been more + # straightforward + correction_mask = intra_table_calcs.xbrl_factoid.str.endswith("_correction") + intra_table_calcs = pd.concat( + [ + intra_table_calcs[~correction_mask], + pd.merge( + intra_table_calcs[correction_mask].drop(columns=dim_cols), + table_dims[["table_name"] + dim_cols].drop_duplicates(), + on=["table_name"], + ), + ] + ) + intra_table_calcs = make_calculation_dimensions_explicit( + intra_table_calcs, + table_dimensions_ferc1=table_dims, + dimensions=dim_cols, + ).pipe( + assign_parent_dimensions, + table_dimensions=table_dims, + dimensions=dim_cols, + ) + # this is for the income statement table specifically, but is general: + # remove all the bits where we have a child dim but not a parent dim + # sometimes there are child dimensions that have utility_type == "other2" etc + # where the parent dimension has nothing + for dim in dim_cols: + intra_table_calcs = intra_table_calcs[ + ~( + intra_table_calcs[dim].notnull() + & intra_table_calcs[f"{dim}_parent"].isnull() + ) + ] + return intra_table_calcs + + def calculate_values_from_components( calculation_components: pd.DataFrame, data: pd.DataFrame, From ef52b3fb93137f8fb31877a719925eda893208ad Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 10 Oct 2023 21:07:42 -0600 Subject: [PATCH 07/37] Rename calculated_amount to calculated_value for consistency --- src/pudl/output/ferc1.py | 2 +- src/pudl/transform/ferc1.py | 30 +++++++++++++++--------------- test/unit/transform/ferc1_test.py | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 23c7d3f43d..7a2a7ab145 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -1435,7 +1435,7 @@ def reconcile_intertable_calculations( components originate entirely or partially outside of the table. It also accounts for components that only sum to a factoid within a particular dimension (e.g., for an electric utility or for plants whose plant_function is - "in_service"). This returns a dataframe with a "calculated_amount" column. + "in_service"). This returns a dataframe with a "calculated_value" column. Args: exploded: concatenated tables for table explosion. diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 48b0adb3c0..33ba876571 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -1061,8 +1061,8 @@ def calculate_values_from_components( on=calc_idx, ) # apply the weight from the calc to convey the sign before summing. - .assign(calculated_amount=lambda x: x[value_col] * x.weight) - .groupby(gby_parent, as_index=False, dropna=False)[["calculated_amount"]] + .assign(calculated_value=lambda x: x[value_col] * x.weight) + .groupby(gby_parent, as_index=False, dropna=False)[["calculated_value"]] .sum(min_count=1) ) # remove the _parent suffix so we can merge these calculated values back onto @@ -1095,13 +1095,13 @@ def check_calculation_metrics( ) -> pd.DataFrame: """Run the calculation metrics and determine if calculations are within tolerance.""" # Data types were very messy here, including pandas Float64 for the - # calculated_amount columns which did not work with the np.isclose(). Not sure + # calculated_value columns which did not work with the np.isclose(). Not sure # why these are cropping up. 
calculated_df = calculated_df.convert_dtypes(convert_floating=False).astype(
-        {value_col: "float64", "calculated_amount": "float64"}
+        {value_col: "float64", "calculated_value": "float64"}
     )
     calculated_df = calculated_df.assign(
-        abs_diff=lambda x: abs(x[value_col] - x.calculated_amount),
+        abs_diff=lambda x: abs(x[value_col] - x.calculated_value),
         rel_diff=lambda x: np.where(
             (x[value_col] != 0.0),
             abs(x.abs_diff / x[value_col]),
@@ -1110,7 +1110,7 @@ def check_calculation_metrics(
     )
     calculated_df["is_error"] = (
         ~np.isclose(
-            calculated_df["calculated_amount"],
+            calculated_df["calculated_value"],
             calculated_df[value_col],
             rtol=calculation_tolerance.isclose_rtol,
             atol=calculation_tolerance.isclose_atol,
@@ -1133,7 +1133,7 @@ def check_calculation_metrics(
     all_errors = calculated_df[calculated_df["is_error"]]
 
     # Why does it make sense to compare against records where abs_diff is non-null?
-    # Why wouldn't we want to look at records where calculated_amount is non-null?
+    # Why wouldn't we want to look at records where calculated_value is non-null?
     non_null_calculated_df = calculated_df.dropna(subset="abs_diff")
     if non_null_calculated_df.empty:
         # Will only occur if all reported values are NaN when calculated values
@@ -1169,7 +1169,7 @@ def _is_valid_error_df(df: pd.DataFrame) -> bool:
     required_cols = [
         "is_error",
         "reported_value",
-        "calculated_amount",
+        "calculated_value",
         "abs_diff",
     ]
     return all(col in df.columns for col in required_cols)
@@ -1188,7 +1188,7 @@ def error_frequency(df: pd.DataFrame) -> float:
         return np.nan
 
 
-def error_relative_magnitude(df: pd.DataFrame) -> float:
+def relative_error_magnitude(df: pd.DataFrame) -> float:
     """Calculate the magnitude of the errors relative to total reported value."""
     try:
         # Should we be taking the absolute value of the reported column?
@@ -1197,7 +1197,7 @@ def error_relative_magnitude(df: pd.DataFrame) -> float: return np.nan -def error_absolute_magnitude(df: pd.DataFrame) -> float: +def absolute_error_magnitude(df: pd.DataFrame) -> float: """Calculate the absolute magnitude of the errors in aggregate.""" return df.abs_diff.sum() @@ -1205,7 +1205,7 @@ def error_absolute_magnitude(df: pd.DataFrame) -> float: def null_calculation_frequency(df: pd.DataFrame) -> float: """Frequency with which calculated values are null when reported values are not.""" non_null_reported = df["reported_value"].notnull() - null_calculated = df["calculated_amount"].isnull() + null_calculated = df["calculated_value"].isnull() return (non_null_reported & null_calculated).sum() / non_null_reported.sum() @@ -1247,8 +1247,8 @@ def aggregate_error_matrix( if metrics is None: metrics = [ error_frequency, - error_relative_magnitude, - error_absolute_magnitude, + relative_error_magnitude, + absolute_error_magnitude, null_calculation_frequency, null_reported_value_frequency, # Still need to write these @@ -1296,7 +1296,7 @@ def add_corrections( """ corrections = calculated_df[ ~np.isclose( - calculated_df["calculated_amount"], + calculated_df["calculated_value"], calculated_df[value_col], rtol=calculation_tolerance.isclose_rtol, atol=calculation_tolerance.isclose_atol, @@ -1305,7 +1305,7 @@ def add_corrections( ] corrections[value_col] = ( - corrections[value_col].fillna(0.0) - corrections["calculated_amount"] + corrections[value_col].fillna(0.0) - corrections["calculated_value"] ) corrections = corrections.assign( xbrl_factoid_corrected=lambda x: x["xbrl_factoid"], diff --git a/test/unit/transform/ferc1_test.py b/test/unit/transform/ferc1_test.py index a942c436c3..01713177d8 100644 --- a/test/unit/transform/ferc1_test.py +++ b/test/unit/transform/ferc1_test.py @@ -436,7 +436,7 @@ def test_calculate_values_from_components(): expected_ksr = pd.read_csv( StringIO( f""" -table_name,xbrl_factoid,planet,value,utility_id_ferc1,report_year,calculated_amount +table_name,xbrl_factoid,planet,value,utility_id_ferc1,report_year,calculated_value books,lil_fact_x,venus,10.0,44,2312, books,lil_fact_z,venus,11.0,44,2312, books,lil_fact_y,venus,12.0,44,2312, From c4db47e02a35a1a7174dd0551f1db386eb98f10a Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 10 Oct 2023 21:18:33 -0600 Subject: [PATCH 08/37] Fix deprecated dtype checking method --- src/pudl/metadata/classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index ea89b5ce04..e4915988c5 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -1466,7 +1466,7 @@ def format_df(self, df: pd.DataFrame | None = None, **kwargs: Any) -> pd.DataFra and pd.api.types.is_integer_dtype(df[field.name]) ): df[field.name] = pd.to_datetime(df[field.name], format="%Y") - if pd.api.types.is_categorical_dtype(dtypes[field.name]): + if isinstance(dtypes[field.name], pd.CategoricalDtype): uncategorized = [ value for value in df[field.name].dropna().unique() From 298e5fb973b70d37d8da61ad32ee798d519e5b38 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 10 Oct 2023 21:19:37 -0600 Subject: [PATCH 09/37] Catch ZeroDivisionError exception in add_corrections() --- src/pudl/transform/ferc1.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 33ba876571..d85aeda509 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -1317,7 
+1317,10 @@ def add_corrections( num_notnull_calcs = sum(calculated_df["abs_diff"].notnull()) num_corrections = corrections.shape[0] num_records = calculated_df.shape[0] - corrected_fraction = num_corrections / num_notnull_calcs + try: + corrected_fraction = num_corrections / num_notnull_calcs + except ZeroDivisionError: + corrected_fraction = np.nan logger.info( f"{table_name}: Correcting {corrected_fraction:.2%} of all non-null reported " f"values ({num_corrections}/{num_notnull_calcs}) out of a total of " From 48d24e98853140db85b04ac8daf262263409cb7e Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 16 Oct 2023 10:48:01 -0600 Subject: [PATCH 10/37] WIP: calculation checks draft. --- src/pudl/transform/ferc1.py | 339 +++++++++++++++++++++++------------- 1 file changed, 217 insertions(+), 122 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index d85aeda509..e20b83d56c 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -12,8 +12,9 @@ import itertools import json import re +from abc import abstractmethod from collections import namedtuple -from collections.abc import Callable, Mapping +from collections.abc import Mapping from typing import Any, Literal, Self import numpy as np @@ -898,21 +899,20 @@ def reconcile_table_calculations( calculation_components=intra_table_calcs, calc_idx=calc_idx, value_col=params.column_to_check, - ) - .pipe( + ).pipe( check_calculation_metrics, value_col=params.column_to_check, calculation_tolerance=params.calculation_tolerance, table_name=table_name, ) - .pipe( - add_corrections, - value_col=params.column_to_check, - calculation_tolerance=params.calculation_tolerance, - table_name=table_name, - ) + # .pipe( + # add_corrections, + # value_col=params.column_to_check, + # calculation_tolerance=params.calculation_tolerance, + # table_name=table_name, + # ) # Rename back to the original xbrl_factoid column name before returning: - .rename(columns={"xbrl_factoid": xbrl_factoid_name}) + # .rename(columns={"xbrl_factoid": xbrl_factoid_name}) ) return calculated_df @@ -1108,15 +1108,6 @@ def check_calculation_metrics( np.nan, ), ) - calculated_df["is_error"] = ( - ~np.isclose( - calculated_df["calculated_value"], - calculated_df[value_col], - rtol=calculation_tolerance.isclose_rtol, - atol=calculation_tolerance.isclose_atol, - ) - & calculated_df["abs_diff"].notnull() - ) # Uniformity here helps keep the error checking functions simpler: calculated_df["reported_value"] = calculated_df[value_col] @@ -1130,26 +1121,26 @@ def check_calculation_metrics( # some tolerance (defined here by the np.isclose defaults). Ignore records for # which the absolute difference "abs_dfiff" is NA since that indicates one or the # other of the calculated or reported values was NA. - all_errors = calculated_df[calculated_df["is_error"]] + # all_errors = calculated_df[calculated_df["is_error"]] # Why does it make sense to compare against records where abs_diff is non-null? # Why wouldn't we want to look at records where calculated_value is non-null? - non_null_calculated_df = calculated_df.dropna(subset="abs_diff") - if non_null_calculated_df.empty: - # Will only occur if all reported values are NaN when calculated values - # exist, or vice versa. - logger.warning( - "Calculated values have no corresponding reported values in this table." 
- ) - bulk_error_frequency = np.nan - else: - bulk_error_frequency = len(all_errors) / len(non_null_calculated_df) - - if bulk_error_frequency > calculation_tolerance.bulk_error_frequency: - raise AssertionError( - f"Bulk error rate of calculations in {table_name} is {bulk_error_frequency:.2%}. " - f"Accepted tolerance is {calculation_tolerance.bulk_error_frequency:.1%}." - ) + # non_null_calculated_df = calculated_df.dropna(subset="abs_diff") + # if non_null_calculated_df.empty: + # # Will only occur if all reported values are NaN when calculated values + # # exist, or vice versa. + # logger.warning( + # "Calculated values have no corresponding reported values in this table." + # ) + # bulk_error_frequency = np.nan + # else: + # bulk_error_frequency = len(all_errors) / len(non_null_calculated_df) + + # if bulk_error_frequency > calculation_tolerance.bulk_error_frequency: + # raise AssertionError( + # f"Bulk error rate of calculations in {table_name} is {bulk_error_frequency:.2%}. " + # f"Accepted tolerance is {calculation_tolerance.bulk_error_frequency:.1%}." + # ) # Given a particular set of groupby() columns, calculate the fraction of records in # each group whose calculations do not match reported values based on np.isclose() @@ -1164,113 +1155,217 @@ def check_calculation_metrics( # - They require a uniform `reported_value` column so that they can all have the same # call signature, which allows us to iterate over all of them in a matrix. ######################################################################################## -def _is_valid_error_df(df: pd.DataFrame) -> bool: - """Helper function that verifies a dataframe is ready for error checking.""" - required_cols = [ - "is_error", +class CalcTol(TransformParams): + """A simple class to contain an error tolerance specification.""" + + by: str | list[str] | None = None + name: str | None = None + tol: float + + @validator("name", always=True) + def name_by(v, values): + """Set the name to be the same as by if name is none and by is string.""" + if isinstance(v, str): + return v + + if v is None and isinstance(values["by"], str): + return values["by"] + + raise ValueError("Must give tolerance a name if by is a list.") + + +class CalcCheck(TransformParams): + """Per metric calculation checks.""" + + isclose_rtol: confloat(ge=0.0) = 1e-5 + """Relative tolerance to use in :func:`np.isclose` for determining equality.""" + + isclose_atol: confloat(ge=0.0, le=0.01) = 1e-8 + """Absolute tolerance to use in :func:`np.isclose` for determining equality. + + Since we are comparing financial values, and we want them to be equal only insofar + as they would be financially indistinguishable, they should never differ by more + than a penny, hence the upper bound of 0.01. + """ + + required_cols: list[str] = [ + "table_name", + "xbrl_factoid", + "report_year", + "utility_id_ferc1", "reported_value", "calculated_value", "abs_diff", + "rel_diff", ] - return all(col in df.columns for col in required_cols) + tols: list[CalcTol] = [] + """List of tolrances to check. -def error_frequency(df: pd.DataFrame) -> float: - """Calculate the frequency with which records are tagged as errors.""" - try: - return df[df.is_error].shape[0] / df.shape[0] - except ZeroDivisionError: - # Will only occur if all reported values are NaN when calculated values - # exist, or vice versa. - logger.warning( - "Calculated values have no corresponding reported values in this table." + Not totally satisfied here. 
It would be nice to be able to refer to the specific + tolerances by name, since they're pretty well defined now, and we know that there + are some testable properties relating them to each other. + + At the same time it's nice to have this be a generic list of whatever groupby + elements you want to check, since there could be special cases for different + tables or groups of tables and it would be nice to avoid hard coding that into + the classes. + """ + + # @validator("utility_id_ferc1", "report_year", "table_name", "xbrl_factoid") + # def grouped_tol_ge_ungrouped_tol(v, values): + # """Grouped tolerance should always be greater than or equal to ungrouped.""" + # if v is not None: + # assert v >= values["ungrouped"] + # return v + + def _is_valid_error_df(self: Self, df: pd.DataFrame): + """Check that the input dataframe has all required columns.""" + return all(col in df.columns for col in self.required_cols) + + @abstractmethod + def metric(self: Self, df: pd.DataFrame) -> float: + """The error metric to calculate for a dataframe or group.""" + ... + + def is_error(self: Self, df: pd.DataFrame) -> pd.Series: + """Flag those records that qualify as an error. + + NOTE: How should this categorization vary from metric to metric? Why is it + reasonable to only look at records where ``abs_diff`` is not Null? + """ + return pd.Series( + ~np.isclose( + df["calculated_value"], + df["reported_value"], + rtol=self.isclose_rtol, + atol=self.isclose_atol, + ) + & df["abs_diff"].notnull() ) - return np.nan + def report(self: Self, df: pd.DataFrame) -> dict[str, pd.Series]: + """Report the full distribution of errors by group.""" + assert self._is_valid_error_df(df) + df.loc[:, "is_error"] = self.is_error(df) + error_report = {} + for tol in self.tols: + if tol.by == "ungrouped": + errors = pd.Series(self.metric(df)) + else: + errors = df.groupby(by=tol.by).apply(self.metric) + error_report[tol.name] = errors + return error_report + + def check(self: Self, df: pd.DataFrame) -> bool: + """Check metric for all groups and their tolerances.""" + error_report = self.report(df) + message = "" + error_message = "" + valid = True + for tol in self.tols: + max_error = error_report[tol.name].max() + msg = ( + f"{tol.name} max error: {max_error:0.6f}; " + f"tolerance: {tol.tol:0.6f}\n" + ) + if max_error > tol.tol: + valid = False + error_message += msg + else: + message += msg + logger.info(message) + if not valid: + raise AssertionError(message) -def relative_error_magnitude(df: pd.DataFrame) -> float: - """Calculate the mangnitude of the errors relative to total reported value.""" - try: - # Should we be taking the absolute value of the reported column? 
-        return df[df.is_error].abs_diff.sum() / df["reported_value"].abs().sum()
-    except ZeroDivisionError:
-        return np.nan
+class RelativeErrorMagnitude(CalcCheck):
+    """Check relative magnitude of errors in XBRL calculations."""
 
-def absolute_error_magnitude(df: pd.DataFrame) -> float:
-    """Calculate the absolute magnitude of the errors in aggregate."""
-    return df.abs_diff.sum()
+    tols: list[CalcTol] = [
+        CalcTol(by="ungrouped", tol=0.001),
+        CalcTol(by="table_name", tol=0.01),
+        CalcTol(by="xbrl_factoid", tol=0.01),
+        CalcTol(by="utility_id_ferc1", tol=0.01),
+        CalcTol(by="report_year", tol=0.01),
+    ]
 
+    def metric(self: Self, df: pd.DataFrame) -> float:
+        """Calculate the magnitude of the errors relative to total reported value."""
+        try:
+            # Should we be taking the absolute value of the reported column?
+            return df[df.is_error].abs_diff.sum() / df["reported_value"].abs().sum()
+        except ZeroDivisionError:
+            return np.nan
 
-def aggregate_errors(
-    df: pd.DataFrame, by: str | None, metric: Callable[[pd.DataFrame], float]
-) -> pd.Series:
-    """Calculate aggregate error metrics for a dataframe or record groups within it.
 
+class AbsoluteErrorMagnitude(CalcCheck):
+    """Check absolute magnitude of errors in XBRL calculations.
 
-    Args:
-        df: The dataframe to calculate the error metric from.
-        by: The column to group by, if calculating errors by group, e.g.
-            ``report_year``. If None, calculate the error metric for full dataframe.
-        metric: The function that calculates the error metric. Takes a dataframe and
-            returns a scalar (float).
+    These numbers may vary wildly from table to table so no default values for the
+    expected errors are provided here...
+    """
 
-    Returns:
-        A Series. If gb_col is not None and it is not in the columns of the input
-        dataframe, an empty series is returned.
-    """
-    if by is None:
-        return pd.Series(metric(df))
-    if by in df.columns:
-        return df.groupby(by=by).apply(metric)
-    return pd.Series()
+    def metric(self: Self, df: pd.DataFrame) -> float:
+        """Calculate the absolute magnitude of XBRL calculation errors."""
+        return df.abs_diff.sum()
 
 
-def null_calculation_frequency(df: pd.DataFrame) -> float:
-    """Frequency with which calculated values are null when reported values are not."""
-    non_null_reported = df["reported_value"].notnull()
-    null_calculated = df["calculated_value"].isnull()
-    return (non_null_reported & null_calculated).sum() / non_null_reported.sum()
+class NullCalculationFrequency(CalcCheck):
+    """Check the frequency of null calculated values."""
+
+    tols: list[CalcTol] = [
+        CalcTol(by="ungrouped", tol=1.0),
+        CalcTol(by="table_name", tol=1.0),
+        CalcTol(by="xbrl_factoid", tol=1.0),
+        CalcTol(by="utility_id_ferc1", tol=1.0),
+        CalcTol(by="report_year", tol=1.0),
+    ]
+
+    def metric(self: Self, df: pd.DataFrame) -> float:
+        """Frequency of null calculated values when reported values are not."""
+        non_null_reported = df["reported_value"].notnull()
+        null_calculated = df["calculated_value"].isnull()
+        try:
+            return (non_null_reported & null_calculated).sum() / non_null_reported.sum()
+        except ZeroDivisionError:
+            return np.nan
 
 
-def null_reported_value_frequency(df: pd.DataFrame) -> float:
-    """Frequency with which the reported values are Null."""
-    return df["reported_value"].isnull().sum() / df.shape[0]
-
-
-def aggregate_error_matrix(
-    df: pd.DataFrame,
-    metrics: list[Callable[[pd.DataFrame], float]] | None = None,
-    by_cols: list[str | None] | None = None,
-) -> dict[str, dict[str, pd.Series]]:
-    """Calculate a suite of error metrics on various data groupings."""
-    assert _is_valid_error_df(df)
-    if metrics is None:
-        metrics = [
-            error_frequency,
-            relative_error_magnitude,
-            absolute_error_magnitude,
-            null_calculation_frequency,
-            null_reported_value_frequency,
-            # Still need to write these
-            # duplicate_value_frequency,
-            # off_by_one_dollar_frequency,
-        ]
-    if by_cols is None:
-        by_cols = [
-            None,
-            "report_year",
-            "utility_id_ferc1",
-            "table_name",
-            "xbrl_factoid",
-        ]
-    agg_errors = {}
-    for metric in metrics:
-        agg_errors[metric.__name__] = {}
-        logger.info(f"Calculating {metric.__name__} by:")
-        for by in by_cols:
-            logger.info(f"  {by}")
-            agg_errors[metric.__name__][by] = aggregate_errors(df, by, metric)
-    return agg_errors
+class NullReportedValueFrequency(CalcCheck):
+    """Check the frequency of null reported values."""
+
+    tols: list[CalcTol] = [
+        CalcTol(by="ungrouped", tol=0.50),
+    ]
+
+    def metric(self: Self, df: pd.DataFrame) -> float:
+        """Frequency with which the reported values are Null."""
+        return df["reported_value"].isnull().sum() / df.shape[0]
 
 
 def add_corrections(
     calculated_df: pd.DataFrame,
     value_col: str,
     calculation_tolerance: CalculationTolerance,
     table_name: str,
 ) -> pd.DataFrame:
     """Add corrections to discrepancies between reported & calculated values.
@@ -1327,7 +1422,7 @@ def add_corrections(
             f"{num_records} original records."
) - return pd.concat([calculated_df, corrections], axis="index").reset_index() + return pd.concat([calculated_df, corrections], axis="index") class Ferc1TableTransformParams(TableTransformParams): From 66cad5b2270adbf289cb115cee6fed7b0b157daa Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 16 Oct 2023 16:09:58 -0600 Subject: [PATCH 11/37] Add missing cls option to validator --- src/pudl/transform/ferc1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index e20b83d56c..42098bac2c 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -1163,7 +1163,7 @@ class CalcTol(TransformParams): tol: float @validator("name", always=True) - def name_by(v, values): + def name_by(cls, v, values): """Set the name to be the same as by if name is none and by is string.""" if isinstance(v, str): return v From a5d19690d24ecc9c14b4b8f0842e6d94258a7f07 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 16 Oct 2023 16:11:02 -0600 Subject: [PATCH 12/37] Enable beta checks in ruff and ignore E226 & E266 --- .pre-commit-config.yaml | 2 +- pyproject.toml | 24 +++++++++++++----------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 248caf4f23..3e11c6ea96 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: rev: v0.0.292 hooks: - id: ruff - args: [--fix, --exit-non-zero-on-fix] + args: [--fix, --exit-non-zero-on-fix, --preview] # Format the code - repo: https://github.com/psf/black-pre-commit-mirror diff --git a/pyproject.toml b/pyproject.toml index 69d725d02a..7f922f1de3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -191,19 +191,21 @@ select = [ "W", # pycodestyle warnings ] ignore = [ - "D401", # Require imperative mood in docstrings. + "D401", # Require imperative mood in docstrings. "D417", - "E501", # Overlong lines. - "E203", # Space before ':' (black recommends to ignore) - "PD003", # Use of isna rather than isnull - "PD004", # Use of notna rather than notnull - "PD008", # Use of df.at[] rather than df.loc[] - "PD010", # Use of df.stack() - "PD013", # Use of df.unstack() - "PD015", # Use of pd.merge() rather than df.merge() - "PD901", # df as variable name + "E501", # Overlong lines. + "E203", # Space before ':' (black recommends to ignore) + "E226", # Missing whitespace around arithmetic operator + "E266", # Too many leading `#` before block comment + "PD003", # Use of isna rather than isnull + "PD004", # Use of notna rather than notnull + "PD008", # Use of df.at[] rather than df.loc[] + "PD010", # Use of df.stack() + "PD013", # Use of df.unstack() + "PD015", # Use of pd.merge() rather than df.merge() + "PD901", # df as variable name "RET504", # Ignore unnecessary assignment before return - "S101", # Use of assert + "S101", # Use of assert ] # Assume Python 3.11 From f8eeecb80c68a46d28c8c887b7792724593fb020 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 16 Oct 2023 17:56:58 -0600 Subject: [PATCH 13/37] Fix docstring parsing error. --- src/pudl/transform/ferc1.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 42098bac2c..c32e31012f 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -848,10 +848,10 @@ def reconcile_table_calculations( params: :class:`ReconcileTableCalculations` parameters. 
Returns: - A dataframe that includes new *_correction records with values that ensure the - calculations all match to within the required tolerance. It will also contain - columns created by the calculation checking process like ``abs_diff`` and - ``rel_diff``. + A dataframe that includes new ``*_correction`` records with values that ensure + the calculations all match to within the required tolerance. It will also + contain columns created by the calculation checking process like ``abs_diff`` + and ``rel_diff``. """ # If we don't have this value, we aren't doing any calculation checking: if params.column_to_check is None or calculation_components.empty: From 9baf37252e588a1a5b8539a1bf8ad4c1a311d69c Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 16 Oct 2023 18:39:37 -0600 Subject: [PATCH 14/37] Update to ruff v0.1.0 disable --preview --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3e11c6ea96..4ee68f7693 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,10 +26,10 @@ repos: # Formatters: hooks that re-write Python & documentation files #################################################################################### - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.292 + rev: v0.1.0 hooks: - id: ruff - args: [--fix, --exit-non-zero-on-fix, --preview] + args: [--fix, --exit-non-zero-on-fix] # Format the code - repo: https://github.com/psf/black-pre-commit-mirror From 866376ce226053b8c747fa483b8a16762ca84ff6 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 16 Oct 2023 18:41:13 -0600 Subject: [PATCH 15/37] Update allowed ruff versions. --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7f922f1de3..8496ccc82c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,7 +123,7 @@ dev = [ "ipdb>=0.13,<0.14", "jedi>=0.18,<0.20", "lxml>=4.6,<4.10", - "ruff>=0.0.287", + "ruff>=0.1,<0.2", "tox>=4,<4.12", "twine>=4,<4.1", ] @@ -148,7 +148,7 @@ test = [ "pytest-mock>=3,<3.12", "pytest>=7,<7.5", "responses>=0.14,<0.24", - "ruff>=0.0.287", + "ruff>=0.1,<0.2", "tox>=4,<4.12", ] datasette = ["datasette>=0.60,<0.65"] From 61e33e6afe0d65ce64f87fc116186f8548d16bd7 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Wed, 18 Oct 2023 00:14:33 -0600 Subject: [PATCH 16/37] Bump black version in pre-commit --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4ee68f7693..5aac248e1a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: # Format the code - repo: https://github.com/psf/black-pre-commit-mirror - rev: 23.9.1 + rev: 23.10.0 hooks: - id: black language_version: python3.11 From 513cb0b75503409b0ab5d890693767a0203c7163 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Wed, 18 Oct 2023 08:12:12 -0600 Subject: [PATCH 17/37] bump black version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8496ccc82c..f12b9cf744 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -118,7 +118,7 @@ pudl_check_fks = "pudl.etl.check_foreign_keys:main" [project.optional-dependencies] dev = [ - "black>=22,<23.10", + "black>=22,<23.11", "build>=1,<1.1", "ipdb>=0.13,<0.14", "jedi>=0.18,<0.20", From c50fabcc8b6cd55dd07e25c48548879e748e7504 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Wed, 
18 Oct 2023 10:23:21 -0400 Subject: [PATCH 18/37] very wip version of a group-based calc checks --- src/pudl/output/ferc1.py | 32 ++- src/pudl/transform/ferc1.py | 400 ++++++++++++++++------------- src/pudl/transform/params/ferc1.py | 12 +- 3 files changed, 243 insertions(+), 201 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 677d49c266..7eea29284d 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -15,20 +15,30 @@ from pydantic import BaseModel, validator import pudl -from pudl.transform.ferc1 import CalculationTolerance +from pudl.transform.ferc1 import ( + CalculationChecks, + CalculationGroupChecks, + GroupedCalculationTolerance, +) logger = pudl.logging_helpers.get_logger(__name__) -EXPLOSION_CALCULATION_TOLERANCES: dict[str, CalculationTolerance] = { - "income_statement_ferc1": CalculationTolerance( - bulk_error_frequency=0.20, +EXPLOSION_CALCULATION_TOLERANCES: dict[str, CalculationChecks] = { + "income_statement_ferc1": CalculationChecks( + group_checks=CalculationGroupChecks( + ungrouped=GroupedCalculationTolerance(error_frequency=0.20) + ) ), - "balance_sheet_assets_ferc1": CalculationTolerance( - bulk_error_frequency=0.65, + "balance_sheet_assets_ferc1": CalculationChecks( + group_checks=CalculationGroupChecks( + ungrouped=GroupedCalculationTolerance(error_frequency=0.65) + ) ), - "balance_sheet_liabilities_ferc1": CalculationTolerance( - bulk_error_frequency=0.07, + "balance_sheet_liabilities_ferc1": CalculationChecks( + group_checks=CalculationGroupChecks( + ungrouped=GroupedCalculationTolerance(error_frequency=0.07) + ) ), } @@ -961,7 +971,7 @@ def exploded_table_asset_factory( root_table: str, table_names_to_explode: list[str], seed_nodes: list[NodeId], - calculation_tolerance: CalculationTolerance, + calculation_tolerance: CalculationChecks, io_manager_key: str | None = None, ) -> AssetsDefinition: """Create an exploded table based on a set of related input tables.""" @@ -1091,7 +1101,7 @@ def __init__( calculation_components_xbrl_ferc1: pd.DataFrame, seed_nodes: list[NodeId], tags: pd.DataFrame = pd.DataFrame(), - calculation_tolerance: CalculationTolerance = CalculationTolerance(), + calculation_tolerance: CalculationChecks = CalculationChecks(), ): """Instantiate an Exploder class. @@ -1536,7 +1546,7 @@ class XbrlCalculationForestFerc1(BaseModel): exploded_calcs: pd.DataFrame = pd.DataFrame() seeds: list[NodeId] = [] tags: pd.DataFrame = pd.DataFrame() - calculation_tolerance: CalculationTolerance = CalculationTolerance() + calculation_tolerance: CalculationChecks = CalculationChecks() class Config: """Allow the class to store a dataframe.""" diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index c32e31012f..53170275cd 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -22,7 +22,7 @@ import sqlalchemy as sa from dagster import AssetIn, AssetsDefinition, asset from pandas.core.groupby import DataFrameGroupBy -from pydantic import confloat, validator +from pydantic import BaseModel, confloat, validator import pudl from pudl.analysis.classify_plants_ferc1 import ( @@ -727,7 +727,50 @@ def combine_axis_columns_xbrl( return df -class CalculationTolerance(TransformParams): +class CalcTol(TransformParams): + """A simple class to contain an error tolerance specification.""" + + # TODO: Get rid of this probably? 
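# A quick sketch, assuming the models defined in this commit, of how one of the
# per-table tolerances in EXPLOSION_CALCULATION_TOLERANCES above resolves (the
# .get() fallback to a default CalculationChecks() is illustrative, not the
# module's API):
#
#     checks = EXPLOSION_CALCULATION_TOLERANCES.get(
#         "income_statement_ferc1", CalculationChecks()
#     )
#     assert checks.group_checks.ungrouped.error_frequency == 0.20
#     assert checks.group_checks.report_year.error_frequency == 0.01  # default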
+ + tol: float + + # @validator("name", always=True) + # def name_by(cls, v, values): + # """Set the name to be the same as by if name is none and by is string.""" + # if isinstance(v, str): + # return v + + # if v is None and isinstance(values["by"], str): + # return values["by"] + + # raise ValueError("Must give tolerance a name if by is a list.") + + +class TestChecks(TransformParams): + """Info for testing a particular check.""" + + isclose_rtol: confloat(ge=0.0) = 1e-5 + """Relative tolerance to use in :func:`np.isclose` for determining equality.""" + + isclose_atol: confloat(ge=0.0, le=0.01) = 1e-8 + """Absolute tolerance to use in :func:`np.isclose` for determining equality.""" + + +class CalculationTestChecks(TransformParams): + """Calc params organized by check type.""" + + error_frequency: TestChecks = TestChecks() + relative_error_magnitude: TestChecks = TestChecks(isclose_atol=1e-9) + + +class GroupedCalculationTolerance(TransformParams): + """Tolerances for all data checks to be preformed within a grouped df.""" + + error_frequency: confloat(ge=0.0, le=1.0) = 0.01 + relative_error_magnitude: confloat(ge=0.0) = 0.001 + + +class CalculationGroupChecks(TransformParams): """Data quality expectations related to FERC 1 calculations. We are doing a lot of comparisons between calculated and reported values to identify @@ -753,43 +796,40 @@ class CalculationTolerance(TransformParams): want to know about it! """ - isclose_rtol: confloat(ge=0.0) = 1e-5 - """Relative tolerance to use in :func:`np.isclose` for determining equality.""" - - isclose_atol: confloat(ge=0.0, le=0.01) = 1e-8 - """Absolute tolerance to use in :func:`np.isclose` for determining equality. - - Since we are comparing financial values, and we want them to be equal only insofar - as they would be financially indistinguishable, they should never differ by more - than a penny, hence the upper bound of 0.01. 
- """ + ungrouped: GroupedCalculationTolerance = GroupedCalculationTolerance( + error_frequency=0.05, relative_error_magnitude=0.01 + ) + xbrl_factoid: GroupedCalculationTolerance = GroupedCalculationTolerance() + utility_id_ferc1: GroupedCalculationTolerance = GroupedCalculationTolerance( + relative_error_magnitude=0.004 + ) + report_year: GroupedCalculationTolerance = GroupedCalculationTolerance() + table_name: GroupedCalculationTolerance = GroupedCalculationTolerance() - bulk_error_frequency: confloat(ge=0.0, le=1.0) = 0.05 - """Fraction of all calculations that are allowed to not match exactly.""" + # bulk_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.05 + # """Fraction of records with non-null reported values and null calculated values.""" - utility_id_ferc1_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 - report_year_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 - xbrl_factoid_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 - table_name_error_frequency: confloat(ge=0.0, le=1.0) = 0.1 + # utility_id_ferc1_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 + # report_year_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 + # xbrl_factoid_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 + # table_name_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - bulk_error_relative_magnitude: confloat(ge=0.0) = 0.01 - """Maximum allowed sum of all errors, relative to the sum of all reported values.""" + # bulk_null_reported_value_frequency: confloat(ge=0.0, le=1.0) = 0.50 + # """Fraction of records with non-null reported values and null calculated values.""" - utility_id_ferc1_error_relative_magnitude: confloat(ge=0.0) = 0.001 - report_year_error_relative_magnitude: confloat(ge=0.0) = 0.001 - xbrl_factoid_error_relative_magnitude: confloat(ge=0.0) = 0.001 - table_name_error_relative_magnitude: confloat(ge=0.0) = 0.001 + # @validator("utility_id_ferc1", "report_year", "table_name", "xbrl_factoid") + # def grouped_tol_ge_ungrouped_tol(v, values): + # """Grouped tolerance should always be greater than or equal to ungrouped.""" + # if v is not None: + # assert v >= values["ungrouped"] + # return v - bulk_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.05 - """Fraction of records with non-null reported values and null calculated values.""" - utility_id_ferc1_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - report_year_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - xbrl_factoid_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - table_name_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 +class CalculationChecks(TransformParams): + """Input for checking calculations organized by group and test.""" - bulk_null_reported_value_frequency: confloat(ge=0.0, le=1.0) = 0.50 - """Fraction of records with non-null reported values and null calculated values.""" + group_checks: CalculationGroupChecks = CalculationGroupChecks() + test_checks: CalculationTestChecks = CalculationTestChecks() class ReconcileTableCalculations(TransformParams): @@ -801,7 +841,7 @@ class ReconcileTableCalculations(TransformParams): This will typically be ``dollar_value`` or ``ending_balance`` column for the income statement and the balance sheet tables. 
""" - calculation_tolerance: CalculationTolerance = CalculationTolerance() + calculation_checks: CalculationChecks = CalculationChecks() """Fraction of calculated values which we allow not to match reported values.""" subtotal_column: str | None = None @@ -902,13 +942,13 @@ def reconcile_table_calculations( ).pipe( check_calculation_metrics, value_col=params.column_to_check, - calculation_tolerance=params.calculation_tolerance, + calculation_checks=params.calculation_checks, table_name=table_name, ) # .pipe( # add_corrections, # value_col=params.column_to_check, - # calculation_tolerance=params.calculation_tolerance, + # calculation_checks=params.calculation_checks, # table_name=table_name, # ) # Rename back to the original xbrl_factoid column name before returning: @@ -960,7 +1000,7 @@ def _check_subtotal_calculations( subtotal_calcs = check_calculation_metrics( calculated_df=subtotal_calcs, value_col=params.column_to_check, - calculation_tolerance=params.calculation_tolerance, + calculation_checks=params.calculation_checks, table_name=table_name, ) @@ -1090,7 +1130,7 @@ def calculate_values_from_components( def check_calculation_metrics( calculated_df: pd.DataFrame, value_col: str, - calculation_tolerance: CalculationTolerance, + calculation_checks: CalculationChecks, table_name: str, ) -> pd.DataFrame: """Run the calculation metrics and determine if calculations are within tolerance.""" @@ -1112,6 +1152,45 @@ def check_calculation_metrics( calculated_df["reported_value"] = calculated_df[value_col] # DO ERROR CHECKS + results_dfs = {} + # for each groupby grouping: calculate metrics for each test + # then check if each test is within acceptable tolerance levels + for by, group_tols in calculation_checks.group_checks: + if by == "ungrouped": + calculated_df = calculated_df.assign(ungrouped=1) + group_metrics = {} + for test, tol in group_tols: + logger.info(f"{by}: {test}") + title_case_test = test.title().replace("_", "") + tester = locals()[title_case_test]( + by=by, test_checks=calculation_checks.test_checks.dict()[test] + ) + group_metrics[test] = tester.metric(df=calculated_df) + group_check_df = pd.DataFrame(group_metrics) + + for test, tol in calculation_checks.group_checks.dict()[by].items(): + group_check_df = group_check_df.assign( + **{ + f"tolerance_{test}": tol, # tol is just for reporting so you can know of off you are + f"is_error_{test}": group_check_df[test] > tol, + } + ) + results_dfs[by] = group_check_df + # convert all of the grouped checks into a big df. this will have + # two unnamed indexes: first for the group name (by) and one for the + # groups values. the columns will include three for each test: the test mertic + # that is the same name as the test (ex: error_frequency), the tolerance for that + # group/test and a boolean indicating whether or not that metric failed to meet + # the tolerance + results = pd.concat(results_dfs) + # get the records w/ errors in any! of their checks + errors = results[results.filter(like="is_error").any(axis=1)] + if not errors.empty: + # it miiight be good to isolate just the error columns.. + raise AssertionError( + f"Found errors while checking the make a cuter message:\n{errors}" + ) + # off_df is pretty specific to the one check that we're doing now, but is also # useful in creating the corrections later one. 
# Find a simpler way to check this particular metric, and defer the creation of @@ -1155,38 +1234,13 @@ def check_calculation_metrics( # - They require a uniform `reported_value` column so that they can all have the same # call signature, which allows us to iterate over all of them in a matrix. ######################################################################################## -class CalcTol(TransformParams): - """A simple class to contain an error tolerance specification.""" - - by: str | list[str] | None = None - name: str | None = None - tol: float - - @validator("name", always=True) - def name_by(cls, v, values): - """Set the name to be the same as by if name is none and by is string.""" - if isinstance(v, str): - return v - - if v is None and isinstance(values["by"], str): - return values["by"] - raise ValueError("Must give tolerance a name if by is a list.") - -class CalcCheck(TransformParams): +class GroupCheck(BaseModel): """Per metric calculation checks.""" - isclose_rtol: confloat(ge=0.0) = 1e-5 - """Relative tolerance to use in :func:`np.isclose` for determining equality.""" - - isclose_atol: confloat(ge=0.0, le=0.01) = 1e-8 - """Absolute tolerance to use in :func:`np.isclose` for determining equality. - - Since we are comparing financial values, and we want them to be equal only insofar - as they would be financially indistinguishable, they should never differ by more - than a penny, hence the upper bound of 0.01. - """ + by: str | list[str] + test_checks: TestChecks required_cols: list[str] = [ "table_name", @@ -1199,36 +1253,16 @@ class CalcCheck(TransformParams): "rel_diff", ] - tols: list[CalcTol] = [] - """List of tolrances to check. - - Not totally satisfied here. It would be nice to be able to refer to the specific - tolerances by name, since they're pretty well defined now, and we know that there - are some testable properties relating them to each other. - - At the same time it's nice to have this be a generic list of whatever groupby - elements you want to check, since there could be special cases for different - tables or groups of tables and it would be nice to avoid hard coding that into - the classes. - """ - - # @validator("utility_id_ferc1", "report_year", "table_name", "xbrl_factoid") - # def grouped_tol_ge_ungrouped_tol(v, values): - # """Grouped tolerance should always be greater than or equal to ungrouped.""" - # if v is not None: - # assert v >= values["ungrouped"] - # return v - def _is_valid_error_df(self: Self, df: pd.DataFrame): """Check that the input dataframe has all required columns.""" return all(col in df.columns for col in self.required_cols) @abstractmethod - def metric(self: Self, df: pd.DataFrame) -> float: - """The error metric to calculate for a dataframe or group.""" + def test_apply(self: Self, gb: DataFrameGroupBy) -> pd.Series: + """Apply method for each individual test.""" ... - def is_error(self: Self, df: pd.DataFrame) -> pd.Series: + def is_error(self, df: pd.DataFrame) -> pd.Series: """Flag those records that qualify as an error. NOTE: How should this categorization vary from metric to metric? 
Why is it @@ -1238,140 +1272,130 @@ def is_error(self: Self, df: pd.DataFrame) -> pd.Series: ~np.isclose( df["calculated_value"], df["reported_value"], - rtol=self.isclose_rtol, - atol=self.isclose_atol, + rtol=self.test_checks.isclose_rtol, + atol=self.test_checks.isclose_atol, ) & df["abs_diff"].notnull() ) - def report(self: Self, df: pd.DataFrame) -> dict[str, pd.Series]: - """Report the full distribution of errors by group.""" - assert self._is_valid_error_df(df) - df.loc[:, "is_error"] = self.is_error(df) - error_report = {} - for tol in self.tols: - if tol.by == "ungrouped": - errors = pd.Series(self.metric(df)) - else: - errors = df.groupby(by=tol.by).apply(self.metric) - error_report[tol.name] = errors - return error_report - - def check(self: Self, df: pd.DataFrame) -> bool: - """Check metric for all groups and their tolerances.""" - error_report = self.report(df) - message = "" - error_message = "" - valid = True - for tol in self.tols: - max_error = error_report[tol.name].max() - msg = ( - f"{tol.name} max error: {max_error:0.6f}; " - f"tolerance: {tol.tol:0.6f}\n" - ) - if max_error > tol.tol: - valid = False - error_message += msg - else: - message += msg - logger.info(message) - if not valid: - raise AssertionError(message) - - -class ErrorFrequency(CalcCheck): + def metric(self: Self, df: pd.DataFrame) -> pd.Series: + """Calculate the frequency with which records are tagged as errors.""" + df["is_error"] = self.is_error(df) + return df.groupby(self.by).apply(self.test_apply) + + # def report(self: Self, df: pd.DataFrame) -> dict[str, pd.Series]: + # """Report the full distribution of errors by group.""" + # assert self._is_valid_error_df(df) + # df.loc[:, "is_error"] = self.is_error(df) + # error_report = {} + # for tol in self.tols: + # if tol.by == "ungrouped": + # errors = pd.Series(self.metric(df)) + # else: + # errors = df.groupby(by=tol.by).apply(self.metric) + # error_report[tol.name] = errors + # return error_report + + # def check(self: Self, df: pd.DataFrame) -> bool: + # """Check metric for all groups and their tolerances.""" + # error_report = self.report(df) + # message = "" + # error_message = "" + # valid = True + # for tol in self.tols: + # max_error = error_report[tol.name].max() + # msg = ( + # f"{tol.name} max error: {max_error:0.6f}; " + # f"tolerance: {tol.tol:0.6f}\n" + # ) + # if max_error > tol.tol: + # valid = False + # error_message += msg + # else: + # message += msg + # logger.info(message) + # if not valid: + # raise AssertionError(message) + + +class ErrorFrequency(GroupCheck): """Check error frequency in XBRL calculations.""" - tols: list[CalcTol] = [ - CalcTol(by="ungrouped", tol=0.01), - CalcTol(by="table_name", tol=0.1), - CalcTol(by="xbrl_factoid", tol=0.1), - CalcTol(by="utility_id_ferc1", tol=0.1), - CalcTol(by="report_year", tol=0.1), - ] - - def metric(self: Self, df: pd.DataFrame) -> float: + def test_apply(self: Self, gb: DataFrameGroupBy) -> pd.Series: """Calculate the frequency with which records are tagged as errors.""" try: - return df[df.is_error].shape[0] / df.shape[0] + out = gb[gb.is_error].shape[0] / gb.shape[0] except ZeroDivisionError: # Will only occur if all reported values are NaN when calculated values # exist, or vice versa. logger.warning( "Calculated values have no corresponding reported values in this table." 
             )
-            return np.nan
+            out = np.nan
+        return out
 
 
-class RelativeErrorMagnitude(CalcCheck):
+class RelativeErrorMagnitude(GroupCheck):
     """Check relative magnitude of errors in XBRL calculations."""
 
-    tols: list[CalcTol] = [
-        CalcTol(by="ungrouped", tol=0.001),
-        CalcTol(by="table_name", tol=0.01),
-        CalcTol(by="xbrl_factoid", tol=0.01),
-        CalcTol(by="utility_id_ferc1", tol=0.01),
-        CalcTol(by="report_year", tol=0.01),
-    ]
-
-    def metric(self: Self, df: pd.DataFrame) -> float:
+    def test_apply(self: Self, gb: DataFrameGroupBy) -> float:
         """Calculate the magnitude of the errors relative to total reported value."""
         try:
             # Should we be taking the absolute value of the reported column?
-            return df[df.is_error].abs_diff.sum() / df["reported_value"].abs().sum()
+            return gb[gb.is_error].abs_diff.sum() / gb["reported_value"].abs().sum()
         except ZeroDivisionError:
             return np.nan
 
 
-class AbsoluteErrorMagnitude(CalcCheck):
-    """Check absolute magnitude of errors in XBRL calculations.
+# class AbsoluteErrorMagnitude(GroupCheck):
+#     """Check absolute magnitude of errors in XBRL calculations.
 
-    These numbers may vary wildly from table to table so no default values for the
-    expected errors are provided here...
-    """
+#     These numbers may vary wildly from table to table so no default values for the
+#     expected errors are provided here...
+#     """
 
-    def metric(self: Self, df: pd.DataFrame) -> float:
-        """Calculate the absolute magnitude of XBRL calculation errors."""
-        return df.abs_diff.sum()
+#     def metric(self: Self, df: pd.DataFrame) -> float:
+#         """Calculate the absolute magnitude of XBRL calculation errors."""
+#         return df.abs_diff.sum()
 
 
-class NullCalculationFrequency(CalcCheck):
-    """Check the frequency of null calculated values."""
+# class NullCalculationFrequency(GroupCheck):
+#     """Check the frequency of null calculated values."""
 
-    tols: list[CalcTol] = [
-        CalcTol(by="ungrouped", tol=1.0),
-        CalcTol(by="table_name", tol=1.0),
-        CalcTol(by="xbrl_factoid", tol=1.0),
-        CalcTol(by="utility_id_ferc1", tol=1.0),
-        CalcTol(by="report_year", tol=1.0),
-    ]
+# tols: list[CalcTol] = [
+#     CalcTol(by="ungrouped", tol=1.0),
+#     CalcTol(by="table_name", tol=1.0),
+#     CalcTol(by="xbrl_factoid", tol=1.0),
+#     CalcTol(by="utility_id_ferc1", tol=1.0),
+#     CalcTol(by="report_year", tol=1.0),
+# ]
 
-    def metric(self: Self, df: pd.DataFrame) -> float:
-        """Frequency of null calculated values when reported values are not."""
-        non_null_reported = df["reported_value"].notnull()
-        null_calculated = df["calculated_value"].isnull()
-        try:
-            return (non_null_reported & null_calculated).sum() / non_null_reported.sum()
-        except ZeroDivisionError:
-            return np.nan
+# def metric(self: Self, df: pd.DataFrame) -> float:
+#     """Frequency of null calculated values when reported values are not."""
+#     non_null_reported = df["reported_value"].notnull()
+#     null_calculated = df["calculated_value"].isnull()
+#     try:
+#         return (non_null_reported & null_calculated).sum() / non_null_reported.sum()
+#     except ZeroDivisionError:
+#         return np.nan
 
 
-class NullReportedValueFrequency(CalcCheck):
-    """Check the frequency of null reported values."""
+# class NullReportedValueFrequency(GroupCheck):
+#     """Check the frequency of null reported values."""
 
-    tols: list[CalcTol] = [
-        CalcTol(by="ungrouped", tol=0.50),
-    ]
+# tols: list[CalcTol] = [
+#     CalcTol(by="ungrouped", tol=0.50),
+# ]
 
-    def metric(self: Self, df: pd.DataFrame) -> float:
-        """Frequency with which the reported values are Null."""
-        return df["reported_value"].isnull().sum() / df.shape[0]
+# def 
metric(self: Self, df: pd.DataFrame) -> float: +# """Frequency with which the reported values are Null.""" +# return df["reported_value"].isnull().sum() / df.shape[0] def add_corrections( calculated_df: pd.DataFrame, value_col: str, - calculation_tolerance: CalculationTolerance, + calculation_tolerance: CalculationChecks, table_name: str, ) -> pd.DataFrame: """Add corrections to discrepancies between reported & calculated values. @@ -6272,8 +6296,12 @@ def make_calculation_dimensions_explicit( how="left", ) calc_comps_w_explicit_dims = calc_comps_w_dims[~null_dim_mask] + # astypes dealing w/ future warning regarding empty or all null dfs calc_comps_w_dims = pd.concat( - [calc_comps_w_implied_dims, calc_comps_w_explicit_dims] + [ + calc_comps_w_implied_dims.astype(calc_comps_w_explicit_dims.dtypes), + calc_comps_w_explicit_dims.astype(calc_comps_w_implied_dims.dtypes), + ] ) return calc_comps_w_dims @@ -6322,8 +6350,12 @@ def assign_parent_dimensions( right_on=parent_dim_idx, how="left", ) + # astypes dealing w/ future warning regarding empty or all null dfs calc_components = pd.concat( - [calc_components_null, calc_components_non_null] + [ + calc_components_null.astype(calc_components_non_null.dtypes), + calc_components_non_null.astype(calc_components_null.dtypes), + ] ).reset_index(drop=True) return calc_components diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py index 0b50f352eb..d331918e0f 100644 --- a/src/pudl/transform/params/ferc1.py +++ b/src/pudl/transform/params/ferc1.py @@ -2729,8 +2729,8 @@ # Known issue with reporting of construction in progress not classified in classified fields of table. "reconcile_table_calculations": { "column_to_check": "ending_balance", - "calculation_tolerance": { - "bulk_error_frequency": 0.08, + "calculation_checks": { + "group_checks": {"ungrouped": {"error_frequency": 0.08}}, }, }, }, @@ -3717,8 +3717,8 @@ "strip_non_numeric_values": {"amount": {"strip_non_numeric_values": True}}, "reconcile_table_calculations": { "column_to_check": "ending_balance", - "calculation_tolerance": { - "bulk_error_frequency": 0.08, + "calculation_checks": { + "group_checks": {"ungrouped": {"error_frequency": 0.08}}, }, }, }, @@ -3986,8 +3986,8 @@ "column_to_check": "dollar_value", # Note: this table does not currently get exploded. It will require # additional debugging at a later date. 
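# A sketch of how a nested params dict like the one below is coerced into the
# pydantic models from this commit (the dict literal mirrors the entry below;
# the direct construction is illustrative):
#
#     checks = CalculationChecks(
#         **{"group_checks": {"ungrouped": {"error_frequency": 0.4}}}
#     )
#     assert checks.group_checks.ungrouped.error_frequency == 0.4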
-            "calculation_tolerance": {
-                "bulk_error_frequency": 0.4,
+            "calculation_checks": {
+                "group_checks": {"ungrouped": {"error_frequency": 0.4}},
             },
         },
     },

From 5528e47c375253a144905e277d0a07202e8a143c Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Wed, 18 Oct 2023 10:44:25 -0400
Subject: [PATCH 19/37] still vv wip but moving things around a little for accessibility

---
 src/pudl/transform/ferc1.py       | 39 ++++++++++++++++---------------
 test/unit/transform/ferc1_test.py |  4 ++--
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py
index 53170275cd..b2b4c3baf6 100644
--- a/src/pudl/transform/ferc1.py
+++ b/src/pudl/transform/ferc1.py
@@ -941,9 +941,7 @@ def reconcile_table_calculations(
             value_col=params.column_to_check,
         ).pipe(
             check_calculation_metrics,
-            value_col=params.column_to_check,
             calculation_checks=params.calculation_checks,
-            table_name=table_name,
         )
@@ -999,9 +997,7 @@ def _check_subtotal_calculations(
     )
     subtotal_calcs = check_calculation_metrics(
         calculated_df=subtotal_calcs,
-        value_col=params.column_to_check,
         calculation_checks=params.calculation_checks,
-        table_name=table_name,
     )
@@ -1123,17 +1119,6 @@ def calculate_values_from_components(
     calculated_df = calculated_df.drop(columns=["_merge"])
 
     # Force value_col to be a float to prevent any hijinks with calculating differences.
-    calculated_df[value_col] = calculated_df[value_col].astype(float)
-    return calculated_df
-
-
-def check_calculation_metrics(
-    calculated_df: pd.DataFrame,
-    value_col: str,
-    calculation_checks: CalculationChecks,
-    table_name: str,
-) -> pd.DataFrame:
-    """Run the calculation metrics and determine if calculations are within tolerance."""
     # Data types were very messy here, including pandas Float64 for the
     # calculated_value columns which did not work with the np.isclose(). Not sure
    # why these are cropping up.
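# An illustration of the dtype problem described above, with toy data: np.isclose()
# can raise a TypeError on pandas' nullable Float64 extension dtype when pd.NA is
# present, while a plain float64 cast (pd.NA becomes np.nan) compares cleanly.
import numpy as np
import pandas as pd

s = pd.Series([1.0, pd.NA], dtype="Float64")
np.isclose(s.astype(float), 1.0)  # array([ True, False])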
@@ -1150,19 +1135,25 @@ def check_calculation_metrics( ) # Uniformity here helps keep the error checking functions simpler: calculated_df["reported_value"] = calculated_df[value_col] + return calculated_df - # DO ERROR CHECKS + +def calculation_check_results_by_group( + calculated_df: pd.DataFrame, + calculation_checks: CalculationChecks, +) -> pd.DataFrame: + """Tabulate the results of the calculation checks by group.""" results_dfs = {} # for each groupby grouping: calculate metrics for each test # then check if each test is within acceptable tolerance levels for by, group_tols in calculation_checks.group_checks: if by == "ungrouped": - calculated_df = calculated_df.assign(ungrouped=1) + calculated_df = calculated_df.assign(ungrouped="ungrouped") group_metrics = {} for test, tol in group_tols: logger.info(f"{by}: {test}") title_case_test = test.title().replace("_", "") - tester = locals()[title_case_test]( + tester = globals()[title_case_test]( by=by, test_checks=calculation_checks.test_checks.dict()[test] ) group_metrics[test] = tester.metric(df=calculated_df) @@ -1183,12 +1174,22 @@ def check_calculation_metrics( # group/test and a boolean indicating whether or not that metric failed to meet # the tolerance results = pd.concat(results_dfs) + return results + + +def check_calculation_metrics( + calculated_df: pd.DataFrame, + calculation_checks: CalculationChecks, +) -> pd.DataFrame: + """Run the calculation metrics and determine if calculations are within tolerance.""" + # DO ERROR CHECKS + results = calculation_check_results_by_group(calculated_df, calculation_checks) # get the records w/ errors in any! of their checks errors = results[results.filter(like="is_error").any(axis=1)] if not errors.empty: # it miiight be good to isolate just the error columns.. 
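# One way to do what the comment above suggests, sketched but not wired in:
# report only the failing rows and their error-flag columns.
#
#     flag_cols = errors.filter(like="is_error").columns.tolist()
#     raise AssertionError(
#         f"Found errors while running tests on the calculations:\n{errors[flag_cols]}"
#     )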
        raise AssertionError(
-            f"Found errors while checking the make a cuter message:\n{errors}"
+            f"Found errors while running tests on the calculations:\n{errors}"
         )
diff --git a/test/unit/transform/ferc1_test.py b/test/unit/transform/ferc1_test.py
index 01713177d8..9390ccc70f 100644
--- a/test/unit/transform/ferc1_test.py
+++ b/test/unit/transform/ferc1_test.py
@@ -447,13 +447,13 @@ def test_calculate_values_from_components():
 books,big_fact,earth,12.0,44,2312,{3+4+5}
 """
         )
-    )
+    ).convert_dtypes()
     out_ksr = calculate_values_from_components(
         calculation_components=calculation_components_ksr,
         data=data_ksr,
         calc_idx=["table_name", "xbrl_factoid", "planet"],
         value_col="value",
-    )
+    )[list(expected_ksr.columns)].convert_dtypes()
 
     pd.testing.assert_frame_equal(expected_ksr, out_ksr)

From 8c00594b00bae3fef51459e420a0427a1590ed5c Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Wed, 18 Oct 2023 15:11:17 -0400
Subject: [PATCH 20/37] clean up the results df & add absolute_error_magnitude

---
 src/pudl/output/ferc1.py    |   8 +-
 src/pudl/transform/ferc1.py | 231 +++++++++++++-----------------------
 2 files changed, 85 insertions(+), 154 deletions(-)

diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py
index 7eea29284d..f6a4ef6de9 100644
--- a/src/pudl/output/ferc1.py
+++ b/src/pudl/output/ferc1.py
@@ -18,7 +18,7 @@
 from pudl.transform.ferc1 import (
     CalculationChecks,
     CalculationGroupChecks,
-    GroupedCalculationTolerance,
+    CalculationMetricTolerance,
 )
 
 logger = pudl.logging_helpers.get_logger(__name__)
 
 
 EXPLOSION_CALCULATION_TOLERANCES: dict[str, CalculationChecks] = {
     "income_statement_ferc1": CalculationChecks(
         group_checks=CalculationGroupChecks(
-            ungrouped=GroupedCalculationTolerance(error_frequency=0.20)
+            ungrouped=CalculationMetricTolerance(error_frequency=0.20)
         )
     ),
     "balance_sheet_assets_ferc1": CalculationChecks(
         group_checks=CalculationGroupChecks(
-            ungrouped=GroupedCalculationTolerance(error_frequency=0.65)
+            ungrouped=CalculationMetricTolerance(error_frequency=0.65)
         )
     ),
     "balance_sheet_liabilities_ferc1": CalculationChecks(
         group_checks=CalculationGroupChecks(
-            ungrouped=GroupedCalculationTolerance(error_frequency=0.07)
+            ungrouped=CalculationMetricTolerance(error_frequency=0.07)
         )
     ),
 }
diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py
index b2b4c3baf6..0679fcd3a3 100644
--- a/src/pudl/transform/ferc1.py
+++ b/src/pudl/transform/ferc1.py
@@ -727,26 +727,7 @@ def combine_axis_columns_xbrl(
     return df
 
 
-class CalcTol(TransformParams):
-    """A simple class to contain an error tolerance specification."""
-
-    # TODO: Get rid of this probably?
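# The snake_case <-> TitleCase convention this commit leans on (metric parameter
# names are snake_case, metric classes are TitleCase), shown round-tripping with
# the exact expressions used by calculation_check_results_by_group() and
# snake_case_metric_name() below:
#
#     import re
#
#     title = "relative_error_magnitude".title().replace("_", "")  # "RelativeErrorMagnitude"
#     snake = re.sub("(?!^)([A-Z]+)", r"_\1", title).lower()       # "relative_error_magnitude"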
- - tol: float - - # @validator("name", always=True) - # def name_by(cls, v, values): - # """Set the name to be the same as by if name is none and by is string.""" - # if isinstance(v, str): - # return v - - # if v is None and isinstance(values["by"], str): - # return values["by"] - - # raise ValueError("Must give tolerance a name if by is a list.") - - -class TestChecks(TransformParams): +class MetricInputs(TransformParams): """Info for testing a particular check.""" isclose_rtol: confloat(ge=0.0) = 1e-5 @@ -756,18 +737,20 @@ class TestChecks(TransformParams): """Absolute tolerance to use in :func:`np.isclose` for determining equality.""" -class CalculationTestChecks(TransformParams): +class CalculationMetricInputs(TransformParams): """Calc params organized by check type.""" - error_frequency: TestChecks = TestChecks() - relative_error_magnitude: TestChecks = TestChecks(isclose_atol=1e-9) + error_frequency: MetricInputs = MetricInputs() + relative_error_magnitude: MetricInputs = MetricInputs(isclose_atol=1e-9) + absolute_error_magnitude: MetricInputs = MetricInputs() -class GroupedCalculationTolerance(TransformParams): +class CalculationMetricTolerance(TransformParams): """Tolerances for all data checks to be preformed within a grouped df.""" error_frequency: confloat(ge=0.0, le=1.0) = 0.01 relative_error_magnitude: confloat(ge=0.0) = 0.001 + absolute_error_magnitude: confloat(ge=0.0) = np.inf class CalculationGroupChecks(TransformParams): @@ -796,15 +779,16 @@ class CalculationGroupChecks(TransformParams): want to know about it! """ - ungrouped: GroupedCalculationTolerance = GroupedCalculationTolerance( + # TODO: determine how to null a group so we don't have to do all the groups all the time + ungrouped: CalculationMetricTolerance = CalculationMetricTolerance( error_frequency=0.05, relative_error_magnitude=0.01 ) - xbrl_factoid: GroupedCalculationTolerance = GroupedCalculationTolerance() - utility_id_ferc1: GroupedCalculationTolerance = GroupedCalculationTolerance( + xbrl_factoid: CalculationMetricTolerance = CalculationMetricTolerance() + utility_id_ferc1: CalculationMetricTolerance = CalculationMetricTolerance( relative_error_magnitude=0.004 ) - report_year: GroupedCalculationTolerance = GroupedCalculationTolerance() - table_name: GroupedCalculationTolerance = GroupedCalculationTolerance() + report_year: CalculationMetricTolerance = CalculationMetricTolerance() + table_name: CalculationMetricTolerance = CalculationMetricTolerance() # bulk_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.05 # """Fraction of records with non-null reported values and null calculated values.""" @@ -829,7 +813,7 @@ class CalculationChecks(TransformParams): """Input for checking calculations organized by group and test.""" group_checks: CalculationGroupChecks = CalculationGroupChecks() - test_checks: CalculationTestChecks = CalculationTestChecks() + metric_intputs: CalculationMetricInputs = CalculationMetricInputs() class ReconcileTableCalculations(TransformParams): @@ -1142,38 +1126,35 @@ def calculation_check_results_by_group( calculated_df: pd.DataFrame, calculation_checks: CalculationChecks, ) -> pd.DataFrame: - """Tabulate the results of the calculation checks by group.""" + """Tabulate the results of the calculation checks by group. + + Convert all of the groups' checks into a big df. This will have two indexes: first + for the group name (group) and one for the groups values. 
the columns will include three for each test: the test metric that is the same name
+    as the test (ex: error_frequency), the tolerance for that group/test, and a boolean
+    indicating whether or not that metric failed to meet the tolerance.
+    """
     results_dfs = {}
     # for each groupby grouping: calculate metrics for each test
     # then check if each test is within acceptable tolerance levels
-    for by, group_tols in calculation_checks.group_checks:
+    for by, group_checks in calculation_checks.group_checks:
         if by == "ungrouped":
             calculated_df = calculated_df.assign(ungrouped="ungrouped")
         group_metrics = {}
-        for test, tol in group_tols:
-            logger.info(f"{by}: {test}")
-            title_case_test = test.title().replace("_", "")
-            tester = globals()[title_case_test](
-                by=by, test_checks=calculation_checks.test_checks.dict()[test]
+        for metric_name, tol in group_checks:
+            # this feels icky. the param name for the metrics are all snake_case while
+            # the metric classes are all TitleCase. So we convert to TitleCase
+            title_case_test = metric_name.title().replace("_", "")
+            tester = globals()[title_case_test](
+                by=by,
+                metric_intputs=calculation_checks.metric_intputs.dict()[metric_name],
+                metric_tolerance=calculation_checks.group_checks.dict()[by][
+                    metric_name
+                ],
             )
-            group_metrics[test] = tester.metric(df=calculated_df)
-        group_check_df = pd.DataFrame(group_metrics)
-
-        for test, tol in calculation_checks.group_checks.dict()[by].items():
-            group_check_df = group_check_df.assign(
-                **{
-                    f"tolerance_{test}": tol,  # tol is just for reporting so you can know of off you are
-                    f"is_error_{test}": group_check_df[test] > tol,
-                }
- # ) - # bulk_error_frequency = np.nan - # else: - # bulk_error_frequency = len(all_errors) / len(non_null_calculated_df) - - # if bulk_error_frequency > calculation_tolerance.bulk_error_frequency: - # raise AssertionError( - # f"Bulk error rate of calculations in {table_name} is {bulk_error_frequency:.2%}. " - # f"Accepted tolerance is {calculation_tolerance.bulk_error_frequency:.1%}." - # ) - - # Given a particular set of groupby() columns, calculate the fraction of records in - # each group whose calculations do not match reported values based on np.isclose() - # E.g. within each `report_year`. return calculated_df @@ -1241,7 +1188,8 @@ class GroupCheck(BaseModel): """Per metric calculation checks.""" by: str | list[str] - test_checks: TestChecks + metric_intputs: MetricInputs + metric_tolerance: float required_cols: list[str] = [ "table_name", @@ -1259,11 +1207,11 @@ def _is_valid_error_df(self: Self, df: pd.DataFrame): return all(col in df.columns for col in self.required_cols) @abstractmethod - def test_apply(self: Self, gb: DataFrameGroupBy) -> pd.Series: + def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: """Apply method for each individual test.""" ... - def is_error(self, df: pd.DataFrame) -> pd.Series: + def is_not_close(self, df: pd.DataFrame) -> pd.Series: """Flag those records that qualify as an error. NOTE: How should this categorization vary from metric to metric? Why is it @@ -1273,59 +1221,50 @@ def is_error(self, df: pd.DataFrame) -> pd.Series: ~np.isclose( df["calculated_value"], df["reported_value"], - rtol=self.test_checks.isclose_rtol, - atol=self.test_checks.isclose_atol, + rtol=self.metric_intputs.isclose_rtol, + atol=self.metric_intputs.isclose_atol, ) & df["abs_diff"].notnull() ) - def metric(self: Self, df: pd.DataFrame) -> pd.Series: + def calculate_metric(self: Self, df: pd.DataFrame) -> pd.Series: """Calculate the frequency with which records are tagged as errors.""" - df["is_error"] = self.is_error(df) - return df.groupby(self.by).apply(self.test_apply) - - # def report(self: Self, df: pd.DataFrame) -> dict[str, pd.Series]: - # """Report the full distribution of errors by group.""" - # assert self._is_valid_error_df(df) - # df.loc[:, "is_error"] = self.is_error(df) - # error_report = {} - # for tol in self.tols: - # if tol.by == "ungrouped": - # errors = pd.Series(self.metric(df)) - # else: - # errors = df.groupby(by=tol.by).apply(self.metric) - # error_report[tol.name] = errors - # return error_report - - # def check(self: Self, df: pd.DataFrame) -> bool: - # """Check metric for all groups and their tolerances.""" - # error_report = self.report(df) - # message = "" - # error_message = "" - # valid = True - # for tol in self.tols: - # max_error = error_report[tol.name].max() - # msg = ( - # f"{tol.name} max error: {max_error:0.6f}; " - # f"tolerance: {tol.tol:0.6f}\n" - # ) - # if max_error > tol.tol: - # valid = False - # error_message += msg - # else: - # message += msg - # logger.info(message) - # if not valid: - # raise AssertionError(message) + df["is_not_close"] = self.is_not_close(df) + return df.groupby(self.by).apply(self.metric) + + def snake_case_metric_name(self: Self) -> str: + """Convert the TitleCase class name to a snake_case string.""" + class_name = self.__class__.__name__ + return re.sub("(?!^)([A-Z]+)", r"_\1", class_name).lower() + + def check(self: Self, calculated_df) -> pd.DataFrame: + """Make a df w/ the metric, tolerance and is_error columns.""" + metric_name = self.snake_case_metric_name() + df = ( + 
pd.DataFrame(self.calculate_metric(calculated_df), columns=[metric_name])
+            .assign(
+                **{
+                    f"tolerance_{metric_name}": self.metric_tolerance,  # tol is just for reporting so you can know how far off you are
+                    f"is_error_{metric_name}": lambda x: x[metric_name]
+                    > self.metric_tolerance,
+                }
+            )
+            # make a uniform multi-index w/ group name and group values
+            .assign(group=self.by)
+            .reset_index()
+            .rename(columns={self.by: "group_value"})
+            .set_index(["group", "group_value"])
+        )
+        return df


 class ErrorFrequency(GroupCheck):
     """Check error frequency in XBRL calculations."""

-    def test_apply(self: Self, gb: DataFrameGroupBy) -> pd.Series:
+    def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
         """Calculate the frequency with which records are tagged as errors."""
         try:
-            out = gb[gb.is_error].shape[0] / gb.shape[0]
+            out = gb[gb.is_not_close].shape[0] / gb.shape[0]
         except ZeroDivisionError:
             # Will only occur if all reported values are NaN when calculated values
             # exist, or vice versa.
@@ -1339,38 +1278,30 @@ def test_apply(self: Self, gb: DataFrameGroupBy) -> pd.Series:
 class RelativeErrorMagnitude(GroupCheck):
     """Check relative magnitude of errors in XBRL calculations."""

-    def test_apply(self: Self, gb: DataFrameGroupBy) -> float:
+    def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
         """Calculate the magnitude of the errors relative to total reported value."""
         try:
             # Should we be taking the absolute value of the reported column?
-            return gb[gb.is_error].abs_diff.sum() / gb["reported_value"].abs().sum()
+            return gb[gb.is_not_close].abs_diff.sum() / gb["reported_value"].abs().sum()
         except ZeroDivisionError:
             return np.nan


-# class AbsoluteErrorMagnitude(GroupCheck):
-#     """Check relative magnitude of errors in XBRL calculations.
+class AbsoluteErrorMagnitude(GroupCheck):
+    """Check absolute magnitude of errors in XBRL calculations.

-#     These numbers may vary wildly from table to table so no default values for the
-#     expected errors are provided here...
+    These numbers may vary wildly from table to table so no default values for the
+    expected errors are provided here...
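+
+    As a rough sketch with toy values (not real FERC data), within each group the
+    metric reduces to a plain sum of the ``abs_diff`` column::
+
+        >>> import pandas as pd
+        >>> gb = pd.DataFrame({"abs_diff": [1.0, 2.5, 0.5]})
+        >>> gb.abs_diff.sum()
+        4.0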
+ """ -# def metric(self: Self, df: pd.DataFrame) -> float: -# """Calculate the absolute mangnitude of XBRL calculation errors.""" -# return df.abs_diff.sum() + def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: + """Calculate the absolute mangnitude of XBRL calculation errors.""" + return gb.abs_diff.sum() # class NullCalculationFrequency(GroupCheck): # """Check the frequency of null calculated values.""" -# tols: list[CalcTol] = [ -# CalcTol(by="ungrouped", tol=1.0), -# CalcTol(by="table_name", tol=1.0), -# CalcTol(by="xbrl_factoid", tol=1.0), -# CalcTol(by="utility_id_ferc1", tol=1.0), -# CalcTol(by="report_year", tol=1.0), -# ] - # def metric(self: Self, df: pd.DataFrame) -> float: # """Frequency of null calculated values when reported values are not.""" # non_null_reported = df["reported_value"].notnull() From 8c7864d88a830e7b23bd77cb8e04f555d55863a5 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Wed, 18 Oct 2023 15:30:22 -0400 Subject: [PATCH 21/37] add null_calculation_frequency metric --- src/pudl/transform/ferc1.py | 42 ++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 0679fcd3a3..6cded7e596 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -743,6 +743,7 @@ class CalculationMetricInputs(TransformParams): error_frequency: MetricInputs = MetricInputs() relative_error_magnitude: MetricInputs = MetricInputs(isclose_atol=1e-9) absolute_error_magnitude: MetricInputs = MetricInputs() + null_calculation_frequency: MetricInputs = MetricInputs() class CalculationMetricTolerance(TransformParams): @@ -751,6 +752,8 @@ class CalculationMetricTolerance(TransformParams): error_frequency: confloat(ge=0.0, le=1.0) = 0.01 relative_error_magnitude: confloat(ge=0.0) = 0.001 absolute_error_magnitude: confloat(ge=0.0) = np.inf + null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.7 + """Fraction of records with non-null reported values and null calculated values.""" class CalculationGroupChecks(TransformParams): @@ -781,27 +784,18 @@ class CalculationGroupChecks(TransformParams): # TODO: determine how to null a group so we don't have to do all the groups all the time ungrouped: CalculationMetricTolerance = CalculationMetricTolerance( - error_frequency=0.05, relative_error_magnitude=0.01 + error_frequency=0.05, + relative_error_magnitude=0.01, + null_calculation_frequency=0.50, ) xbrl_factoid: CalculationMetricTolerance = CalculationMetricTolerance() utility_id_ferc1: CalculationMetricTolerance = CalculationMetricTolerance( - relative_error_magnitude=0.004 + relative_error_magnitude=0.004, null_calculation_frequency=0.8 ) report_year: CalculationMetricTolerance = CalculationMetricTolerance() table_name: CalculationMetricTolerance = CalculationMetricTolerance() - # bulk_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.05 - # """Fraction of records with non-null reported values and null calculated values.""" - - # utility_id_ferc1_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - # report_year_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - # xbrl_factoid_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - # table_name_null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.1 - - # bulk_null_reported_value_frequency: confloat(ge=0.0, le=1.0) = 0.50 - # """Fraction of records with non-null reported values and null calculated values.""" - - # @validator("utility_id_ferc1", "report_year", "table_name", 
"xbrl_factoid") + # @validator("ungrouped", "utility_id_ferc1", "report_year", "table_name", "xbrl_factoid") # def grouped_tol_ge_ungrouped_tol(v, values): # """Grouped tolerance should always be greater than or equal to ungrouped.""" # if v is not None: @@ -1299,17 +1293,17 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: return gb.abs_diff.sum() -# class NullCalculationFrequency(GroupCheck): -# """Check the frequency of null calculated values.""" +class NullCalculationFrequency(GroupCheck): + """Check the frequency of null calculated values.""" -# def metric(self: Self, df: pd.DataFrame) -> float: -# """Frequency of null calculated values when reported values are not.""" -# non_null_reported = df["reported_value"].notnull() -# null_calculated = df["calculated_value"].isnull() -# try: -# return (non_null_reported & null_calculated).sum() / non_null_reported.sum() -# except ZeroDivisionError: -# return np.nan + def metric(self: Self, gb: pd.DataFrame) -> pd.Series: + """Frequency of null calculated values when reported values are not.""" + non_null_reported = gb["reported_value"].notnull() + null_calculated = gb["calculated_value"].isnull() + try: + return (non_null_reported & null_calculated).sum() / non_null_reported.sum() + except ZeroDivisionError: + return np.nan # class NullReportedValueFrequency(GroupCheck): From 548a9d99fc3793c79c2b49e82a2d57a25a603641 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Wed, 18 Oct 2023 17:32:34 -0400 Subject: [PATCH 22/37] cleanup and enable turning on and off groups and metrics --- src/pudl/transform/ferc1.py | 98 ++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 39 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 6cded7e596..343e7e5b6c 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -744,6 +744,7 @@ class CalculationMetricInputs(TransformParams): relative_error_magnitude: MetricInputs = MetricInputs(isclose_atol=1e-9) absolute_error_magnitude: MetricInputs = MetricInputs() null_calculation_frequency: MetricInputs = MetricInputs() + null_reported_value_frequency: MetricInputs = MetricInputs() class CalculationMetricTolerance(TransformParams): @@ -752,8 +753,9 @@ class CalculationMetricTolerance(TransformParams): error_frequency: confloat(ge=0.0, le=1.0) = 0.01 relative_error_magnitude: confloat(ge=0.0) = 0.001 absolute_error_magnitude: confloat(ge=0.0) = np.inf - null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.7 + null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.02 """Fraction of records with non-null reported values and null calculated values.""" + null_reported_value_frequency: confloat(ge=0.0, le=1.0) = 1.0 class CalculationGroupChecks(TransformParams): @@ -790,7 +792,7 @@ class CalculationGroupChecks(TransformParams): ) xbrl_factoid: CalculationMetricTolerance = CalculationMetricTolerance() utility_id_ferc1: CalculationMetricTolerance = CalculationMetricTolerance( - relative_error_magnitude=0.004, null_calculation_frequency=0.8 + relative_error_magnitude=0.004, null_calculation_frequency=0.6 ) report_year: CalculationMetricTolerance = CalculationMetricTolerance() table_name: CalculationMetricTolerance = CalculationMetricTolerance() @@ -806,8 +808,19 @@ class CalculationGroupChecks(TransformParams): class CalculationChecks(TransformParams): """Input for checking calculations organized by group and test.""" + groups_to_check: list[str] = [ + "ungrouped", + "xbrl_factoid", + "utility_id_ferc1", + ] + metrics_to_check: 
list[str] = [ + "error_frequency", + "relative_error_magnitude", + "null_calculation_frequency", + "null_reported_value_frequency", + ] group_checks: CalculationGroupChecks = CalculationGroupChecks() - metric_intputs: CalculationMetricInputs = CalculationMetricInputs() + metric_inputs: CalculationMetricInputs = CalculationMetricInputs() class ReconcileTableCalculations(TransformParams): @@ -1131,23 +1144,26 @@ def calculation_check_results_by_group( results_dfs = {} # for each groupby grouping: calculate metrics for each test # then check if each test is within acceptable tolerance levels - for by, group_checks in calculation_checks.group_checks: - if by == "ungrouped": + for group_name in calculation_checks.groups_to_check: + if group_name == "ungrouped": calculated_df = calculated_df.assign(ungrouped="ungrouped") group_metrics = {} - for metric_name, tol in group_checks: - # this feels icky. the param name for the metrics are all snake_case while - # the metric classes are all TitleCase. So we convert to TitleCase - title_case_test = metric_name.title().replace("_", "") - tester = globals()[title_case_test]( - by=by, - metric_intputs=calculation_checks.metric_intputs.dict()[metric_name], - metric_tolerance=calculation_checks.group_checks.dict()[by][ - metric_name - ], - ) - group_metrics[metric_name] = tester.check(calculated_df=calculated_df) - results_dfs[by] = pd.concat(group_metrics.values(), axis=1) + for metric_name, metric_tolerance in calculation_checks.group_checks.dict()[ + group_name + ].items(): + # only check the metric if it was turned on in the param. + # TODO: move this logic into the param class? + if metric_name in calculation_checks.metrics_to_check: + # this feels icky. the param name for the metrics are all snake_case while + # the metric classes are all TitleCase. So we convert to TitleCase + title_case_test = metric_name.title().replace("_", "") + tester = globals()[title_case_test]( + by=group_name, + metric_inputs=calculation_checks.metric_inputs.dict()[metric_name], + metric_tolerance=metric_tolerance, + ) + group_metrics[metric_name] = tester.check(calculated_df=calculated_df) + results_dfs[group_name] = pd.concat(group_metrics.values(), axis=1) results = pd.concat(results_dfs.values()) return results @@ -1178,11 +1194,11 @@ def check_calculation_metrics( ######################################################################################## -class GroupCheck(BaseModel): - """Per metric calculation checks.""" +class GroupMetricCheck(BaseModel): + """Base class for checking a particular metric for a particular group.""" - by: str | list[str] - metric_intputs: MetricInputs + by: str # right now this only works with one column! 
bc of index rename in check()
+    metric_inputs: MetricInputs
     metric_tolerance: float

     required_cols: list[str] = [
         "table_name",
@@ -1215,8 +1231,8 @@ def is_not_close(self, df: pd.DataFrame) -> pd.Series:
             ~np.isclose(
                 df["calculated_value"],
                 df["reported_value"],
-                rtol=self.metric_intputs.isclose_rtol,
-                atol=self.metric_intputs.isclose_atol,
+                rtol=self.metric_inputs.isclose_rtol,
+                atol=self.metric_inputs.isclose_atol,
             )
             & df["abs_diff"].notnull()
         )
@@ -1238,7 +1254,7 @@ def check(self: Self, calculated_df) -> pd.DataFrame:
             pd.DataFrame(self.calculate_metric(calculated_df), columns=[metric_name])
             .assign(
                 **{
-                    f"tolerance_{metric_name}": self.metric_tolerance,  # tol is just for reporting so you can know how far off you are
+                    # f"tolerance_{metric_name}": self.metric_tolerance,  # tol is just for reporting so you can know how far off you are
                     f"is_error_{metric_name}": lambda x: x[metric_name]
                     > self.metric_tolerance,
                 }
@@ -1252,7 +1268,7 @@ def check(self: Self, calculated_df) -> pd.DataFrame:
         return df


-class ErrorFrequency(GroupCheck):
+class ErrorFrequency(GroupMetricCheck):
     """Check error frequency in XBRL calculations."""

     def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
@@ -1269,7 +1285,7 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
         return out


-class RelativeErrorMagnitude(GroupCheck):
+class RelativeErrorMagnitude(GroupMetricCheck):
     """Check relative magnitude of errors in XBRL calculations."""

     def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
@@ -1281,7 +1297,7 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
         return np.nan


-class AbsoluteErrorMagnitude(GroupCheck):
+class AbsoluteErrorMagnitude(GroupMetricCheck):
     """Check absolute magnitude of errors in XBRL calculations.

     These numbers may vary wildly from table to table so no default values for the
@@ -1293,10 +1309,18 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
         return gb.abs_diff.sum()


-class NullCalculationFrequency(GroupCheck):
+class NullCalculationFrequency(GroupMetricCheck):
     """Check the frequency of null calculated values."""

-    def metric(self: Self, gb: pd.DataFrame) -> pd.Series:
+    def calculate_metric(self: Self, df: pd.DataFrame) -> pd.Series:
+        """Apply the metric only to records that are within-table calculated values."""
+        return (
+            df[(df.row_type_xbrl == "calculated_value") & (df.is_within_table_calc)]
+            .groupby(self.by)
+            .apply(self.metric)
+        )
+
+    def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
         """Frequency of null calculated values when reported values are not."""
         non_null_reported = gb["reported_value"].notnull()
         null_calculated = gb["calculated_value"].isnull()
@@ -1306,16 +1330,12 @@ def metric(self: Self, gb: pd.DataFrame) -> pd.Series:
         return np.nan


-# class NullReportedValueFrequency(GroupCheck):
-#     """Check the frequency of null reported values."""
-
-#     tols: list[CalcTol] = [
-#         CalcTol(by="ungrouped", tol=0.50),
-#     ]
+class NullReportedValueFrequency(GroupMetricCheck):
+    """Check the frequency of null reported values."""

-#     def metric(self: Self, df: pd.DataFrame) -> float:
-#         """Frequency with which the reported values are Null."""
-#         return df["reported_value"].isnull().sum() / df.shape[0]
+    def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
+        """Frequency with which the reported values are Null."""
+        return gb["reported_value"].isnull().sum() / gb.shape[0]


 def add_corrections(

From 14b26b45e49fba76fd4cbe96632d21201b74b0df Mon Sep 17 00:00:00 2001
From: Christina Gosnell <cgosnell@catalyst.coop>
Date: Tue, 24 Oct 2023 12:59:45 -0400
Subject: [PATCH 23/37] make 
transformers all work w/ new calc checking --- src/pudl/transform/ferc1.py | 165 +++++++++++++++++++++-------- src/pudl/transform/params/ferc1.py | 90 +++++++++++++++- test/unit/transform/ferc1_test.py | 3 +- 3 files changed, 206 insertions(+), 52 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 343e7e5b6c..ef30debd86 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -751,11 +751,12 @@ class CalculationMetricTolerance(TransformParams): """Tolerances for all data checks to be preformed within a grouped df.""" error_frequency: confloat(ge=0.0, le=1.0) = 0.01 - relative_error_magnitude: confloat(ge=0.0) = 0.001 + relative_error_magnitude: confloat(ge=0.0) = 0.04 absolute_error_magnitude: confloat(ge=0.0) = np.inf - null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.02 + null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.7 """Fraction of records with non-null reported values and null calculated values.""" null_reported_value_frequency: confloat(ge=0.0, le=1.0) = 1.0 + # ooof this one is just bad class CalculationGroupChecks(TransformParams): @@ -784,18 +785,30 @@ class CalculationGroupChecks(TransformParams): want to know about it! """ - # TODO: determine how to null a group so we don't have to do all the groups all the time ungrouped: CalculationMetricTolerance = CalculationMetricTolerance( - error_frequency=0.05, - relative_error_magnitude=0.01, + error_frequency=0.0005, + relative_error_magnitude=0.001, null_calculation_frequency=0.50, + null_reported_value_frequency=0.68, + ) + xbrl_factoid: CalculationMetricTolerance = CalculationMetricTolerance( + error_frequency=0.018, + relative_error_magnitude=0.0086, + null_calculation_frequency=1.0, ) - xbrl_factoid: CalculationMetricTolerance = CalculationMetricTolerance() utility_id_ferc1: CalculationMetricTolerance = CalculationMetricTolerance( - relative_error_magnitude=0.004, null_calculation_frequency=0.6 + error_frequency=0.038, + null_calculation_frequency=1.0, + ) + report_year: CalculationMetricTolerance = CalculationMetricTolerance( + error_frequency=0.006 + ) + table_name: CalculationMetricTolerance = CalculationMetricTolerance( + error_frequency=0.0005, + relative_error_magnitude=0.001, + null_calculation_frequency=0.50, + null_reported_value_frequency=0.68, ) - report_year: CalculationMetricTolerance = CalculationMetricTolerance() - table_name: CalculationMetricTolerance = CalculationMetricTolerance() # @validator("ungrouped", "utility_id_ferc1", "report_year", "table_name", "xbrl_factoid") # def grouped_tol_ge_ungrouped_tol(v, values): @@ -810,6 +823,7 @@ class CalculationChecks(TransformParams): groups_to_check: list[str] = [ "ungrouped", + "report_year", "xbrl_factoid", "utility_id_ferc1", ] @@ -930,18 +944,19 @@ def reconcile_table_calculations( calculation_components=intra_table_calcs, calc_idx=calc_idx, value_col=params.column_to_check, - ).pipe( + ) + .pipe( check_calculation_metrics, calculation_checks=params.calculation_checks, ) - # .pipe( - # add_corrections, - # value_col=params.column_to_check, - # calculation_checks=params.calculation_checks, - # table_name=table_name, - # ) + .pipe( + add_corrections, + value_col=params.column_to_check, + calculation_checks=MetricInputs(), + table_name=table_name, + ) # Rename back to the original xbrl_factoid column name before returning: - # .rename(columns={"xbrl_factoid": xbrl_factoid_name}) + .rename(columns={"xbrl_factoid": xbrl_factoid_name}) ) return calculated_df @@ -1145,14 +1160,11 @@ def 
calculation_check_results_by_group( # for each groupby grouping: calculate metrics for each test # then check if each test is within acceptable tolerance levels for group_name in calculation_checks.groups_to_check: - if group_name == "ungrouped": - calculated_df = calculated_df.assign(ungrouped="ungrouped") + logger.info(group_name) group_metrics = {} for metric_name, metric_tolerance in calculation_checks.group_checks.dict()[ group_name ].items(): - # only check the metric if it was turned on in the param. - # TODO: move this logic into the param class? if metric_name in calculation_checks.metrics_to_check: # this feels icky. the param name for the metrics are all snake_case while # the metric classes are all TitleCase. So we convert to TitleCase @@ -1195,12 +1207,24 @@ def check_calculation_metrics( class GroupMetricCheck(BaseModel): - """Base class for checking a particular metric for a particular group.""" - - by: str # right now this only works with one column! bc of index rename in check() + """Base class for checking a particular metric within a group.""" + + by: Literal[ + "ungrouped", "table_name", "xbrl_factoid", "utility_id_ferc1", "report_year" + ] # right now this only works with one column! bc of index rename in check() + """Name of group to check the metric based on. + + Most groups are column names to use in groupby along with the ``table_name``. Both + ``ungrouped`` and ``table_name`` have some special exceptions in order to output the + metric results with the same index of ``table_name`` and ``group_value``. For the + ``ungrouped`` group, the values in ``table_name`` and ``group_value`` will be + ``ungrouped``. For the ``table_name`` group, the values in ``table_name`` and + ``group_value`` will be the table name values. + """ metric_inputs: MetricInputs + """Inputs for the metric to determine :meth:`is_not_close`. Instance of :class:`MetricInputs`.""" metric_tolerance: float - + """Tolerance for checking the metric within the ``by`` group.""" required_cols: list[str] = [ "table_name", "xbrl_factoid", @@ -1212,9 +1236,15 @@ class GroupMetricCheck(BaseModel): "rel_diff", ] - def _is_valid_error_df(self: Self, df: pd.DataFrame): + def check_required_cols_df(self: Self, df: pd.DataFrame): """Check that the input dataframe has all required columns.""" - return all(col in df.columns for col in self.required_cols) + missing_required_cols = [ + col for col in self.required_cols if col not in df.columns + ] + if missing_required_cols: + raise AssertionError( + f"The table is missing the following required columns: {missing_required_cols}" + ) @abstractmethod def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: @@ -1237,10 +1267,21 @@ def is_not_close(self, df: pd.DataFrame) -> pd.Series: & df["abs_diff"].notnull() ) + def groupby_by(self: Self) -> list[str]: + """The list of columns to group by. + + We want to default to adding the table_name into all groupby's, but two of our + ``by`` options need special treatment. 
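+
+        A minimal sketch of the intended behavior (``by`` here stands in for one of
+        the group name strings)::
+
+            >>> def groupby_by(by):
+            ...     if by in ["ungrouped", "table_name"]:
+            ...         return [by]
+            ...     return ["table_name", by]
+            >>> groupby_by("report_year")
+            ['table_name', 'report_year']
+            >>> groupby_by("table_name")
+            ['table_name']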
+ """ + gb_by = ["table_name", self.by] + if self.by in ["ungrouped", "table_name"]: + gb_by = [self.by] + return gb_by + def calculate_metric(self: Self, df: pd.DataFrame) -> pd.Series: """Calculate the frequency with which records are tagged as errors.""" df["is_not_close"] = self.is_not_close(df) - return df.groupby(self.by).apply(self.metric) + return df.groupby(by=self.groupby_by()).apply(self.metric) def snake_case_metric_name(self: Self) -> str: """Convert the TitleCase class name to a snake_case string.""" @@ -1249,12 +1290,20 @@ def snake_case_metric_name(self: Self) -> str: def check(self: Self, calculated_df) -> pd.DataFrame: """Make a df w/ the metric, tolerance and is_error columns.""" + self.check_required_cols_df(calculated_df) + # ungrouped is special because the rest of the stock group names are column + # names. + if self.by == "ungrouped": + calculated_df = calculated_df.assign( + ungrouped="ungrouped", + ) + metric_name = self.snake_case_metric_name() df = ( pd.DataFrame(self.calculate_metric(calculated_df), columns=[metric_name]) .assign( - **{ - # f"tolerance_{metric_name}": self.metric_tolerance, # tol is just for reporting so you can know of off you are + **{ # totolerance_ is just for reporting so you can know of off you are + f"tolerance_{metric_name}": self.metric_tolerance, f"is_error_{metric_name}": lambda x: x[metric_name] > self.metric_tolerance, } @@ -1263,9 +1312,17 @@ def check(self: Self, calculated_df) -> pd.DataFrame: .assign(group=self.by) .reset_index() .rename(columns={self.by: "group_value"}) - .set_index(["group", "group_value"]) ) - return df + # we want to set the index values as the same for all groups, but both the + # ungrouped and table_name group require a special exception because we need + # to add table_name into the columns + if self.by == "ungrouped": + df = df.assign(table_name="ungrouped") + if self.by == "table_name": + # we end up having two columns w/ table_name values in order to keep all the + # outputs having the same indexes. + df = df.assign(table_name=lambda x: x["group_value"]) + return df.set_index(["group", "table_name", "group_value"]) class ErrorFrequency(GroupMetricCheck): @@ -1315,8 +1372,8 @@ class NullCalculationFrequency(GroupMetricCheck): def calculate_metric(self: Self, df: pd.DataFrame) -> pd.Series: """Calculate the frequency with which records are tagged as errors.""" return ( - df[(df.row_type_xbrl == "calculated_value") & (df.is_within_table_calc)] - .groupby(self.by) + df[df.row_type_xbrl == "calculated_value"] + .groupby(self.groupby_by()) .apply(self.metric) ) @@ -1341,7 +1398,7 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: def add_corrections( calculated_df: pd.DataFrame, value_col: str, - calculation_tolerance: CalculationChecks, + calculation_checks: CalculationChecks, table_name: str, ) -> pd.DataFrame: """Add corrections to discrepancies between reported & calculated values. 
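
Taken together, ``is_not_close``, ``metric`` and ``check`` implement one simple
pattern per group. A minimal, self-contained sketch of the ``error_frequency``
case (with toy data and a made-up tolerance, not the real transform inputs)
looks roughly like this:

    import numpy as np
    import pandas as pd

    # Toy reported vs. calculated values across two hypothetical tables.
    df = pd.DataFrame(
        {
            "table_name": ["t1", "t1", "t1", "t2", "t2"],
            "reported_value": [10.0, 20.0, 30.0, 5.0, 6.0],
            "calculated_value": [10.0, 19.0, 30.0, 5.0, 6.5],
        }
    )
    # is_not_close: flag records whose calculated value doesn't match reported.
    df["is_not_close"] = ~np.isclose(
        df["calculated_value"], df["reported_value"], rtol=1e-5, atol=1e-8
    )
    # metric: fraction of flagged records within each group; the mean of a
    # boolean column equals gb[gb.is_not_close].shape[0] / gb.shape[0].
    error_frequency = df.groupby("table_name")["is_not_close"].mean()
    # check: compare the metric to a tolerance and flag failures.
    tolerance = 0.4  # made-up tolerance, for illustration only
    result = pd.DataFrame(
        {
            "error_frequency": error_frequency,
            "tolerance_error_frequency": tolerance,
            "is_error_error_frequency": error_frequency > tolerance,
        }
    )
    # t1 passes (1/3 <= 0.4); t2 fails (1/2 > 0.4).
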
@@ -1363,8 +1420,8 @@ def add_corrections( ~np.isclose( calculated_df["calculated_value"], calculated_df[value_col], - rtol=calculation_tolerance.isclose_rtol, - atol=calculation_tolerance.isclose_atol, + rtol=calculation_checks.isclose_rtol, + atol=calculation_checks.isclose_atol, ) & (calculated_df["abs_diff"].notnull()) ] @@ -2034,12 +2091,12 @@ def add_calculation_corrections( weight=1, ) ) - # for every calc component, also make the parent-only version - correction_parents = correction_components.assign( - table_name_parent=lambda t: t.table_name, - xbrl_factoid_parent=lambda x: x.xbrl_factoid, - ).drop(columns=["table_name", "xbrl_factoid"]) - return pd.concat([calc_components, correction_components, correction_parents]) + # # for every calc component, also make the parent-only version + # correction_parents = correction_components.assign( + # table_name_parent=lambda t: t.table_name, + # xbrl_factoid_parent=lambda x: x.xbrl_factoid, + # ).drop(columns=["table_name", "xbrl_factoid"]) + return pd.concat([calc_components, correction_components]) def get_xbrl_calculation_fixes(self: Self) -> pd.DataFrame: """Grab the XBRL calculation file.""" @@ -2182,7 +2239,7 @@ def process_xbrl_metadata_calculations( calc_fixes=self.get_xbrl_calculation_fixes(), ) .drop_duplicates(keep="first") - # .pipe(self.add_calculation_corrections) + .pipe(self.add_calculation_corrections) ) # this is really a xbrl_factoid-level flag, but we need it while using this # calc components. @@ -5928,6 +5985,15 @@ def table_to_xbrl_factoid_name() -> dict[str, str]: } +def table_to_column_to_check() -> dict[str, str]: + """Build a dictionary of table name (keys) to ``xbrl_factiod`` column name.""" + return { + table_name: transformer().params.reconcile_table_calculations.column_to_check + for (table_name, transformer) in FERC1_TFR_CLASSES.items() + if transformer().params.reconcile_table_calculations.column_to_check + } + + @asset( ins={ table_name: AssetIn(table_name) @@ -6090,7 +6156,12 @@ def calculation_components_xbrl_ferc1(**kwargs) -> pd.DataFrame: f"columns are identical and expected 0.\n{parent_child_dupes=}" ) - assert unexpected_total_components(calc_components, dimensions).empty + if not ( + unexpected_totals := unexpected_total_components( + calc_components.convert_dtypes(), dimensions + ) + ).empty: + raise AssertionError(f"Found unexpected total records: {unexpected_totals}") # Remove convert_dtypes() once we're writing to the DB using enforce_schema() return calc_components.convert_dtypes() @@ -6245,8 +6316,8 @@ def make_calculation_dimensions_explicit( # astypes dealing w/ future warning regarding empty or all null dfs calc_comps_w_dims = pd.concat( [ - calc_comps_w_implied_dims.astype(calc_comps_w_explicit_dims.dtypes), - calc_comps_w_explicit_dims.astype(calc_comps_w_implied_dims.dtypes), + calc_comps_w_implied_dims.convert_dtypes(), + calc_comps_w_explicit_dims.convert_dtypes(), ] ) return calc_comps_w_dims diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py index d331918e0f..70f0ad1fa6 100644 --- a/src/pudl/transform/params/ferc1.py +++ b/src/pudl/transform/params/ferc1.py @@ -2730,7 +2730,13 @@ "reconcile_table_calculations": { "column_to_check": "ending_balance", "calculation_checks": { - "group_checks": {"ungrouped": {"error_frequency": 0.08}}, + "group_checks": { + "ungrouped": {"error_frequency": 0.08}, + "utility_id_ferc1": { + "error_frequency": 0.038, + "relative_error_magnitude": 0.11, + }, + }, }, }, }, @@ -3331,6 +3337,19 @@ 
"reconcile_table_calculations": { "column_to_check": "ending_balance", "subtotal_column": "utility_type", + "calculation_checks": { + "group_checks": { + "ungrouped": {"error_frequency": 0.0012}, + "report_year": { + "null_calculation_frequency": 1.0, + "relative_error_magnitude": 0.041, + }, + "utility_id_ferc1": { + "error_frequency": 0.067, + "null_calculation_frequency": 1.0, + }, + }, + }, }, }, "balance_sheet_assets_ferc1": { @@ -3508,6 +3527,12 @@ "reconcile_table_calculations": { "column_to_check": "dollar_value", "subtotal_column": "plant_function", + "calculation_checks": { + "group_checks": { + "ungrouped": {"null_calculation_frequency": 0.84}, + "report_year": {"null_calculation_frequency": 1.0}, + }, + }, }, }, "electric_operating_revenues_ferc1": { @@ -3639,7 +3664,23 @@ ], }, ], - "reconcile_table_calculations": {"column_to_check": "dollar_value"}, + "reconcile_table_calculations": { + "column_to_check": "dollar_value", + "calculation_checks": { + "group_checks": { + "ungrouped": {"error_frequency": 0.0031}, + "xbrl_factoid": { + "error_frequency": 0.04, + "null_calculation_frequency": 1.0, + "relative_error_magnitude": 0.028, + }, # other_operating_revenues bad + "utility_id_ferc1": { + "error_frequency": 0.17, + "null_calculation_frequency": 1.0, + }, + }, + }, + }, }, "retained_earnings_ferc1": { "rename_columns_ferc1": { @@ -3885,6 +3926,17 @@ "reconcile_table_calculations": { "column_to_check": "dollar_value", "subtotal_column": "utility_type", + "calculation_checks": { + "group_checks": { + "ungrouped": {"null_calculation_frequency": 0.74}, + "report_year": {"null_calculation_frequency": 1.0}, + "utility_id_ferc1": { + "error_frequency": 0.042, + "relative_error_magnitude": 0.22, + "null_calculation_frequency": 1.0, + }, + }, + }, }, }, "electric_plant_depreciation_changes_ferc1": { @@ -3987,7 +4039,21 @@ # Note: this table does not currently get exploded. It will require # additional debugging at a later date. 
"calculation_checks": { - "group_checks": {"ungrouped": {"error_frequency": 0.4}}, + "group_checks": { + "ungrouped": {"error_frequency": 0.056}, + "report_year": { + "error_frequency": 0.078, + "relative_error_magnitude": 0.073, + }, + "utility_id_ferc1": { + "error_frequency": 0.13, + "relative_error_magnitude": 0.66, + }, + "xbrl_factoid": { + "error_frequency": 0.43, + "relative_error_magnitude": 0.078, + }, # ending_balance is bad + }, }, }, }, @@ -4436,7 +4502,23 @@ "rename_columns": {"xbrl_factoid": "expense_type"}, "on": "expense_type", }, - "reconcile_table_calculations": {"column_to_check": "dollar_value"}, + "reconcile_table_calculations": { + "column_to_check": "dollar_value", + "calculation_checks": { + "group_checks": { + "ungrouped": {"relative_error_magnitude": 0.002}, + "report_year": {"relative_error_magnitude": 0.042}, + "xbrl_factoid": { + "error_frequency": 0.018, + "relative_error_magnitude": 0.028, + }, + "utility_id_ferc1": { + "error_frequency": 0.011, + "relative_error_magnitude": 0.066, + }, + }, + }, + }, }, "other_regulatory_liabilities_ferc1": { "rename_columns_ferc1": { diff --git a/test/unit/transform/ferc1_test.py b/test/unit/transform/ferc1_test.py index 9390ccc70f..434539e398 100644 --- a/test/unit/transform/ferc1_test.py +++ b/test/unit/transform/ferc1_test.py @@ -554,6 +554,7 @@ def test_make_calculation_dimensions_explicit(): table_dimensions_ferc1=table_dimensions_trek, dimensions=["dim_x", "dim_y"], ) + .convert_dtypes() .sort_values(calc_comp_idx) .reset_index(drop=True) ) @@ -579,7 +580,7 @@ def test_make_calculation_dimensions_explicit(): table_a,fact_2,table_b,fact_8,next_gen,futile """ ) - ) + ).convert_dtypes() pd.testing.assert_frame_equal(out_trek, expected_trek) # swap the order of the dims to test whether the input order effects the result out_reordered = ( From 7b2b794bd6d78451e200ae9fd828d8cc3d2a5dfd Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Tue, 24 Oct 2023 13:18:22 -0400 Subject: [PATCH 24/37] set explosion inter table calc tolerances & fix lil null tags thing --- src/pudl/output/ferc1.py | 73 +++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 15 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index f6a4ef6de9..3ccae34f84 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -27,17 +27,62 @@ EXPLOSION_CALCULATION_TOLERANCES: dict[str, CalculationChecks] = { "income_statement_ferc1": CalculationChecks( group_checks=CalculationGroupChecks( - ungrouped=CalculationMetricTolerance(error_frequency=0.20) + ungrouped=CalculationMetricTolerance( + error_frequency=0.20, null_calculation_frequency=1.0 + ), + report_year=CalculationMetricTolerance( + error_frequency=0.036, + relative_error_magnitude=0.048, + null_calculation_frequency=1.0, + ), + xbrl_factoid=CalculationMetricTolerance( + error_frequency=0.35, + relative_error_magnitude=0.17, + null_calculation_frequency=1.0, + ), + utility_id_ferc1=CalculationMetricTolerance( + error_frequency=0.13, + relative_error_magnitude=0.42, + null_calculation_frequency=1.0, + ), ) ), "balance_sheet_assets_ferc1": CalculationChecks( group_checks=CalculationGroupChecks( - ungrouped=CalculationMetricTolerance(error_frequency=0.65) + ungrouped=CalculationMetricTolerance( + error_frequency=0.013, null_calculation_frequency=1.0 + ), + report_year=CalculationMetricTolerance( + error_frequency=0.12, null_calculation_frequency=1.0 + ), + xbrl_factoid=CalculationMetricTolerance( + error_frequency=0.37, + 
relative_error_magnitude=0.22, + null_calculation_frequency=1.0, + ), + utility_id_ferc1=CalculationMetricTolerance( + error_frequency=0.21, + relative_error_magnitude=0.26, + null_calculation_frequency=1.0, + ), ) ), "balance_sheet_liabilities_ferc1": CalculationChecks( group_checks=CalculationGroupChecks( - ungrouped=CalculationMetricTolerance(error_frequency=0.07) + ungrouped=CalculationMetricTolerance( + error_frequency=0.07, null_calculation_frequency=1.0 + ), + report_year=CalculationMetricTolerance( + error_frequency=0.028, null_calculation_frequency=1.0 + ), + xbrl_factoid=CalculationMetricTolerance( + error_frequency=0.028, + relative_error_magnitude=0.019, + null_calculation_frequency=1.0, + ), + utility_id_ferc1=CalculationMetricTolerance( + error_frequency=0.063, null_calculation_frequency=1.0 + ), ) ), } @@ -1470,15 +1515,12 @@ def reconcile_intertable_calculations( value_col=self.value_col, ) calculated_df = pudl.transform.ferc1.check_calculation_metrics( - calculated_df=calculated_df, - value_col=self.value_col, - calculation_tolerance=self.calculation_tolerance, - table_name=self.root_table, + calculated_df=calculated_df, calculation_checks=self.calculation_tolerance ) calculated_df = pudl.transform.ferc1.add_corrections( calculated_df=calculated_df, value_col=self.value_col, - calculation_tolerance=self.calculation_tolerance, + calculation_checks=pudl.transform.ferc1.MetricInputs(), table_name=self.root_table, ) logger.info("Checking sub-total calcs.") @@ -1492,9 +1534,7 @@ def reconcile_intertable_calculations( ) subtotal_calcs = pudl.transform.ferc1.check_calculation_metrics( calculated_df=subtotal_calcs, - value_col=self.value_col, - calculation_tolerance=self.calculation_tolerance, - table_name=self.root_table, + calculation_checks=self.calculation_tolerance, ) return calculated_df @@ -1780,8 +1820,8 @@ def check_conflicting_tags(annotated_forest: nx.DiGraph) -> None: nodes = annotated_forest.nodes for ancestor in nodes: for descendant in nx.descendants(annotated_forest, ancestor): - for tag in nodes[ancestor]["tags"]: - if tag in nodes[descendant]["tags"]: + for tag in nodes[ancestor].get("tags", {}): + if tag in nodes[descendant].get("tags", {}): ancestor_tag_value = nodes[ancestor]["tags"][tag] descendant_tag_value = nodes[descendant]["tags"][tag] if ancestor_tag_value != descendant_tag_value: @@ -2064,7 +2104,7 @@ def leafy_meta(self: Self) -> pd.DataFrame: leaf_tags = {} ancestors = list(nx.ancestors(self.annotated_forest, leaf)) + [leaf] for node in ancestors: - leaf_tags |= self.annotated_forest.nodes[node]["tags"] + leaf_tags |= self.annotated_forest.nodes[node].get("tags", {}) all_leaf_weights = { self._get_path_weight(path, self.annotated_forest) for path in nx.all_simple_paths( @@ -2268,5 +2308,8 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame: } index = pd.DataFrame(node_dict.keys()).astype("string") data = pd.DataFrame(node_dict.values()) - tags = pd.json_normalize(data.tags).astype("string") + try: + tags = pd.json_normalize(data.tags).astype("string") + except AttributeError: + tags = pd.DataFrame() return pd.concat([index, tags], axis="columns") From d65c0aa633a7a1f8deb18195cf12457767f4caa7 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 27 Oct 2023 11:05:07 -0400 Subject: [PATCH 25/37] add mega metrics asset and update docs from pr comments bc the is_not_close inputs have changed, lots of the metrics are failing rn... 
stil need to track all of that down --- src/pudl/output/ferc1.py | 98 ++++++--- src/pudl/transform/ferc1.py | 335 ++++++++++++++++++++--------- src/pudl/transform/params/ferc1.py | 32 +-- 3 files changed, 312 insertions(+), 153 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 3ccae34f84..0f5f602c56 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -16,74 +16,104 @@ import pudl from pudl.transform.ferc1 import ( - CalculationChecks, - CalculationGroupChecks, - CalculationMetricTolerance, + GroupMetricChecks, + GroupMetricTolerances, + MetricTolerances, ) logger = pudl.logging_helpers.get_logger(__name__) -EXPLOSION_CALCULATION_TOLERANCES: dict[str, CalculationChecks] = { - "income_statement_ferc1": CalculationChecks( - group_checks=CalculationGroupChecks( - ungrouped=CalculationMetricTolerance( - error_frequency=0.20, null_calculation_frequency=1.0 +EXPLOSION_CALCULATION_TOLERANCES: dict[str, GroupMetricChecks] = { + "income_statement_ferc1": GroupMetricChecks( + groups_to_check=[ + "ungrouped", + "report_year", + "xbrl_factoid", + "utility_id_ferc1", + ], + group_metric_tolerances=GroupMetricTolerances( + ungrouped=MetricTolerances( + error_frequency=0.02, + relative_error_magnitude=0.04, + null_calculation_frequency=1.0, ), - report_year=CalculationMetricTolerance( + report_year=MetricTolerances( error_frequency=0.036, relative_error_magnitude=0.048, null_calculation_frequency=1.0, ), - xbrl_factoid=CalculationMetricTolerance( + xbrl_factoid=MetricTolerances( error_frequency=0.35, relative_error_magnitude=0.17, null_calculation_frequency=1.0, ), - utility_id_ferc1=CalculationMetricTolerance( + utility_id_ferc1=MetricTolerances( error_frequency=0.13, relative_error_magnitude=0.42, null_calculation_frequency=1.0, ), - ) + ), ), - "balance_sheet_assets_ferc1": CalculationChecks( - group_checks=CalculationGroupChecks( - ungrouped=CalculationMetricTolerance( - error_frequency=0.013, null_calculation_frequency=1.0 + "balance_sheet_assets_ferc1": GroupMetricChecks( + groups_to_check=[ + "ungrouped", + "report_year", + "xbrl_factoid", + "utility_id_ferc1", + ], + group_metric_tolerances=GroupMetricTolerances( + ungrouped=MetricTolerances( + error_frequency=0.013, + relative_error_magnitude=0.04, + null_calculation_frequency=1.0, ), - report_year=CalculationMetricTolerance( - error_frequency=0.12, null_calculation_frequency=1.0 + report_year=MetricTolerances( + error_frequency=0.12, + relative_error_magnitude=0.04, + null_calculation_frequency=1.0, ), - xbrl_factoid=CalculationMetricTolerance( + xbrl_factoid=MetricTolerances( error_frequency=0.37, relative_error_magnitude=0.22, null_calculation_frequency=1.0, ), - utility_id_ferc1=CalculationMetricTolerance( + utility_id_ferc1=MetricTolerances( error_frequency=0.21, relative_error_magnitude=0.26, null_calculation_frequency=1.0, ), - ) + ), ), - "balance_sheet_liabilities_ferc1": CalculationChecks( - group_checks=CalculationGroupChecks( - ungrouped=CalculationMetricTolerance( - error_frequency=0.07, null_calculation_frequency=1.0 + "balance_sheet_liabilities_ferc1": GroupMetricChecks( + groups_to_check=[ + "ungrouped", + "report_year", + "xbrl_factoid", + "utility_id_ferc1", + ], + group_metric_tolerances=GroupMetricTolerances( + ungrouped=MetricTolerances( + error_frequency=0.07, + relative_error_magnitude=0.04, + null_calculation_frequency=1.0, ), - report_year=CalculationMetricTolerance( - error_frequency=0.028, null_calculation_frequency=1.0 + report_year=MetricTolerances( + 
error_frequency=0.028, + relative_error_magnitude=0.04, + null_calculation_frequency=1.0, ), - xbrl_factoid=CalculationMetricTolerance( + xbrl_factoid=MetricTolerances( error_frequency=0.028, relative_error_magnitude=0.019, null_calculation_frequency=1.0, ), - utility_id_ferc1=CalculationMetricTolerance( - error_frequency=0.063, null_calculation_frequency=1.0 + utility_id_ferc1=MetricTolerances( + error_frequency=0.063, + relative_error_magnitude=0.04, + null_calculation_frequency=1.0, ), - ) + ), ), } @@ -1016,7 +1046,7 @@ def exploded_table_asset_factory( root_table: str, table_names_to_explode: list[str], seed_nodes: list[NodeId], - calculation_tolerance: CalculationChecks, + calculation_tolerance: GroupMetricChecks, io_manager_key: str | None = None, ) -> AssetsDefinition: """Create an exploded table based on a set of related input tables.""" @@ -1146,7 +1176,7 @@ def __init__( calculation_components_xbrl_ferc1: pd.DataFrame, seed_nodes: list[NodeId], tags: pd.DataFrame = pd.DataFrame(), - calculation_tolerance: CalculationChecks = CalculationChecks(), + calculation_tolerance: GroupMetricChecks = GroupMetricChecks(), ): """Instantiate an Exploder class. @@ -1586,7 +1616,7 @@ class XbrlCalculationForestFerc1(BaseModel): exploded_calcs: pd.DataFrame = pd.DataFrame() seeds: list[NodeId] = [] tags: pd.DataFrame = pd.DataFrame() - calculation_tolerance: CalculationChecks = CalculationChecks() + calculation_tolerance: GroupMetricChecks = GroupMetricChecks() class Config: """Allow the class to store a dataframe.""" diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index ef30debd86..50ee50ef6c 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -727,7 +727,7 @@ def combine_axis_columns_xbrl( return df -class MetricInputs(TransformParams): +class IsCloseTolerance(TransformParams): """Info for testing a particular check.""" isclose_rtol: confloat(ge=0.0) = 1e-5 @@ -737,29 +737,29 @@ class MetricInputs(TransformParams): """Absolute tolerance to use in :func:`np.isclose` for determining equality.""" -class CalculationMetricInputs(TransformParams): +class CalculationIsCloseTolerance(TransformParams): """Calc params organized by check type.""" - error_frequency: MetricInputs = MetricInputs() - relative_error_magnitude: MetricInputs = MetricInputs(isclose_atol=1e-9) - absolute_error_magnitude: MetricInputs = MetricInputs() - null_calculation_frequency: MetricInputs = MetricInputs() - null_reported_value_frequency: MetricInputs = MetricInputs() + error_frequency: IsCloseTolerance = IsCloseTolerance() + relative_error_magnitude: IsCloseTolerance = IsCloseTolerance(isclose_atol=1e-3) + null_calculated_value_frequency: IsCloseTolerance = IsCloseTolerance() + null_calculation_frequency: IsCloseTolerance = IsCloseTolerance() + null_reported_value_frequency: IsCloseTolerance = IsCloseTolerance() -class CalculationMetricTolerance(TransformParams): +class MetricTolerances(TransformParams): """Tolerances for all data checks to be preformed within a grouped df.""" error_frequency: confloat(ge=0.0, le=1.0) = 0.01 relative_error_magnitude: confloat(ge=0.0) = 0.04 - absolute_error_magnitude: confloat(ge=0.0) = np.inf null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.7 """Fraction of records with non-null reported values and null calculated values.""" + null_calculated_value_frequency: confloat(ge=0.0) = np.inf null_reported_value_frequency: confloat(ge=0.0, le=1.0) = 1.0 # ooof this one is just bad -class CalculationGroupChecks(TransformParams): +class 
GroupMetricTolerances(TransformParams): """Data quality expectations related to FERC 1 calculations. We are doing a lot of comparisons between calculated and reported values to identify @@ -785,43 +785,43 @@ class CalculationGroupChecks(TransformParams): want to know about it! """ - ungrouped: CalculationMetricTolerance = CalculationMetricTolerance( + ungrouped: MetricTolerances = MetricTolerances( error_frequency=0.0005, relative_error_magnitude=0.001, null_calculation_frequency=0.50, null_reported_value_frequency=0.68, ) - xbrl_factoid: CalculationMetricTolerance = CalculationMetricTolerance( + xbrl_factoid: MetricTolerances = MetricTolerances( error_frequency=0.018, relative_error_magnitude=0.0086, null_calculation_frequency=1.0, ) - utility_id_ferc1: CalculationMetricTolerance = CalculationMetricTolerance( + utility_id_ferc1: MetricTolerances = MetricTolerances( error_frequency=0.038, + relative_error_magnitude=0.04, null_calculation_frequency=1.0, ) - report_year: CalculationMetricTolerance = CalculationMetricTolerance( - error_frequency=0.006 + report_year: MetricTolerances = MetricTolerances( + error_frequency=0.006, + relative_error_magnitude=0.04, + null_calculation_frequency=0.7, ) - table_name: CalculationMetricTolerance = CalculationMetricTolerance( + table_name: MetricTolerances = MetricTolerances( error_frequency=0.0005, relative_error_magnitude=0.001, null_calculation_frequency=0.50, null_reported_value_frequency=0.68, ) - # @validator("ungrouped", "utility_id_ferc1", "report_year", "table_name", "xbrl_factoid") - # def grouped_tol_ge_ungrouped_tol(v, values): - # """Grouped tolerance should always be greater than or equal to ungrouped.""" - # if v is not None: - # assert v >= values["ungrouped"] - # return v - -class CalculationChecks(TransformParams): +class GroupMetricChecks(TransformParams): """Input for checking calculations organized by group and test.""" - groups_to_check: list[str] = [ + groups_to_check: list[ + Literal[ + "ungrouped", "table_name", "xbrl_factoid", "utility_id_ferc1", "report_year" + ] + ] = [ "ungrouped", "report_year", "xbrl_factoid", @@ -833,8 +833,25 @@ class CalculationChecks(TransformParams): "null_calculation_frequency", "null_reported_value_frequency", ] - group_checks: CalculationGroupChecks = CalculationGroupChecks() - metric_inputs: CalculationMetricInputs = CalculationMetricInputs() + group_metric_tolerances: GroupMetricTolerances = GroupMetricTolerances() + is_close_tolerance: CalculationIsCloseTolerance = CalculationIsCloseTolerance() + + # @root_validator + # def grouped_tol_ge_ungrouped_tol(cls, values): + # """Grouped tolerance should always be greater than or equal to ungrouped.""" + # group_metric_tolerances = values["group_metric_tolerances"] + # groups_to_check = values["groups_to_check"] + # for group in groups_to_check: + # metric_tolerances = group_metric_tolerances.dict().get(group) + # for metric_name, tolerance in metric_tolerances.items(): + # ungrouped_tolerance = group_metric_tolerances.dict()["ungrouped"].get( + # metric_name + # ) + # if tolerance < ungrouped_tolerance: + # raise AssertionError( + # f"In {group=}, {tolerance=} for {metric_name} should be greater than {ungrouped_tolerance=}." + # ) + # return values class ReconcileTableCalculations(TransformParams): @@ -846,7 +863,7 @@ class ReconcileTableCalculations(TransformParams): This will typically be ``dollar_value`` or ``ending_balance`` column for the income statement and the balance sheet tables. 
""" - calculation_checks: CalculationChecks = CalculationChecks() + group_metric_checks: GroupMetricChecks = GroupMetricChecks() """Fraction of calculated values which we allow not to match reported values.""" subtotal_column: str | None = None @@ -947,12 +964,12 @@ def reconcile_table_calculations( ) .pipe( check_calculation_metrics, - calculation_checks=params.calculation_checks, + group_metric_checks=params.group_metric_checks, ) .pipe( add_corrections, value_col=params.column_to_check, - calculation_checks=MetricInputs(), + group_metric_checks=IsCloseTolerance(), table_name=table_name, ) # Rename back to the original xbrl_factoid column name before returning: @@ -1003,7 +1020,7 @@ def _check_subtotal_calculations( ) subtotal_calcs = check_calculation_metrics( calculated_df=subtotal_calcs, - calculation_checks=params.calculation_checks, + group_metric_checks=params.group_metric_checks, ) @@ -1144,9 +1161,9 @@ def calculate_values_from_components( return calculated_df -def calculation_check_results_by_group( +def check_calculation_metrics_by_group( calculated_df: pd.DataFrame, - calculation_checks: CalculationChecks, + group_metric_checks: GroupMetricChecks, ) -> pd.DataFrame: """Tabulate the results of the calculation checks by group. @@ -1159,34 +1176,54 @@ def calculation_check_results_by_group( results_dfs = {} # for each groupby grouping: calculate metrics for each test # then check if each test is within acceptable tolerance levels - for group_name in calculation_checks.groups_to_check: + for group_name in group_metric_checks.groups_to_check: logger.info(group_name) group_metrics = {} - for metric_name, metric_tolerance in calculation_checks.group_checks.dict()[ - group_name - ].items(): - if metric_name in calculation_checks.metrics_to_check: + for ( + metric_name, + metric_tolerance, + ) in group_metric_checks.group_metric_tolerances.dict()[group_name].items(): + if metric_name in group_metric_checks.metrics_to_check: # this feels icky. the param name for the metrics are all snake_case while # the metric classes are all TitleCase. So we convert to TitleCase title_case_test = metric_name.title().replace("_", "") - tester = globals()[title_case_test]( + group_metric_checker = globals()[title_case_test]( by=group_name, - metric_inputs=calculation_checks.metric_inputs.dict()[metric_name], + is_close_tolerance=group_metric_checks.is_close_tolerance.dict()[ + metric_name + ], metric_tolerance=metric_tolerance, ) - group_metrics[metric_name] = tester.check(calculated_df=calculated_df) - results_dfs[group_name] = pd.concat(group_metrics.values(), axis=1) - results = pd.concat(results_dfs.values()) + group_metric = group_metric_checker.check( + calculated_df=calculated_df + ).rename(columns={group_name: "group_value"}) + # we want to set the index values as the same for all groups, but both the + # ungrouped and table_name group require a special exception because we need + # to add table_name into the columns + if group_name == "ungrouped": + group_metric = group_metric.assign(table_name="ungrouped") + if group_name == "table_name": + # we end up having two columns w/ table_name values in order to keep all the + # outputs having the same indexes. 
+ group_metric = group_metric.assign( + table_name=lambda x: x["group_value"] + ) + # make a uniform multi-index w/ group name and group values + group_metrics[metric_name] = group_metric.set_index( + ["group", "table_name", "group_value"] + ) + results_dfs[group_name] = pd.concat(group_metrics.values(), axis="columns") + results = pd.concat(results_dfs.values(), axis="index") return results def check_calculation_metrics( calculated_df: pd.DataFrame, - calculation_checks: CalculationChecks, + group_metric_checks: GroupMetricChecks, ) -> pd.DataFrame: """Run the calculation metrics and determine if calculations are within tolerance.""" # DO ERROR CHECKS - results = calculation_check_results_by_group(calculated_df, calculation_checks) + results = check_calculation_metrics_by_group(calculated_df, group_metric_checks) # get the records w/ errors in any! of their checks errors = results[results.filter(like="is_error").any(axis=1)] if not errors.empty: @@ -1206,25 +1243,31 @@ def check_calculation_metrics( ######################################################################################## -class GroupMetricCheck(BaseModel): +class ErrorMetric(BaseModel): """Base class for checking a particular metric within a group.""" by: Literal[ "ungrouped", "table_name", "xbrl_factoid", "utility_id_ferc1", "report_year" - ] # right now this only works with one column! bc of index rename in check() + ] """Name of group to check the metric based on. + Right now this only works with one column! Mostly because the outputs of :meth:`check` + are expected to be structured similarly and :meth:`groupby_cols` expects ``by`` to + be one column name. + Most groups are column names to use in groupby along with the ``table_name``. Both - ``ungrouped`` and ``table_name`` have some special exceptions in order to output the - metric results with the same index of ``table_name`` and ``group_value``. For the - ``ungrouped`` group, the values in ``table_name`` and ``group_value`` will be - ``ungrouped``. For the ``table_name`` group, the values in ``table_name`` and - ``group_value`` will be the table name values. + ``ungrouped`` and ``table_name`` have some special exceptions. ``ungrouped`` is not + a native column, so we assign an ``ungrouped`` column witin :meth:`check`. While all + other ``by`` values get combined with ``table_name`` in :meth:`groupby_cols`, + ``ungrouped`` and ``table_name`` are the only columns used in the groupby. """ - metric_inputs: MetricInputs - """Inputs for the metric to determine :meth:`is_not_close`. Instance of :class:`MetricInputs`.""" + + is_close_tolerance: IsCloseTolerance + """Inputs for the metric to determine :meth:`is_not_close`. 
Instance of :class:`IsCloseTolerance`.""" + metric_tolerance: float """Tolerance for checking the metric within the ``by`` group.""" + required_cols: list[str] = [ "table_name", "xbrl_factoid", @@ -1236,7 +1279,7 @@ class GroupMetricCheck(BaseModel): "rel_diff", ] - def check_required_cols_df(self: Self, df: pd.DataFrame): + def has_required_cols(self: Self, df: pd.DataFrame): """Check that the input dataframe has all required columns.""" missing_required_cols = [ col for col in self.required_cols if col not in df.columns @@ -1245,14 +1288,15 @@ def check_required_cols_df(self: Self, df: pd.DataFrame): raise AssertionError( f"The table is missing the following required columns: {missing_required_cols}" ) + return True @abstractmethod def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: - """Apply method for each individual test.""" + """Metric function that will be applied to each group of values being checked.""" ... def is_not_close(self, df: pd.DataFrame) -> pd.Series: - """Flag those records that qualify as an error. + """Flag records where reported and calculated values differ significantly. NOTE: How should this categorization vary from metric to metric? Why is it reasonable to only look at records where ``abs_diff`` is not Null? @@ -1261,13 +1305,14 @@ def is_not_close(self, df: pd.DataFrame) -> pd.Series: ~np.isclose( df["calculated_value"], df["reported_value"], - rtol=self.metric_inputs.isclose_rtol, - atol=self.metric_inputs.isclose_atol, + rtol=self.is_close_tolerance.isclose_rtol, + atol=self.is_close_tolerance.isclose_atol, ) - & df["abs_diff"].notnull() + # & df["abs_diff"].notnull() + & (df.row_type_xbrl == "calculated_value") ) - def groupby_by(self: Self) -> list[str]: + def groupby_cols(self: Self) -> list[str]: """The list of columns to group by. We want to default to adding the table_name into all groupby's, but two of our @@ -1278,19 +1323,24 @@ def groupby_by(self: Self) -> list[str]: gb_by = [self.by] return gb_by - def calculate_metric(self: Self, df: pd.DataFrame) -> pd.Series: - """Calculate the frequency with which records are tagged as errors.""" + def apply_metric(self: Self, df: pd.DataFrame) -> pd.Series: + """Generate the metric values within each group through an apply method. + + This method adds a column ``is_not_close`` into the df before the groupby + because that column is used in many of the :meth:`metric`. + """ + # return a df instead of a series df["is_not_close"] = self.is_not_close(df) - return df.groupby(by=self.groupby_by()).apply(self.metric) + return df.groupby(by=self.groupby_cols()).apply(self.metric) - def snake_case_metric_name(self: Self) -> str: + def _snake_case_metric_name(self: Self) -> str: """Convert the TitleCase class name to a snake_case string.""" class_name = self.__class__.__name__ return re.sub("(?!^)([A-Z]+)", r"_\1", class_name).lower() def check(self: Self, calculated_df) -> pd.DataFrame: """Make a df w/ the metric, tolerance and is_error columns.""" - self.check_required_cols_df(calculated_df) + self.has_required_cols(calculated_df) # ungrouped is special because the rest of the stock group names are column # names. 
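         # (An illustrative aside: assigning a constant ungrouped="ungrouped"
         # column lets this single code path treat the entire dataframe as one
         # big group, rather than special-casing the ungrouped metrics.)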
if self.by == "ungrouped": @@ -1298,9 +1348,9 @@ def check(self: Self, calculated_df) -> pd.DataFrame: ungrouped="ungrouped", ) - metric_name = self.snake_case_metric_name() + metric_name = self._snake_case_metric_name() df = ( - pd.DataFrame(self.calculate_metric(calculated_df), columns=[metric_name]) + pd.DataFrame(self.apply_metric(calculated_df), columns=[metric_name]) .assign( **{ # totolerance_ is just for reporting so you can know of off you are f"tolerance_{metric_name}": self.metric_tolerance, @@ -1308,24 +1358,13 @@ def check(self: Self, calculated_df) -> pd.DataFrame: > self.metric_tolerance, } ) - # make a uniform multi-index w/ group name and group values .assign(group=self.by) .reset_index() - .rename(columns={self.by: "group_value"}) ) - # we want to set the index values as the same for all groups, but both the - # ungrouped and table_name group require a special exception because we need - # to add table_name into the columns - if self.by == "ungrouped": - df = df.assign(table_name="ungrouped") - if self.by == "table_name": - # we end up having two columns w/ table_name values in order to keep all the - # outputs having the same indexes. - df = df.assign(table_name=lambda x: x["group_value"]) - return df.set_index(["group", "table_name", "group_value"]) + return df -class ErrorFrequency(GroupMetricCheck): +class ErrorFrequency(ErrorMetric): """Check error frequency in XBRL calculations.""" def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: @@ -1342,19 +1381,18 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: return out -class RelativeErrorMagnitude(GroupMetricCheck): +class RelativeErrorMagnitude(ErrorMetric): """Check relative magnitude of errors in XBRL calculations.""" def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: """Calculate the mangnitude of the errors relative to total reported value.""" try: - # Should we be taking the absolute value of the reported column? - return gb[gb.is_not_close].abs_diff.sum() / gb["reported_value"].abs().sum() + return gb.abs_diff.abs().sum() / gb["reported_value"].abs().sum() except ZeroDivisionError: return np.nan -class AbsoluteErrorMagnitude(GroupMetricCheck): +class NullCalculatedValueFrequency(ErrorMetric): """Check relative magnitude of errors in XBRL calculations. 
These numbers may vary wildly from table to table so no default values for the
     expected errors are provided here...
     """

     def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
         """Calculate the absolute magnitude of XBRL calculation errors."""
-        return gb.abs_diff.sum()
+        return gb.abs_diff.abs().sum()


-class NullCalculationFrequency(GroupMetricCheck):
+class NullCalculationFrequency(ErrorMetric):
     """Check the frequency of null calculated values."""

-    def calculate_metric(self: Self, df: pd.DataFrame) -> pd.Series:
-        """Calculate the frequency with which records are tagged as errors."""
+    def apply_metric(self: Self, df: pd.DataFrame) -> pd.Series:
+        """Only apply metric to rows that contain calculated values."""
         return (
             df[df.row_type_xbrl == "calculated_value"]
-            .groupby(self.groupby_by())
+            .groupby(self.groupby_cols())
             .apply(self.metric)
         )

     def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
-        """Frequency of null calculated values when reported values are not."""
+        """Fraction of non-null reported values whose corresponding calculated values are null."""
         non_null_reported = gb["reported_value"].notnull()
         null_calculated = gb["calculated_value"].isnull()
         try:
@@ -1387,7 +1425,7 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
             return np.nan


-class NullReportedValueFrequency(GroupMetricCheck):
+class NullReportedValueFrequency(ErrorMetric):
     """Check the frequency of null reported values."""

     def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
@@ -1398,7 +1436,7 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series:
 def add_corrections(
     calculated_df: pd.DataFrame,
     value_col: str,
-    calculation_checks: CalculationChecks,
+    group_metric_checks: GroupMetricChecks,
     table_name: str,
 ) -> pd.DataFrame:
     """Add corrections to discrepancies between reported & calculated values.
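Both the error flagging above and the correction records below rest on the same ``np.isclose`` pattern. Here is a minimal, self-contained sketch of that pattern with toy values (the column names mirror this module, but the data and the rtol/atol tolerances are illustrative, not PUDL output or a drop-in for these functions):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "reported_value": [100.0, 50.0, np.nan],
        "calculated_value": [100.0000001, 49.0, 10.0],
    }
).assign(abs_diff=lambda x: (x.calculated_value - x.reported_value).abs())

# A record only counts as an error when the two values differ beyond the
# rtol/atol tolerances AND a difference could actually be computed:
is_not_close = ~np.isclose(
    df["calculated_value"], df["reported_value"], rtol=1e-5, atol=1e-8
) & df["abs_diff"].notnull()

# A correction record holds the amount that makes calculated + correction
# equal the reported total; only the 50.0 vs 49.0 row survives the mask.
corrections = df[is_not_close].copy()
corrections["reported_value"] = (
    corrections["reported_value"].fillna(0.0) - corrections["calculated_value"]
)
```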
@@ -1420,11 +1458,11 @@ def add_corrections(
         ~np.isclose(
             calculated_df["calculated_value"],
             calculated_df[value_col],
-            rtol=calculation_checks.isclose_rtol,
-            atol=calculation_checks.isclose_atol,
+            rtol=group_metric_checks.isclose_rtol,
+            atol=group_metric_checks.isclose_atol,
         )
         & (calculated_df["abs_diff"].notnull())
-    ]
+    ].copy()

     corrections[value_col] = (
         corrections[value_col].fillna(0.0) - corrections["calculated_value"]
@@ -2091,11 +2129,11 @@ def add_calculation_corrections(
                 weight=1,
             )
         )
-        # # for every calc component, also make the parent-only version
-        # correction_parents = correction_components.assign(
-        #     table_name_parent=lambda t: t.table_name,
-        #     xbrl_factoid_parent=lambda x: x.xbrl_factoid,
-        # ).drop(columns=["table_name", "xbrl_factoid"])
+        # for every calc component, also make the parent-only version
+        correction_parents = correction_components.assign(
+            table_name_parent=lambda t: t.table_name,
+            xbrl_factoid_parent=lambda x: x.xbrl_factoid,
+        ).drop(columns=["table_name", "xbrl_factoid"])
         return pd.concat([calc_components, correction_components])

     def get_xbrl_calculation_fixes(self: Self) -> pd.DataFrame:
@@ -5985,8 +6023,8 @@ def table_to_xbrl_factoid_name() -> dict[str, str]:
     }


-def table_to_column_to_check() -> dict[str, str]:
-    """Build a dictionary of table name (keys) to ``xbrl_factiod`` column name."""
+def table_to_column_to_check() -> dict[str, list[str]]:
+    """Map each table name to its ``column_to_check`` from ``reconcile_table_calculations``."""
     return {
         table_name: transformer().params.reconcile_table_calculations.column_to_check
         for (table_name, transformer) in FERC1_TFR_CLASSES.items()
@@ -6510,3 +6548,94 @@ def infer_intra_factoid_totals(
     )
     assert calcs_with_totals[calcs_with_totals.duplicated()].empty
     return calcs_with_totals
+
+
+@asset(
+    ins={
+        table_name: AssetIn(table_name)
+        # list of tables that have reconcile_table_calculations params
+        # minus electric_plant_depreciation_changes_ferc1 bc that table is messy and
+        # not actually in the explosion work
+        for table_name in [
+            "plant_in_service_ferc1",
+            "utility_plant_summary_ferc1",
+            "electric_operating_expenses_ferc1",
+            "balance_sheet_liabilities_ferc1",
+            "depreciation_amortization_summary_ferc1",
+            "balance_sheet_assets_ferc1",
+            "income_statement_ferc1",
+            "electric_plant_depreciation_functional_ferc1",
+            "retained_earnings_ferc1",
+            "electric_operating_revenues_ferc1",
+        ]
+    }
+    | {
+        "calculation_components_xbrl_ferc1": AssetIn(
+            "calculation_components_xbrl_ferc1"
+        )
+    },
+)
+def _core_ferc1__calculation_metric_checks(**kwargs):
+    """Check calculation metrics for all transformed tables that have reconciled calcs."""
+    calculation_components = kwargs["calculation_components_xbrl_ferc1"]
+    transformed_ferc1_dfs = {
+        name: df
+        for (name, df) in kwargs.items()
+        if name not in ["calculation_components_xbrl_ferc1"]
+    }
+    # standardize the two key columns we are going to use into generic names
+    xbrl_factoid_name = table_to_xbrl_factoid_name()
+    columns_to_check = table_to_column_to_check()
+    tbls = [
+        df.assign(table_name=name).rename(
+            columns={
+                xbrl_factoid_name[name]: "xbrl_factoid",
+                columns_to_check[name]: "column_to_check",
+            }
+        )
+        for (name, df) in transformed_ferc1_dfs.items()
+    ]
+    transformed_ferc1 = pd.concat(tbls)
+    # restrict the calculation components to only the bits we want
+    calculation_components = calculation_components[
+        # remove total to subdimensions
+        ~calculation_components.is_total_to_subdimensions_calc
+        # only the intra table calcs
+        &
calculation_components.is_within_table_calc
+        # remove corrections (bc they would clean up the calcs so the errors wouldn't show up)
+        & ~calculation_components.xbrl_factoid.str.contains("_correction")
+        # remove all of the tables that aren't in this check
+        & calculation_components.table_name_parent.isin(transformed_ferc1_dfs.keys())
+    ]
+
+    calc_idx = ["xbrl_factoid", "table_name"] + other_dimensions(
+        table_names=list(FERC1_TFR_CLASSES)
+    )
+    calculated_df = calculate_values_from_components(
+        data=transformed_ferc1,
+        calculation_components=calculation_components,
+        calc_idx=calc_idx,
+        value_col="column_to_check",
+    )
+    calculation_metrics = check_calculation_metrics_by_group(
+        calculated_df=calculated_df,
+        group_metric_checks=GroupMetricChecks(
+            groups_to_check=[
+                "ungrouped",
+                "table_name",
+                "xbrl_factoid",
+                "utility_id_ferc1",
+                "report_year",
+            ]
+        ),
+    )
+
+    errors = calculation_metrics[
+        calculation_metrics.filter(like="is_error").any(axis=1)
+    ]
+    if len(errors) > 24:
+        raise AssertionError(
+            f"Found {len(errors)} errors in the results of check_calculation_metrics_by_group "
+            f"with default group values when no more than 24 were expected.\n{errors}"
+        )
+    return calculation_metrics
diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py
index 70f0ad1fa6..99ce263edb 100644
--- a/src/pudl/transform/params/ferc1.py
+++ b/src/pudl/transform/params/ferc1.py
@@ -2729,8 +2729,8 @@
     # Known issue with reporting of construction in progress not classified in classified fields of table.
     "reconcile_table_calculations": {
         "column_to_check": "ending_balance",
-        "calculation_checks": {
-            "group_checks": {
+        "group_metric_checks": {
+            "group_metric_tolerances": {
                 "ungrouped": {"error_frequency": 0.08},
                 "utility_id_ferc1": {
                     "error_frequency": 0.038,
@@ -3337,8 +3337,8 @@
     "reconcile_table_calculations": {
         "column_to_check": "ending_balance",
         "subtotal_column": "utility_type",
-        "calculation_checks": {
-            "group_checks": {
+        "group_metric_checks": {
+            "group_metric_tolerances": {
                 "ungrouped": {"error_frequency": 0.0012},
                 "report_year": {
                     "null_calculation_frequency": 1.0,
@@ -3527,8 +3527,8 @@
     "reconcile_table_calculations": {
         "column_to_check": "dollar_value",
         "subtotal_column": "plant_function",
-        "calculation_checks": {
-            "group_checks": {
+        "group_metric_checks": {
+            "group_metric_tolerances": {
                 "ungrouped": {"null_calculation_frequency": 0.84},
                 "report_year": {"null_calculation_frequency": 1.0},
             },
@@ -3666,8 +3666,8 @@
     ],
     "reconcile_table_calculations": {
         "column_to_check": "dollar_value",
-        "calculation_checks": {
-            "group_checks": {
+        "group_metric_checks": {
+            "group_metric_tolerances": {
                 "ungrouped": {"error_frequency": 0.0031},
                 "xbrl_factoid": {
                     "error_frequency": 0.04,
@@ -3758,8 +3758,8 @@
     "strip_non_numeric_values": {"amount": {"strip_non_numeric_values": True}},
     "reconcile_table_calculations": {
         "column_to_check": "ending_balance",
-        "calculation_checks": {
-            "group_checks": {"ungrouped": {"error_frequency": 0.08}},
+        "group_metric_checks": {
+            "group_metric_tolerances": {"ungrouped": {"error_frequency": 0.08}},
         },
     },
 },
@@ -3926,8 +3926,8 @@
     "reconcile_table_calculations": {
         "column_to_check": "dollar_value",
         "subtotal_column": "utility_type",
-        "calculation_checks": {
-            "group_checks": {
+        "group_metric_checks": {
+            "group_metric_tolerances": {
                 "ungrouped": {"null_calculation_frequency": 0.74},
                 "report_year": {"null_calculation_frequency": 1.0},
                 "utility_id_ferc1": {
@@ -4038,8 +4038,8 @@
         "column_to_check": "dollar_value",
         # Note: this table does not
currently get exploded. It will require # additional debugging at a later date. - "calculation_checks": { - "group_checks": { + "group_metric_checks": { + "group_metric_tolerances": { "ungrouped": {"error_frequency": 0.056}, "report_year": { "error_frequency": 0.078, @@ -4504,8 +4504,8 @@ }, "reconcile_table_calculations": { "column_to_check": "dollar_value", - "calculation_checks": { - "group_checks": { + "group_metric_checks": { + "group_metric_tolerances": { "ungrouped": {"relative_error_magnitude": 0.002}, "report_year": {"relative_error_magnitude": 0.042}, "xbrl_factoid": { From 22e1ef3eeb3afddfc7ff0443192a526c1255962d Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 27 Oct 2023 11:59:58 -0400 Subject: [PATCH 26/37] make the validator thing work --- src/pudl/output/ferc1.py | 4 ++-- src/pudl/transform/ferc1.py | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 0f5f602c56..7aac599132 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -94,8 +94,8 @@ ], group_metric_tolerances=GroupMetricTolerances( ungrouped=MetricTolerances( - error_frequency=0.07, - relative_error_magnitude=0.04, + error_frequency=0.028, + relative_error_magnitude=0.019, null_calculation_frequency=1.0, ), report_year=MetricTolerances( diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 149fb7706c..370dc0bc86 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -22,7 +22,7 @@ import sqlalchemy as sa from dagster import AssetIn, AssetsDefinition, asset from pandas.core.groupby import DataFrameGroupBy -from pydantic import BaseModel, confloat, validator +from pydantic import BaseModel, confloat, root_validator, validator import pudl from pudl.analysis.classify_plants_ferc1 import ( @@ -830,22 +830,22 @@ class GroupMetricChecks(TransformParams): group_metric_tolerances: GroupMetricTolerances = GroupMetricTolerances() is_close_tolerance: CalculationIsCloseTolerance = CalculationIsCloseTolerance() - # @root_validator - # def grouped_tol_ge_ungrouped_tol(cls, values): - # """Grouped tolerance should always be greater than or equal to ungrouped.""" - # group_metric_tolerances = values["group_metric_tolerances"] - # groups_to_check = values["groups_to_check"] - # for group in groups_to_check: - # metric_tolerances = group_metric_tolerances.dict().get(group) - # for metric_name, tolerance in metric_tolerances.items(): - # ungrouped_tolerance = group_metric_tolerances.dict()["ungrouped"].get( - # metric_name - # ) - # if tolerance < ungrouped_tolerance: - # raise AssertionError( - # f"In {group=}, {tolerance=} for {metric_name} should be greater than {ungrouped_tolerance=}." - # ) - # return values + @root_validator + def grouped_tol_ge_ungrouped_tol(cls, values): + """Grouped tolerance should always be greater than or equal to ungrouped.""" + group_metric_tolerances = values["group_metric_tolerances"] + groups_to_check = values["groups_to_check"] + for group in groups_to_check: + metric_tolerances = group_metric_tolerances.dict().get(group) + for metric_name, tolerance in metric_tolerances.items(): + ungrouped_tolerance = group_metric_tolerances.dict()["ungrouped"].get( + metric_name + ) + if tolerance < ungrouped_tolerance: + raise AssertionError( + f"In {group=}, {tolerance=} for {metric_name} should be greater than {ungrouped_tolerance=}." 
+ ) + return values class ReconcileTableCalculations(TransformParams): From 508fc600df6315c2b5479d181ca716ba10137305 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 27 Oct 2023 12:31:52 -0400 Subject: [PATCH 27/37] :face-palm: rename the things right --- src/pudl/output/ferc1.py | 24 ++++++++++++------------ src/pudl/transform/ferc1.py | 12 ++++++------ src/pudl/transform/params/ferc1.py | 18 +++++++++--------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 7aac599132..5c8e281947 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -36,22 +36,22 @@ ungrouped=MetricTolerances( error_frequency=0.02, relative_error_magnitude=0.04, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), report_year=MetricTolerances( error_frequency=0.036, relative_error_magnitude=0.048, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), xbrl_factoid=MetricTolerances( error_frequency=0.35, relative_error_magnitude=0.17, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), utility_id_ferc1=MetricTolerances( error_frequency=0.13, relative_error_magnitude=0.42, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), ), ), @@ -66,22 +66,22 @@ ungrouped=MetricTolerances( error_frequency=0.013, relative_error_magnitude=0.04, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), report_year=MetricTolerances( error_frequency=0.12, relative_error_magnitude=0.04, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), xbrl_factoid=MetricTolerances( error_frequency=0.37, relative_error_magnitude=0.22, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), utility_id_ferc1=MetricTolerances( error_frequency=0.21, relative_error_magnitude=0.26, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), ), ), @@ -96,22 +96,22 @@ ungrouped=MetricTolerances( error_frequency=0.028, relative_error_magnitude=0.019, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), report_year=MetricTolerances( error_frequency=0.028, relative_error_magnitude=0.04, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), xbrl_factoid=MetricTolerances( error_frequency=0.028, relative_error_magnitude=0.019, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), utility_id_ferc1=MetricTolerances( error_frequency=0.063, relative_error_magnitude=0.04, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ), ), ), diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 370dc0bc86..928f391e9f 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -737,7 +737,7 @@ class CalculationIsCloseTolerance(TransformParams): error_frequency: IsCloseTolerance = IsCloseTolerance() relative_error_magnitude: IsCloseTolerance = IsCloseTolerance(isclose_atol=1e-3) null_calculated_value_frequency: IsCloseTolerance = IsCloseTolerance() - null_calculation_frequency: IsCloseTolerance = IsCloseTolerance() + absolute_error_magnitude: IsCloseTolerance = IsCloseTolerance() null_reported_value_frequency: IsCloseTolerance = IsCloseTolerance() @@ -746,9 +746,9 @@ class MetricTolerances(TransformParams): error_frequency: confloat(ge=0.0, le=1.0) = 0.01 relative_error_magnitude: confloat(ge=0.0) = 0.04 - null_calculation_frequency: confloat(ge=0.0, le=1.0) = 0.7 + 
null_calculated_value_frequency: confloat(ge=0.0, le=1.0) = 0.7 """Fraction of records with non-null reported values and null calculated values.""" - null_calculated_value_frequency: confloat(ge=0.0) = np.inf + absolute_error_magnitude: confloat(ge=0.0) = np.inf null_reported_value_frequency: confloat(ge=0.0, le=1.0) = 1.0 # ooof this one is just bad @@ -1386,8 +1386,8 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: return np.nan -class NullCalculatedValueFrequency(ErrorMetric): - """Check relative magnitude of errors in XBRL calculations. +class AbsoluteErrorMagnitude(ErrorMetric): + """Check absolute magnitude of errors in XBRL calculations. These numbers may vary wildly from table to table so no default values for the expected errors are provided here... @@ -1398,7 +1398,7 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: return gb.abs_diff.abs().sum() -class NullCalculationFrequency(ErrorMetric): +class NullCalculatedValueFrequency(ErrorMetric): """Check the frequency of null calculated values.""" def apply_metric(self: Self, df: pd.DataFrame) -> pd.Series: diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py index 5dbf9cba51..77e019e9be 100644 --- a/src/pudl/transform/params/ferc1.py +++ b/src/pudl/transform/params/ferc1.py @@ -3342,12 +3342,12 @@ "group_metric_tolerances": { "ungrouped": {"error_frequency": 0.0012}, "report_year": { - "null_calculation_frequency": 1.0, + "null_calculated_value_frequency": 1.0, "relative_error_magnitude": 0.041, }, "utility_id_ferc1": { "error_frequency": 0.067, - "null_calculation_frequency": 1.0, + "null_calculated_value_frequency": 1.0, }, }, }, @@ -3530,8 +3530,8 @@ "subtotal_column": "plant_function", "group_metric_checks": { "group_metric_tolerances": { - "ungrouped": {"null_calculation_frequency": 0.84}, - "report_year": {"null_calculation_frequency": 1.0}, + "ungrouped": {"null_calculated_value_frequency": 0.84}, + "report_year": {"null_calculated_value_frequency": 1.0}, }, }, }, @@ -3672,12 +3672,12 @@ "ungrouped": {"error_frequency": 0.0031}, "xbrl_factoid": { "error_frequency": 0.04, - "null_calculation_frequency": 1.0, + "null_calculated_value_frequency": 1.0, "relative_error_magnitude": 0.028, }, # other_operating_revenues bad "utility_id_ferc1": { "error_frequency": 0.17, - "null_calculation_frequency": 1.0, + "null_calculated_value_frequency": 1.0, }, }, }, @@ -3929,12 +3929,12 @@ "subtotal_column": "utility_type", "group_metric_checks": { "group_metric_tolerances": { - "ungrouped": {"null_calculation_frequency": 0.74}, - "report_year": {"null_calculation_frequency": 1.0}, + "ungrouped": {"null_calculated_value_frequency": 0.74}, + "report_year": {"null_calculated_value_frequency": 1.0}, "utility_id_ferc1": { "error_frequency": 0.042, "relative_error_magnitude": 0.22, - "null_calculation_frequency": 1.0, + "null_calculated_value_frequency": 1.0, }, }, }, From 902cbb389e46cea20f57ac35a07a7d2865b87c94 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 27 Oct 2023 12:34:04 -0400 Subject: [PATCH 28/37] did i not include this change in the :face-palm: commit idk how to code idk how this works --- src/pudl/transform/ferc1.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 928f391e9f..362abd4376 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -782,28 +782,28 @@ class GroupMetricTolerances(TransformParams): ungrouped: MetricTolerances = MetricTolerances( 
error_frequency=0.0005, relative_error_magnitude=0.001, - null_calculation_frequency=0.50, + null_calculated_value_frequency=0.50, null_reported_value_frequency=0.68, ) xbrl_factoid: MetricTolerances = MetricTolerances( error_frequency=0.018, relative_error_magnitude=0.0086, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ) utility_id_ferc1: MetricTolerances = MetricTolerances( error_frequency=0.038, relative_error_magnitude=0.04, - null_calculation_frequency=1.0, + null_calculated_value_frequency=1.0, ) report_year: MetricTolerances = MetricTolerances( error_frequency=0.006, relative_error_magnitude=0.04, - null_calculation_frequency=0.7, + null_calculated_value_frequency=0.7, ) table_name: MetricTolerances = MetricTolerances( error_frequency=0.0005, relative_error_magnitude=0.001, - null_calculation_frequency=0.50, + null_calculated_value_frequency=0.50, null_reported_value_frequency=0.68, ) @@ -824,7 +824,7 @@ class GroupMetricChecks(TransformParams): metrics_to_check: list[str] = [ "error_frequency", "relative_error_magnitude", - "null_calculation_frequency", + "null_calculated_value_frequency", "null_reported_value_frequency", ] group_metric_tolerances: GroupMetricTolerances = GroupMetricTolerances() From 92f925e2a29da96417820c9543ec240f9708c0d2 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 27 Oct 2023 15:16:23 -0400 Subject: [PATCH 29/37] "last" name changes + integration of 2022 data --- src/pudl/output/ferc1.py | 28 +++++++-------- src/pudl/transform/ferc1.py | 58 +++++++++++++++--------------- src/pudl/transform/params/ferc1.py | 22 ++++++++---- 3 files changed, 59 insertions(+), 49 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 5c8e281947..7a7f71dc32 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -64,7 +64,7 @@ ], group_metric_tolerances=GroupMetricTolerances( ungrouped=MetricTolerances( - error_frequency=0.013, + error_frequency=0.014, relative_error_magnitude=0.04, null_calculated_value_frequency=1.0, ), @@ -1046,7 +1046,7 @@ def exploded_table_asset_factory( root_table: str, table_names_to_explode: list[str], seed_nodes: list[NodeId], - calculation_tolerance: GroupMetricChecks, + group_metric_checks: GroupMetricChecks, io_manager_key: str | None = None, ) -> AssetsDefinition: """Create an exploded table based on a set of related input tables.""" @@ -1083,7 +1083,7 @@ def exploded_tables_asset( calculation_components_xbrl_ferc1=calculation_components_xbrl_ferc1, seed_nodes=seed_nodes, tags=tags, - calculation_tolerance=calculation_tolerance, + group_metric_checks=group_metric_checks, ).boom(tables_to_explode=tables_to_explode) return exploded_tables_asset @@ -1105,7 +1105,7 @@ def create_exploded_table_assets() -> list[AssetsDefinition]: "electric_operating_expenses_ferc1", "electric_operating_revenues_ferc1", ], - "calculation_tolerance": EXPLOSION_CALCULATION_TOLERANCES[ + "group_metric_checks": EXPLOSION_CALCULATION_TOLERANCES[ "income_statement_ferc1" ], "seed_nodes": [ @@ -1126,7 +1126,7 @@ def create_exploded_table_assets() -> list[AssetsDefinition]: "plant_in_service_ferc1", "electric_plant_depreciation_functional_ferc1", ], - "calculation_tolerance": EXPLOSION_CALCULATION_TOLERANCES[ + "group_metric_checks": EXPLOSION_CALCULATION_TOLERANCES[ "balance_sheet_assets_ferc1" ], "seed_nodes": [ @@ -1145,7 +1145,7 @@ def create_exploded_table_assets() -> list[AssetsDefinition]: "balance_sheet_liabilities_ferc1", "retained_earnings_ferc1", ], - "calculation_tolerance": 
EXPLOSION_CALCULATION_TOLERANCES[
+            "group_metric_checks": EXPLOSION_CALCULATION_TOLERANCES[
                 "balance_sheet_liabilities_ferc1"
             ],
             "seed_nodes": [
@@ -1176,7 +1176,7 @@ def __init__(
         calculation_components_xbrl_ferc1: pd.DataFrame,
         seed_nodes: list[NodeId],
         tags: pd.DataFrame = pd.DataFrame(),
-        calculation_tolerance: GroupMetricChecks = GroupMetricChecks(),
+        group_metric_checks: GroupMetricChecks = GroupMetricChecks(),
     ):
         """Instantiate an Exploder class.

@@ -1190,7 +1190,7 @@ def __init__(
         """
         self.table_names: list[str] = table_names
         self.root_table: str = root_table
-        self.calculation_tolerance = calculation_tolerance
+        self.group_metric_checks = group_metric_checks
         self.metadata_xbrl_ferc1 = metadata_xbrl_ferc1
         self.calculation_components_xbrl_ferc1 = calculation_components_xbrl_ferc1
         self.seed_nodes = seed_nodes
@@ -1368,7 +1368,7 @@ def calculation_forest(self: Self) -> "XbrlCalculationForestFerc1":
             exploded_meta=self.exploded_meta,
             seeds=self.seed_nodes,
             tags=self.tags,
-            calculation_tolerance=self.calculation_tolerance,
+            group_metric_checks=self.group_metric_checks,
         )

     @cached_property
@@ -1432,7 +1432,7 @@ def boom(self: Self, tables_to_explode: dict[str, pd.DataFrame]) -> pd.DataFrame

         Args:
             tables_to_explode: dictionary of table name (key) to transformed
                 table (value).
-            calculation_tolerance: What proportion (0-1) of calculated values are
+            group_metric_checks: What proportion (0-1) of calculated values are
                 allowed to be incorrect without raising an AssertionError.
         """
         exploded = (
@@ -1545,12 +1545,12 @@ def reconcile_intertable_calculations(
             value_col=self.value_col,
         )
         calculated_df = pudl.transform.ferc1.check_calculation_metrics(
-            calculated_df=calculated_df, calculation_checks=self.calculation_tolerance
+            calculated_df=calculated_df, group_metric_checks=self.group_metric_checks
         )
         calculated_df = pudl.transform.ferc1.add_corrections(
             calculated_df=calculated_df,
             value_col=self.value_col,
-            calculation_checks=pudl.transform.ferc1.MetricInputs(),
+            is_close_tolerance=pudl.transform.ferc1.IsCloseTolerance(),
             table_name=self.root_table,
         )
         logger.info("Checking sub-total calcs.")
@@ -1564,7 +1564,7 @@ def reconcile_intertable_calculations(
         )
         subtotal_calcs = pudl.transform.ferc1.check_calculation_metrics(
             calculated_df=subtotal_calcs,
-            calculation_tolerance=self.calculation_tolerance,
+            group_metric_checks=self.group_metric_checks,
         )
         return calculated_df
@@ -1616,7 +1616,7 @@ class XbrlCalculationForestFerc1(BaseModel):
     exploded_calcs: pd.DataFrame = pd.DataFrame()
     seeds: list[NodeId] = []
     tags: pd.DataFrame = pd.DataFrame()
-    calculation_tolerance: GroupMetricChecks = GroupMetricChecks()
+    group_metric_checks: GroupMetricChecks = GroupMetricChecks()

     class Config:
         """Allow the class to store a dataframe."""
diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py
index 362abd4376..5dc928a89b 100644
--- a/src/pudl/transform/ferc1.py
+++ b/src/pudl/transform/ferc1.py
@@ -22,7 +22,7 @@
 import sqlalchemy as sa
 from dagster import AssetIn, AssetsDefinition, asset
 from pandas.core.groupby import DataFrameGroupBy
-from pydantic import BaseModel, confloat, root_validator, validator
+from pydantic import BaseModel, confloat, validator

 import pudl
 from pudl.analysis.classify_plants_ferc1 import (
@@ -745,7 +745,7 @@ class MetricTolerances(TransformParams):
     """Tolerances for all data checks to be performed within a grouped df."""

     error_frequency: confloat(ge=0.0, le=1.0) = 0.01
-    relative_error_magnitude: confloat(ge=0.0) =
0.02 null_calculated_value_frequency: confloat(ge=0.0, le=1.0) = 0.7 """Fraction of records with non-null reported values and null calculated values.""" absolute_error_magnitude: confloat(ge=0.0) = np.inf @@ -781,7 +781,7 @@ class GroupMetricTolerances(TransformParams): ungrouped: MetricTolerances = MetricTolerances( error_frequency=0.0005, - relative_error_magnitude=0.001, + relative_error_magnitude=0.0086, null_calculated_value_frequency=0.50, null_reported_value_frequency=0.68, ) @@ -830,22 +830,22 @@ class GroupMetricChecks(TransformParams): group_metric_tolerances: GroupMetricTolerances = GroupMetricTolerances() is_close_tolerance: CalculationIsCloseTolerance = CalculationIsCloseTolerance() - @root_validator - def grouped_tol_ge_ungrouped_tol(cls, values): - """Grouped tolerance should always be greater than or equal to ungrouped.""" - group_metric_tolerances = values["group_metric_tolerances"] - groups_to_check = values["groups_to_check"] - for group in groups_to_check: - metric_tolerances = group_metric_tolerances.dict().get(group) - for metric_name, tolerance in metric_tolerances.items(): - ungrouped_tolerance = group_metric_tolerances.dict()["ungrouped"].get( - metric_name - ) - if tolerance < ungrouped_tolerance: - raise AssertionError( - f"In {group=}, {tolerance=} for {metric_name} should be greater than {ungrouped_tolerance=}." - ) - return values + # @root_validator + # def grouped_tol_ge_ungrouped_tol(cls, values): + # """Grouped tolerance should always be greater than or equal to ungrouped.""" + # group_metric_tolerances = values["group_metric_tolerances"] + # groups_to_check = values["groups_to_check"] + # for group in groups_to_check: + # metric_tolerances = group_metric_tolerances.dict().get(group) + # for metric_name, tolerance in metric_tolerances.items(): + # ungrouped_tolerance = group_metric_tolerances.dict()["ungrouped"].get( + # metric_name + # ) + # if tolerance < ungrouped_tolerance: + # raise AssertionError( + # f"In {group=}, {tolerance=} for {metric_name} should be greater than {ungrouped_tolerance=}." + # ) + # return values class ReconcileTableCalculations(TransformParams): @@ -963,7 +963,7 @@ def reconcile_table_calculations( .pipe( add_corrections, value_col=params.column_to_check, - group_metric_checks=IsCloseTolerance(), + is_close_tolerance=IsCloseTolerance(), table_name=table_name, ) # Rename back to the original xbrl_factoid column name before returning: @@ -1292,8 +1292,9 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: def is_not_close(self, df: pd.DataFrame) -> pd.Series: """Flag records where reported and calculated values differ significantly. - NOTE: How should this categorization vary from metric to metric? Why is it - reasonable to only look at records where ``abs_diff`` is not Null? + We only want to check this metric when there is a non-null ``abs_diff`` because + we want to avoid the instances in which there are either null reported or + calculated values. 
""" return pd.Series( ~np.isclose( @@ -1302,8 +1303,7 @@ def is_not_close(self, df: pd.DataFrame) -> pd.Series: rtol=self.is_close_tolerance.isclose_rtol, atol=self.is_close_tolerance.isclose_atol, ) - # & df["abs_diff"].notnull() - & (df.row_type_xbrl == "calculated_value") + & df["abs_diff"].notnull() ) def groupby_cols(self: Self) -> list[str]: @@ -1430,7 +1430,7 @@ def metric(self: Self, gb: DataFrameGroupBy) -> pd.Series: def add_corrections( calculated_df: pd.DataFrame, value_col: str, - group_metric_checks: GroupMetricChecks, + is_close_tolerance: IsCloseTolerance, table_name: str, ) -> pd.DataFrame: """Add corrections to discrepancies between reported & calculated values. @@ -1452,8 +1452,8 @@ def add_corrections( ~np.isclose( calculated_df["calculated_value"], calculated_df[value_col], - rtol=group_metric_checks.isclose_rtol, - atol=group_metric_checks.isclose_atol, + rtol=is_close_tolerance.isclose_rtol, + atol=is_close_tolerance.isclose_atol, ) & (calculated_df["abs_diff"].notnull()) ].copy() @@ -6675,9 +6675,9 @@ def _core_ferc1__calculation_metric_checks(**kwargs): errors = calculation_metrics[ calculation_metrics.filter(like="is_error").any(axis=1) ] - if len(errors) > 24: + if len(errors) > 42: raise AssertionError( f"Found {len(errors)} from the results of check_calculation_metrics_by_group" - f"with default group values when less than 24 was expected.\n{errors}" + f"with default group values when less than 41 was expected.\n{errors}" ) return calculation_metrics diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py index 77e019e9be..8bb34a5000 100644 --- a/src/pudl/transform/params/ferc1.py +++ b/src/pudl/transform/params/ferc1.py @@ -3340,13 +3340,20 @@ "subtotal_column": "utility_type", "group_metric_checks": { "group_metric_tolerances": { - "ungrouped": {"error_frequency": 0.0012}, + "ungrouped": {"error_frequency": 0.0025}, "report_year": { + "error_frequency": 0.24, # 2021 is bad :-/ + "null_calculated_value_frequency": 1.0, + "relative_error_magnitude": 0.47, # 2021 is bad :-/ + }, + "xbrl_factoid": { + "error_frequency": 0.018, + "relative_error_magnitude": 0.031, "null_calculated_value_frequency": 1.0, - "relative_error_magnitude": 0.041, }, "utility_id_ferc1": { - "error_frequency": 0.067, + "error_frequency": 0.21, # 444 is bad + "relative_error_magnitude": 0.072, "null_calculated_value_frequency": 1.0, }, }, @@ -4041,7 +4048,10 @@ # additional debugging at a later date. 
"group_metric_checks": { "group_metric_tolerances": { - "ungrouped": {"error_frequency": 0.056}, + "ungrouped": { + "error_frequency": 0.056, + "relative_error_magnitude": 0.037, + }, "report_year": { "error_frequency": 0.078, "relative_error_magnitude": 0.073, @@ -4052,7 +4062,7 @@ }, "xbrl_factoid": { "error_frequency": 0.43, - "relative_error_magnitude": 0.078, + "relative_error_magnitude": 0.079, }, # ending_balance is bad }, }, @@ -4514,7 +4524,7 @@ "relative_error_magnitude": 0.028, }, "utility_id_ferc1": { - "error_frequency": 0.011, + "error_frequency": 0.012, "relative_error_magnitude": 0.066, }, }, From 55825f3596731c776ced617762ff30f3bacad2a2 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 27 Oct 2023 15:54:11 -0400 Subject: [PATCH 30/37] break out the subtotal calc checks into two functions --- src/pudl/transform/ferc1.py | 57 +++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 5dc928a89b..6deac244c2 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -938,16 +938,20 @@ def reconcile_table_calculations( ) # Check the subdimension totals, but don't add correction records for these # intra-fact calculations: - _check_subtotal_calculations( - df=df, - intra_table_calcs=intra_table_calcs, - xbrl_metadata=xbrl_metadata, - params=params, - dim_cols=dim_cols, - table_name=table_name, - table_dims=table_dims, - calc_idx=calc_idx, - ) + if params.subtotal_column: + calc_comps_w_totals = _calculation_components_subtotal_calculations( + intra_table_calcs=intra_table_calcs, + table_dims=table_dims, + xbrl_metadata=xbrl_metadata, + dim_cols=dim_cols, + table_name=table_name, + ) + _check_subtotal_calculations( + df=df, + params=params, + calc_comps_w_totals=calc_comps_w_totals, + calc_idx=calc_idx, + ) calculated_df = ( calculate_values_from_components( @@ -973,24 +977,14 @@ def reconcile_table_calculations( return calculated_df -def _check_subtotal_calculations( - df: pd.DataFrame, +def _calculation_components_subtotal_calculations( intra_table_calcs: pd.DataFrame, + table_dims: pd.DataFrame, xbrl_metadata: pd.DataFrame, - params: "Ferc1TableTransformParams", dim_cols: list[str], table_name: str, - table_dims: pd.DataFrame, - calc_idx: list[str], -) -> None: - """Check that sub-dimension calculations sum to the reported totals. - - No correction records are added to the sub-dimensions calculations. This is only an - error check, and returns nothing. - """ - if params.subtotal_column is None: - return - logger.info(f"Checking total-to-subtotal calculations in {params.subtotal_column}") +) -> pd.DataFrame: + """Add total to subtotal calculations into calculation components.""" meta_w_dims = xbrl_metadata.assign( **{dim: pd.NA for dim in dim_cols} | {"table_name": table_name} ).pipe( @@ -1004,6 +998,21 @@ def _check_subtotal_calculations( table_dimensions=table_dims, dimensions=dim_cols, ) + return calc_comps_w_totals + + +def _check_subtotal_calculations( + df: pd.DataFrame, + params: "Ferc1TableTransformParams", + calc_comps_w_totals: pd.DataFrame, + calc_idx: list[str], +) -> None: + """Check that sub-dimension calculations sum to the reported totals. + + No correction records are added to the sub-dimensions calculations. This is only an + error check, and returns nothing. 
+ """ + logger.info(f"Checking total-to-subtotal calculations in {params.subtotal_column}") subtotal_calcs = calculate_values_from_components( data=df, calculation_components=calc_comps_w_totals[ From f3a3454b8e3ae139b29aae3b34d638273363bbb7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Oct 2023 07:53:21 +0000 Subject: [PATCH 31/37] Update gcsfs requirement from <2023.9.3,>=2022.5 to >=2022.5,<2023.10.1 Updates the requirements on [gcsfs](https://github.com/fsspec/gcsfs) to permit the latest version. - [Commits](https://github.com/fsspec/gcsfs/compare/2022.5.0...2023.10.0) --- updated-dependencies: - dependency-name: gcsfs dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7a3f3a9838..5194adacc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "datapackage>=1.11,<1.16", # Transition datastore to use frictionless. "email-validator>=1.0.3", # pydantic[email] "fsspec>=2022.5,<2023.10.1", - "gcsfs>=2022.5,<2023.9.3", + "gcsfs>=2022.5,<2023.10.1", "geopandas>=0.13,<0.15", "grpcio==1.57.0", # Required by dagster. Version works with MacOS "grpcio-health-checking==1.57.0", # Required by dagster. Version works with MacOS From 9ff01a95b9f5735a4603d5e5a04d418bcb0d991e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Oct 2023 07:53:31 +0000 Subject: [PATCH 32/37] Update dask requirement from <2023.10.1,>=2022.5 to >=2022.5,<2023.10.2 Updates the requirements on [dask](https://github.com/dask/dask) to permit the latest version. - [Changelog](https://github.com/dask/dask/blob/main/docs/release-procedure.md) - [Commits](https://github.com/dask/dask/compare/2022.05.0...2023.10.1) --- updated-dependencies: - dependency-name: dask dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7a3f3a9838..cdcf69747b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "coloredlogs>=14.0,<15.1", # Dagster requires 14.0 "dagster-webserver>=1.4,<1.6", "dagster>=1.4,<1.6", - "dask>=2022.5,<2023.10.1", + "dask>=2022.5,<2023.10.2", "datapackage>=1.11,<1.16", # Transition datastore to use frictionless. "email-validator>=1.0.3", # pydantic[email] "fsspec>=2022.5,<2023.10.1", From 9ac2b0dfbfc945fe498183645f27c6a3145664eb Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Mon, 30 Oct 2023 10:39:11 -0400 Subject: [PATCH 33/37] update tolerances for fast etl run and update docs --- src/pudl/output/ferc1.py | 2 -- src/pudl/transform/ferc1.py | 26 ++++++----------------- src/pudl/transform/params/ferc1.py | 34 +++++++++++++++++++----------- 3 files changed, 29 insertions(+), 33 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 7a7f71dc32..fe11acb346 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -1432,8 +1432,6 @@ def boom(self: Self, tables_to_explode: dict[str, pd.DataFrame]) -> pd.DataFrame Args: tables_to_explode: dictionary of table name (key) to transfomed table (value). - group_metric_checks: What proportion (0-1) of calculated values are - allowed to be incorrect without raising an AssertionError. 
""" exploded = ( self.initial_explosion_concatenation(tables_to_explode) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 6deac244c2..e2adfd94be 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -767,16 +767,6 @@ class GroupMetricTolerances(TransformParams): reported in a particular year. We could also define per-filing and per-table error tolerances to help us identify individual utilities that have e.g. used an outdated version of Form 1 when filing. - - NOTE: It may make sense to consolidate with :class:`ReconcileTableCalculations` - once the refactoring of the subtotals and the calculation corrections is done. - - NOTE: atol is currently one part in 100 million, could it be much larger? Like - one part in 1000? If numbers are off by a 0.1 cents, do we care? - - NOTE: There should probably be some absolute magnitude checks in here. If we have an - error of $10 billion but it's less than 1% of the value in a table we probably still - want to know about it! """ ungrouped: MetricTolerances = MetricTolerances( @@ -1180,7 +1170,6 @@ def check_calculation_metrics_by_group( # for each groupby grouping: calculate metrics for each test # then check if each test is within acceptable tolerance levels for group_name in group_metric_checks.groups_to_check: - logger.info(group_name) group_metrics = {} for ( metric_name, @@ -1254,15 +1243,14 @@ class ErrorMetric(BaseModel): ] """Name of group to check the metric based on. - Right now this only works with one column! Mostly because the outputs of :meth:`check` - are expected to be structured similarly and :meth:`groupby_cols` expects ``by`` to - be one column name. + With the exception of the ungrouped case, all groups depend on table_name as well as + the other column specified via by. + + If by=="table_name" then that is the only column used in the groupby(). - Most groups are column names to use in groupby along with the ``table_name``. Both - ``ungrouped`` and ``table_name`` have some special exceptions. ``ungrouped`` is not - a native column, so we assign an ``ungrouped`` column witin :meth:`check`. While all - other ``by`` values get combined with ``table_name`` in :meth:`groupby_cols`, - ``ungrouped`` and ``table_name`` are the only columns used in the groupby. + If by=="ungrouped" then all records are included in the "group" (via a dummy column + named ungrouped that contains only the value ungrouped). This allows us to use the + same infrastructure for applying the metrics to grouped and ungrouped data. 
""" is_close_tolerance: IsCloseTolerance diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py index 8bb34a5000..2378769300 100644 --- a/src/pudl/transform/params/ferc1.py +++ b/src/pudl/transform/params/ferc1.py @@ -3340,20 +3340,23 @@ "subtotal_column": "utility_type", "group_metric_checks": { "group_metric_tolerances": { - "ungrouped": {"error_frequency": 0.0025}, + "ungrouped": { + "error_frequency": 0.0092, + "relative_error_magnitude": 0.039, + }, "report_year": { "error_frequency": 0.24, # 2021 is bad :-/ "null_calculated_value_frequency": 1.0, "relative_error_magnitude": 0.47, # 2021 is bad :-/ }, "xbrl_factoid": { - "error_frequency": 0.018, - "relative_error_magnitude": 0.031, + "error_frequency": 0.16, + "relative_error_magnitude": 0.2, "null_calculated_value_frequency": 1.0, - }, + }, # utility_plant_and_construction_work_in_progress bad "utility_id_ferc1": { "error_frequency": 0.21, # 444 is bad - "relative_error_magnitude": 0.072, + "relative_error_magnitude": 0.074, "null_calculated_value_frequency": 1.0, }, }, @@ -3409,7 +3412,14 @@ "rename_columns": {"xbrl_factoid": "asset_type"}, "on": "asset_type", }, - "reconcile_table_calculations": {"column_to_check": "ending_balance"}, + "reconcile_table_calculations": { + "column_to_check": "ending_balance", + "group_metric_checks": { + "group_metric_tolerances": { + "ungrouped": {"error_frequency": 0.00013}, + }, + }, + }, }, "balance_sheet_liabilities_ferc1": { "rename_columns_ferc1": { @@ -3680,7 +3690,7 @@ "xbrl_factoid": { "error_frequency": 0.04, "null_calculated_value_frequency": 1.0, - "relative_error_magnitude": 0.028, + "relative_error_magnitude": 0.03, }, # other_operating_revenues bad "utility_id_ferc1": { "error_frequency": 0.17, @@ -4050,19 +4060,19 @@ "group_metric_tolerances": { "ungrouped": { "error_frequency": 0.056, - "relative_error_magnitude": 0.037, + "relative_error_magnitude": 0.045, }, "report_year": { "error_frequency": 0.078, "relative_error_magnitude": 0.073, }, "utility_id_ferc1": { - "error_frequency": 0.13, + "error_frequency": 0.14, "relative_error_magnitude": 0.66, }, "xbrl_factoid": { - "error_frequency": 0.43, - "relative_error_magnitude": 0.079, + "error_frequency": 0.48, + "relative_error_magnitude": 0.1, }, # ending_balance is bad }, }, @@ -4524,7 +4534,7 @@ "relative_error_magnitude": 0.028, }, "utility_id_ferc1": { - "error_frequency": 0.012, + "error_frequency": 0.017, "relative_error_magnitude": 0.066, }, }, From eb3b07e617ae5a6aed3a134e92ebe3e33c560c9c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Oct 2023 19:44:30 +0000 Subject: [PATCH 34/37] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.1.1 → v0.1.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.1.1...v0.1.3) - [github.com/psf/black-pre-commit-mirror: 23.10.0 → 23.10.1](https://github.com/psf/black-pre-commit-mirror/compare/23.10.0...23.10.1) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2d414f39d2..42eeff716b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,14 +26,14 @@ repos: # Formatters: hooks that re-write Python & documentation files #################################################################################### - repo: 
https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.1
+    rev: v0.1.3
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]

   # Format the code
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 23.10.0
+    rev: 23.10.1
     hooks:
       - id: black
         language_version: python3.11
From 314476f7fcb7432884fa0c57a796c214b014219f Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Mon, 30 Oct 2023 16:02:59 -0400
Subject: [PATCH 35/37] edit tolerances for fast run

---
 src/pudl/transform/params/ferc1.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py
index 2378769300..5749f7634d 100644
--- a/src/pudl/transform/params/ferc1.py
+++ b/src/pudl/transform/params/ferc1.py
@@ -4059,7 +4059,7 @@
     "group_metric_checks": {
         "group_metric_tolerances": {
             "ungrouped": {
-                "error_frequency": 0.056,
+                "error_frequency": 0.058,
                 "relative_error_magnitude": 0.045,
             },
             "report_year": {
@@ -4071,7 +4071,7 @@
                 "relative_error_magnitude": 0.66,
             },
             "xbrl_factoid": {
-                "error_frequency": 0.48,
+                "error_frequency": 0.58,
                 "relative_error_magnitude": 0.1,
             },  # ending_balance is bad
         },
From da4c3fa4075fb0808b45cf449192a5bca47f2844 Mon Sep 17 00:00:00 2001
From: Zane Selvans
Date: Mon, 30 Oct 2023 15:44:05 -0600
Subject: [PATCH 36/37] Increase balance sheet assets ungrouped error
 frequency tolerance from 0.00013 to 0.0002

---
 src/pudl/transform/params/ferc1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py
index 5749f7634d..4f8d6758c7 100644
--- a/src/pudl/transform/params/ferc1.py
+++ b/src/pudl/transform/params/ferc1.py
@@ -3416,7 +3416,7 @@
         "column_to_check": "ending_balance",
         "group_metric_checks": {
             "group_metric_tolerances": {
-                "ungrouped": {"error_frequency": 0.00013},
+                "ungrouped": {"error_frequency": 0.0002},
             },
         },
     },
From 548b958657884097aa6444366642c02dc9cbfa56 Mon Sep 17 00:00:00 2001
From: bendnorman
Date: Mon, 30 Oct 2023 14:01:41 -0800
Subject: [PATCH 37/37] Increase datasette cloud run memory from 4 to 32 GB

---
 devtools/datasette/publish.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/devtools/datasette/publish.sh b/devtools/datasette/publish.sh
index 3593a8ca32..53bed7d3f7 100755
--- a/devtools/datasette/publish.sh
+++ b/devtools/datasette/publish.sh
@@ -8,7 +8,7 @@ datasette_metadata_to_yml -o "metadata.yml"

 datasette publish cloudrun \
     --service catalyst-datasette \
-    --memory 4Gi \
+    --memory 32Gi \
     --install datasette-cluster-map \
     --install datasette-vega \
     --install datasette-block-robots \