From bad03ebdac060300da8e3acb8782d92bfd2ff413 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Mon, 4 Nov 2024 13:06:03 +0100
Subject: [PATCH] MNT simplify internal bin_feature a bit (#183)

---
 src/model_diagnostics/_utils/binning.py      | 285 +++++++++---------
 .../calibration/identification.py            |  67 ++--
 .../calibration/tests/test_identification.py |   4 +-
 3 files changed, 176 insertions(+), 180 deletions(-)

diff --git a/src/model_diagnostics/_utils/binning.py b/src/model_diagnostics/_utils/binning.py
index 1f5b5c7..3250103 100644
--- a/src/model_diagnostics/_utils/binning.py
+++ b/src/model_diagnostics/_utils/binning.py
@@ -9,14 +9,14 @@
 from model_diagnostics._utils.array import (
     array_name,
     is_pandas_series,
-    validate_same_first_dimension,
+    length_of_first_dimension,
 )


 def bin_feature(
     feature: Optional[Union[npt.ArrayLike, pl.Series]],
     feature_name: Optional[Union[int, str]],
-    y_obs: npt.ArrayLike,
+    n_obs: int,
     n_bins: int = 10,
     bin_method: str = "quantile",
 ):
@@ -26,13 +26,12 @@ def bin_feature(

     Parameters
     ----------
-    feature : array-like of shape (n_obs) or None
+    feature : array-like of shape (n_obs)
         Some feature column.
     feature_name : int, str or None
         Name of the feature.
-    y_obs : array-like of shape (n_obs)
-        Observed values of the response variable.
-        Only needed for validation of dimensions of feature.
+    n_obs : int
+        The expected length of the first dimension of feature.
     n_bins : int
         The number of bins for numerical features and the maximal number of (most
         frequent) categories shown for categorical features. Due to ties, the effective
@@ -48,12 +47,6 @@ def bin_feature(
     -------
     feature : pl.Series or None
         The polars.Series version of the feature.
-    feature_name : str
-        The name of the feature.
-    is_categorical : bool
-        True if feature is categorical or enum.
-    is_string : bool
-        True if feature is a string type.
     n_bins : int
         Effective number of bins.
     f_binned : pl.DataFrame or None
@@ -74,147 +67,147 @@ def bin_feature(
         )
         raise ValueError(msg)

-    if feature is None:
-        # TODO: Remove this branch.
-        feature_name = None
-    else:
-        if isinstance(feature_name, int):
-            default = f"feature {feature_name}"
-        else:
-            default = "feature"
-        feature_name = array_name(feature, default=default)
-        # The following statement, i.e. possibly the creation of a pl.Categorical,
-        # MUST be under the StringCache context manager!
-        try:
-            feature = pl.Series(name=feature_name, values=feature)
-        except ImportError:
-            # FIXME: pyarrow not installed
-            # For non numpy-backed columns, pyarrow is needed. Here we handle the case
-            # where pyarrow is not installed and such a pandas extention array is
-            # passed, e.g. with CategoricalDtype.
-            if is_pandas_series(feature):
-                pandas = sys.modules["pandas"]
-                is_pandas_categorical = isinstance(
-                    feature.dtype,  # type: ignore
-                    pandas.CategoricalDtype,
-                )
-                feature = pl.from_dataframe(feature.to_frame(name="0"))[:, 0]  # type: ignore
-                if is_pandas_categorical and isinstance(feature.dtype, pl.Enum):
-                    # Pandas categoricals usually get mapped to polars categoricals.
-                    # But this code path gives pl.Enum.
-                    feature = feature.cast(pl.Categorical)
-            else:
-                raise  # re-raises the ImportError
-        validate_same_first_dimension(y_obs, feature)
-        if feature.dtype in [pl.Categorical, pl.Enum]:
-            is_categorical = True
-        elif feature.dtype in [pl.Utf8, pl.Object]:
-            # We could convert strings to categoricals.
-            is_string = True
-        elif feature.dtype.is_float():
-            # We treat NaN as Null values, numpy will see a Null as a NaN.
-            feature = feature.fill_nan(None)
+    default = f"feature {feature_name}" if isinstance(feature_name, int) else "feature"
+    feature_name = array_name(feature, default=default)
+    # The following statement, i.e. possibly the creation of a pl.Categorical,
+    # MUST be under the StringCache context manager!
+    try:
+        feature = pl.Series(name=feature_name, values=feature)
+    except ImportError:
+        # FIXME: pyarrow not installed
+        # For non numpy-backed columns, pyarrow is needed. Here we handle the case
+        # where pyarrow is not installed and such a pandas extension array is
+        # passed, e.g. with CategoricalDtype.
+        if is_pandas_series(feature):
+            pandas = sys.modules["pandas"]
+            is_pandas_categorical = isinstance(
+                feature.dtype,  # type: ignore
+                pandas.CategoricalDtype,
+            )
+            feature = pl.from_dataframe(
+                feature.to_frame(name=feature_name)  # type: ignore
+            )[:, 0]
+            if is_pandas_categorical and isinstance(feature.dtype, pl.Enum):
+                # Pandas categoricals usually get mapped to polars categoricals.
+                # But this code path gives pl.Enum.
+                feature = feature.cast(pl.Categorical)
         else:
-            # integers
-            pass
+            raise  # re-raises the ImportError
+    if length_of_first_dimension(feature) != n_obs:
+        msg = (
+            f"The feature array {feature_name} does not have length {n_obs} of its"
+            " first dimension."
+        )
+        raise ValueError(msg)
+    if feature.dtype in [pl.Categorical, pl.Enum]:
+        is_categorical = True
+    elif feature.dtype in [pl.Utf8, pl.Object]:
+        # We could convert strings to categoricals.
+        is_string = True
+    elif feature.dtype.is_float():
+        # We treat NaN as Null values, numpy will see a Null as a NaN.
+        feature = feature.fill_nan(None)
+    else:
+        # integers
+        pass

-        if is_categorical or is_string:
-            # For categorical and string features, knowing the frequency table in
-            # advance makes life easier in order to make results consistent.
-            # Consider
-            # feature count
-            #   "a"     3
-            #   "b"     2
-            #   "c"     2
-            #   "d"     1
-            # with n_bins = 2. As we want the effective number of bins to be at
-            # most n_bins, we want, in the above case, only "a" in the final
-            # result. Therefore, we need to internally decrease n_bins to 1.
-            if feature.null_count() == 0:
-                value_count = feature.value_counts(sort=True)
-                n_bins_ef = n_bins
-            else:
-                value_count = feature.drop_nulls().value_counts(sort=True)
-                n_bins_ef = n_bins - 1
+    if is_categorical or is_string:
+        # For categorical and string features, knowing the frequency table in
+        # advance makes life easier in order to make results consistent.
+        # Consider
+        # feature count
+        #   "a"     3
+        #   "b"     2
+        #   "c"     2
+        #   "d"     1
+        # with n_bins = 2. As we want the effective number of bins to be at
+        # most n_bins, we want, in the above case, only "a" in the final
+        # result. Therefore, we need to internally decrease n_bins to 1.
+ if feature.null_count() == 0: + value_count = feature.value_counts(sort=True) + n_bins_ef = n_bins + else: + value_count = feature.drop_nulls().value_counts(sort=True) + n_bins_ef = n_bins - 1 - if n_bins_ef >= value_count.shape[0]: - n_bins = value_count.shape[0] + if n_bins_ef >= value_count.shape[0]: + n_bins = value_count.shape[0] + else: + count_name = "count" + n = value_count[count_name][n_bins_ef] + n_bins_tmp = value_count.filter(pl.col(count_name) >= n).shape[0] + if n_bins_tmp > n_bins_ef: + n_bins = value_count.filter(pl.col(count_name) > n).shape[0] else: - count_name = "count" - n = value_count[count_name][n_bins_ef] - n_bins_tmp = value_count.filter(pl.col(count_name) >= n).shape[0] - if n_bins_tmp > n_bins_ef: - n_bins = value_count.filter(pl.col(count_name) > n).shape[0] - else: - n_bins = n_bins_tmp + n_bins = n_bins_tmp - if feature.null_count() >= 1: - n_bins += 1 + if feature.null_count() >= 1: + n_bins += 1 - if n_bins == 0: - msg = ( - "Due to ties, the effective number of bins is 0. " - f"Consider to increase n_bins>={n_bins_tmp}." - ) - warnings.warn(msg, UserWarning, stacklevel=2) + if n_bins == 0: + msg = ( + "Due to ties, the effective number of bins is 0. " + f"Consider to increase n_bins>={n_bins_tmp}." + ) + warnings.warn(msg, UserWarning, stacklevel=2) + else: + # Binning + # If we have Null values, we should reserve one bin for it and reduce + # the effective number of bins by 1. + n_bins_ef = max(1, n_bins - (feature.null_count() >= 1)) + # We will need min and max anyway. + feature_min, feature_max = feature.min(), feature.max() + if bin_method == "quantile": + # We use method="inverted_cdf" instead of the default "linear" because + # "linear" produces as many unique values as before. + q = np.nanquantile( + feature, + # Improved rounding errors by using integers and dividing at the + # end as opposed to np.linspace with 1/n_bins step size. + q=np.arange(1, n_bins_ef) / n_bins_ef, + method="inverted_cdf", + ) + bin_edges = np.unique(q) # Some quantiles might be the same. + else: + # Uniform + f_range = feature_max - feature_min + bin_edges = feature_min + f_range * np.arange(1, n_bins_ef) / n_bins_ef + # We want: bins[i-1] < x <= bins[i] + f_binned = np.digitize(feature, bins=bin_edges, right=True) + # The full bin edges also include min and max of the feature. + if bin_edges.size == 0: + bin_edges = np.r_[feature_min, feature_max] else: - # Binning - # If we have Null values, we should reserve one bin for it and reduce - # the effective number of bins by 1. - n_bins_ef = max(1, n_bins - (feature.null_count() >= 1)) - # We will need min and max anyway. - feature_min, feature_max = feature.min(), feature.max() - if bin_method == "quantile": - # We use method="inverted_cdf" instead of the default "linear" because - # "linear" produces as many unique values as before. - q = np.nanquantile( + bin_edges = np.r_[feature_min, bin_edges, feature_max] + # This is quite a hack with numpy strides and views. We want to accomplish + # bin_edges = [[value0, value1], [value1, value2], [value2, value3], ..] + bin_edges = np.lib.stride_tricks.as_strided( + bin_edges, (bin_edges.shape[0] - 1, 2), bin_edges.strides * 2 + ) + # Back to the binned feature. + # Now, we insert Null values again at the original places. + f_binned = ( + pl.LazyFrame( + [ feature, - # Improved rounding errors by using integers and dividing at the - # end as opposed to np.linspace with 1/n_bins step size. 
- q=np.arange(1, n_bins_ef) / n_bins_ef, - method="inverted_cdf", - ) - bin_edges = np.unique(q) # Some quantiles might be the same. - else: - # Uniform - f_range = feature_max - feature_min - bin_edges = feature_min + f_range * np.arange(1, n_bins_ef) / n_bins_ef - # We want: bins[i-1] < x <= bins[i] - f_binned = np.digitize(feature, bins=bin_edges, right=True) - # The full bin edges also include min and max of the feature. - if bin_edges.size == 0: - bin_edges = np.r_[feature_min, feature_max] - else: - bin_edges = np.r_[feature_min, bin_edges, feature_max] - # This is quite a hack with numpy strides and views. We want to accomplish - # bin_edges = [[value0, value1], [value1, value2], [value2, value3], ..] - bin_edges = np.lib.stride_tricks.as_strided( - bin_edges, (bin_edges.shape[0] - 1, 2), bin_edges.strides * 2 + pl.Series("__f_binned", f_binned, dtype=feature.dtype), + pl.Series( + "__bin_edges", + bin_edges[f_binned], + dtype=pl.Array(pl.Float64, 2), + ), + ] ) - # Back to the binned feature. - # Now, we insert Null values again at the original places. - f_binned = ( - pl.LazyFrame( - [ - feature, - pl.Series("__f_binned", f_binned, dtype=feature.dtype), - pl.Series( - "__bin_edges", - bin_edges[f_binned], - dtype=pl.Array(pl.Float64, 2), - ), - ] - ) - .select( - pl.when(pl.col(feature_name).is_null()) - .then(None) - .otherwise(pl.col("__f_binned")) - .alias("bin"), - pl.when(pl.col(feature_name).is_null()) - .then(None) - .otherwise(pl.col("__bin_edges")) - .alias("bin_edges"), - ) - .collect() + .select( + pl.when(pl.col(feature_name).is_null()) + .then(None) + .otherwise(pl.col("__f_binned")) + .alias("bin"), + pl.when(pl.col(feature_name).is_null()) + .then(None) + .otherwise(pl.col("__bin_edges")) + .alias("bin_edges"), ) - return feature, feature_name, is_categorical, is_string, n_bins, f_binned + .collect() + ) + return feature, n_bins, f_binned diff --git a/src/model_diagnostics/calibration/identification.py b/src/model_diagnostics/calibration/identification.py index 1d1a80e..a42702a 100644 --- a/src/model_diagnostics/calibration/identification.py +++ b/src/model_diagnostics/calibration/identification.py @@ -9,6 +9,7 @@ from model_diagnostics._utils.array import ( get_second_dimension, get_sorted_array_names, + length_of_first_dimension, length_of_second_dimension, validate_2_arrays, validate_same_first_dimension, @@ -270,17 +271,25 @@ def compute_bias( else: w = np.ones_like(y_obs, dtype=float) + n_obs = length_of_first_dimension(y_pred) df_list = [] with pl.StringCache(): - feature, feature_name, is_categorical, is_string, n_bins, f_binned = ( - bin_feature( + feature_name = None + if feature is not None: + feature, n_bins, f_binned = bin_feature( feature=feature, feature_name=None, - y_obs=y_obs, + n_obs=n_obs, n_bins=n_bins, bin_method=bin_method, ) - ) + feature_name = feature.name + is_cat_or_string = feature.dtype in [ + pl.Categorical, + pl.Enum, + pl.Utf8, + pl.Object, + ] for i in range(len(pred_names)): # Loop over columns of y_pred. @@ -333,7 +342,7 @@ def compute_bias( ).alias("variance"), ] - if is_categorical or is_string: + if is_cat_or_string: groupby_name = feature_name else: df = df.hstack([f_binned.get_column("bin")]) @@ -384,14 +393,6 @@ def compute_bias( ) ).collect() - # if is_categorical: - # # Pyarrow does not yet support sorting dictionary type arrays, - # # see https://github.com/apache/arrow/issues/29887 - # # We resort to pandas instead. 
- # import pyarrow as pa - # df = df.to_pandas().sort_values(feature_name) - # df = pa.Table.from_pandas(df) - # Add column with p-value of 2-sided t-test. # We explicitly convert "to_numpy", because otherwise we get: # RuntimeWarning: A builtin ctypes object gave a PEP3118 format string @@ -591,7 +592,7 @@ def compute_marginal( if feature_name is None: # X is completely ignored. - feature_input = None + feature_input = feature = None elif X is None: msg = ( "X must be a data container like a (polars) dataframe or an (numpy) array." @@ -608,22 +609,24 @@ def compute_marginal( feature_index = X_names.index(feature_name) feature_input = get_second_dimension(X, feature_index) + n_obs = length_of_first_dimension(y_pred) df_list = [] with pl.StringCache(): - ( - feature, - feature_name, - is_categorical, - is_string, - n_bins, - f_binned, - ) = bin_feature( - feature=feature_input, - feature_name=feature_name, - y_obs=y_obs, - n_bins=n_bins, - bin_method=bin_method, - ) + if feature_input is not None: + feature, n_bins, f_binned = bin_feature( + feature=feature_input, + feature_name=feature_name, + n_obs=n_obs, + n_bins=n_bins, + bin_method=bin_method, + ) + feature_name = feature.name + is_cat_or_string = feature.dtype in [ + pl.Categorical, + pl.Enum, + pl.Utf8, + pl.Object, + ] for i in range(len(pred_names)): # Loop over columns of y_pred. @@ -679,7 +682,7 @@ def compute_marginal( ), ] - if is_categorical or is_string: + if is_cat_or_string: groupby_name = feature_name else: # We also add the bin edges. @@ -756,12 +759,12 @@ def compute_marginal( ] + ( [] - if is_categorical or is_string + if is_cat_or_string else [pl.col("bin_edges"), pl.col("__feature_std")] ) ).collect() - if not is_categorical and not is_string: + if not is_cat_or_string: df = df.with_columns( pl.col("bin_edges") .arr.first() @@ -808,7 +811,7 @@ def compute_marginal( "count", "weights", ] - if feature_name in df.columns and not is_categorical and not is_string: + if feature_name in df.columns and not is_cat_or_string: col_selection += ["bin_edges"] if with_pd: col_selection += ["partial_dependence"] diff --git a/src/model_diagnostics/calibration/tests/test_identification.py b/src/model_diagnostics/calibration/tests/test_identification.py index 83691f7..8c63635 100644 --- a/src/model_diagnostics/calibration/tests/test_identification.py +++ b/src/model_diagnostics/calibration/tests/test_identification.py @@ -515,10 +515,10 @@ def test_compute_bias_raises_weights_shape(): def test_compute_bias_raises_bin_method(): - y_obs, y_pred = np.arange(20), np.arange(20) + y_obs, y_pred, feature = np.arange(5), np.arange(5), np.arange(5) msg = "Parameter bin_method must be either 'quantile' or ''uniform'" with pytest.raises(ValueError, match=msg): - compute_bias(y_obs, y_pred, n_bins=5, bin_method=None) + compute_bias(y_obs, y_pred, feature, n_bins=5, bin_method=None) @pytest.mark.parametrize(
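
Editor's usage sketch, not part of the committed patch: after this change,
bin_feature no longer accepts feature=None and validates the length of the
feature against an explicit n_obs instead of y_obs; callers such as
compute_bias and compute_marginal now guard the feature-is-None case
themselves and take feature_name from the returned series. A minimal sketch
of the new internal calling convention follows; the input array is made up
for illustration:

    import numpy as np
    import polars as pl

    from model_diagnostics._utils.binning import bin_feature

    # Hypothetical feature column with a NaN, which bin_feature treats as Null.
    my_feature = np.array([1.0, 2.0, 2.0, 3.0, np.nan])

    # Per the comment in bin_feature, the possible creation of pl.Categorical
    # values must happen under polars' StringCache.
    with pl.StringCache():
        feature, n_bins, f_binned = bin_feature(
            feature=my_feature,     # None is no longer handled inside
            feature_name=None,      # a default name is derived via array_name
            n_obs=len(my_feature),  # replaces the former y_obs argument
            n_bins=3,
            bin_method="quantile",
        )

    # f_binned is a pl.DataFrame with columns "bin" and "bin_edges";
    # rows where the feature is Null get Null in both columns.
    print(feature.name, n_bins, f_binned)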