From bad03ebdac060300da8e3acb8782d92bfd2ff413 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Mon, 4 Nov 2024 13:06:03 +0100
Subject: [PATCH] MNT simplify internal bin_feature a bit (#183)

---
 src/model_diagnostics/_utils/binning.py      | 285 +++++++++---------
 .../calibration/identification.py            |  67 ++--
 .../calibration/tests/test_identification.py |   4 +-
 3 files changed, 176 insertions(+), 180 deletions(-)

diff --git a/src/model_diagnostics/_utils/binning.py b/src/model_diagnostics/_utils/binning.py
index 1f5b5c7..3250103 100644
--- a/src/model_diagnostics/_utils/binning.py
+++ b/src/model_diagnostics/_utils/binning.py
@@ -9,14 +9,14 @@
 from model_diagnostics._utils.array import (
     array_name,
     is_pandas_series,
-    validate_same_first_dimension,
+    length_of_first_dimension,
 )


 def bin_feature(
     feature: Optional[Union[npt.ArrayLike, pl.Series]],
     feature_name: Optional[Union[int, str]],
-    y_obs: npt.ArrayLike,
+    n_obs: int,
     n_bins: int = 10,
     bin_method: str = "quantile",
 ):
@@ -26,13 +26,12 @@ def bin_feature(

     Parameters
     ----------
-    feature : array-like of shape (n_obs) or None
+    feature : array-like of shape (n_obs)
         Some feature column.
     feature_name : int, str or None
         Name of the feature.
-    y_obs : array-like of shape (n_obs)
-        Observed values of the response variable.
-        Only needed for validation of dimensions of feature.
+    n_obs : int
+        The expected length of the first dimension of feature.
     n_bins : int
         The number of bins for numerical features and the maximal number of (most
         frequent) categories shown for categorical features. Due to ties, the effective
@@ -48,12 +47,6 @@ def bin_feature(
     -------
     feature : pl.Series or None
         The polars.Series version of the feature.
-    feature_name : str
-        The name of the feature.
-    is_categorical : bool
-        True if feature is categorical or enum.
-    is_string : bool
-        True if feature is a string type.
     n_bins : int
         Effective number of bins.
     f_binned : pl.DataFrame or None
@@ -74,147 +67,147 @@ def bin_feature(
         )
         raise ValueError(msg)

-    if feature is None:
-        # TODO: Remove this branch.
-        feature_name = None
-    else:
-        if isinstance(feature_name, int):
-            default = f"feature {feature_name}"
-        else:
-            default = "feature"
-        feature_name = array_name(feature, default=default)
-        # The following statement, i.e. possibly the creation of a pl.Categorical,
-        # MUST be under the StringCache context manager!
-        try:
-            feature = pl.Series(name=feature_name, values=feature)
-        except ImportError:
-            # FIXME: pyarrow not installed
-            # For non numpy-backed columns, pyarrow is needed. Here we handle the case
-            # where pyarrow is not installed and such a pandas extention array is
-            # passed, e.g. with CategoricalDtype.
-            if is_pandas_series(feature):
-                pandas = sys.modules["pandas"]
-                is_pandas_categorical = isinstance(
-                    feature.dtype,  # type: ignore
-                    pandas.CategoricalDtype,
-                )
-                feature = pl.from_dataframe(feature.to_frame(name="0"))[:, 0]  # type: ignore
-                if is_pandas_categorical and isinstance(feature.dtype, pl.Enum):
-                    # Pandas categoricals usually get mapped to polars categoricals.
-                    # But this code path gives pl.Enum.
-                    feature = feature.cast(pl.Categorical)
-            else:
-                raise  # re-raises the ImportError
-        validate_same_first_dimension(y_obs, feature)
-        if feature.dtype in [pl.Categorical, pl.Enum]:
-            is_categorical = True
-        elif feature.dtype in [pl.Utf8, pl.Object]:
-            # We could convert strings to categoricals.
-            is_string = True
-        elif feature.dtype.is_float():
-            # We treat NaN as Null values, numpy will see a Null as a NaN.
-            feature = feature.fill_nan(None)
+    default = f"feature {feature_name}" if isinstance(feature_name, int) else "feature"
+    feature_name = array_name(feature, default=default)
+    # The following statement, i.e. possibly the creation of a pl.Categorical,
+    # MUST be under the StringCache context manager!
+    try:
+        feature = pl.Series(name=feature_name, values=feature)
+    except ImportError:
+        # FIXME: pyarrow not installed
+        # For non numpy-backed columns, pyarrow is needed. Here we handle the case
+        # where pyarrow is not installed and such a pandas extension array is
+        # passed, e.g. with CategoricalDtype.
+        if is_pandas_series(feature):
+            pandas = sys.modules["pandas"]
+            is_pandas_categorical = isinstance(
+                feature.dtype,  # type: ignore
+                pandas.CategoricalDtype,
+            )
+            feature = pl.from_dataframe(
+                feature.to_frame(name=feature_name)  # type: ignore
+            )[:, 0]
+            if is_pandas_categorical and isinstance(feature.dtype, pl.Enum):
+                # Pandas categoricals usually get mapped to polars categoricals.
+                # But this code path gives pl.Enum.
+                feature = feature.cast(pl.Categorical)
         else:
-            # integers
-            pass
+            raise  # re-raises the ImportError
+    if length_of_first_dimension(feature) != n_obs:
+        msg = (
+            f"The feature array {feature_name} does not have length {n_obs} of its"
+            " first dimension."
+        )
+        raise ValueError(msg)
+    if feature.dtype in [pl.Categorical, pl.Enum]:
+        is_categorical = True
+    elif feature.dtype in [pl.Utf8, pl.Object]:
+        # We could convert strings to categoricals.
+        is_string = True
+    elif feature.dtype.is_float():
+        # We treat NaN as Null values, numpy will see a Null as a NaN.
+        feature = feature.fill_nan(None)
+    else:
+        # integers
+        pass

-        if is_categorical or is_string:
-            # For categorical and string features, knowing the frequency table in
-            # advance makes life easier in order to make results consistent.
-            # Consider
-            # feature count
-            #   "a"     3
-            #   "b"     2
-            #   "c"     2
-            #   "d"     1
-            # with n_bins = 2. As we want the effective number of bins to be at
-            # most n_bins, we want, in the above case, only "a" in the final
-            # result. Therefore, we need to internally decrease n_bins to 1.
-            if feature.null_count() == 0:
-                value_count = feature.value_counts(sort=True)
-                n_bins_ef = n_bins
-            else:
-                value_count = feature.drop_nulls().value_counts(sort=True)
-                n_bins_ef = n_bins - 1
+    if is_categorical or is_string:
+        # For categorical and string features, knowing the frequency table in
+        # advance makes life easier in order to make results consistent.
+        # Consider
+        # feature count
+        #   "a"     3
+        #   "b"     2
+        #   "c"     2
+        #   "d"     1
+        # with n_bins = 2. As we want the effective number of bins to be at
+        # most n_bins, we want, in the above case, only "a" in the final
+        # result. Therefore, we need to internally decrease n_bins to 1.
+ if feature.null_count() == 0: + value_count = feature.value_counts(sort=True) + n_bins_ef = n_bins + else: + value_count = feature.drop_nulls().value_counts(sort=True) + n_bins_ef = n_bins - 1 - if n_bins_ef >= value_count.shape[0]: - n_bins = value_count.shape[0] + if n_bins_ef >= value_count.shape[0]: + n_bins = value_count.shape[0] + else: + count_name = "count" + n = value_count[count_name][n_bins_ef] + n_bins_tmp = value_count.filter(pl.col(count_name) >= n).shape[0] + if n_bins_tmp > n_bins_ef: + n_bins = value_count.filter(pl.col(count_name) > n).shape[0] else: - count_name = "count" - n = value_count[count_name][n_bins_ef] - n_bins_tmp = value_count.filter(pl.col(count_name) >= n).shape[0] - if n_bins_tmp > n_bins_ef: - n_bins = value_count.filter(pl.col(count_name) > n).shape[0] - else: - n_bins = n_bins_tmp + n_bins = n_bins_tmp - if feature.null_count() >= 1: - n_bins += 1 + if feature.null_count() >= 1: + n_bins += 1 - if n_bins == 0: - msg = ( - "Due to ties, the effective number of bins is 0. " - f"Consider to increase n_bins>={n_bins_tmp}." - ) - warnings.warn(msg, UserWarning, stacklevel=2) + if n_bins == 0: + msg = ( + "Due to ties, the effective number of bins is 0. " + f"Consider to increase n_bins>={n_bins_tmp}." + ) + warnings.warn(msg, UserWarning, stacklevel=2) + else: + # Binning + # If we have Null values, we should reserve one bin for it and reduce + # the effective number of bins by 1. + n_bins_ef = max(1, n_bins - (feature.null_count() >= 1)) + # We will need min and max anyway. + feature_min, feature_max = feature.min(), feature.max() + if bin_method == "quantile": + # We use method="inverted_cdf" instead of the default "linear" because + # "linear" produces as many unique values as before. + q = np.nanquantile( + feature, + # Improved rounding errors by using integers and dividing at the + # end as opposed to np.linspace with 1/n_bins step size. + q=np.arange(1, n_bins_ef) / n_bins_ef, + method="inverted_cdf", + ) + bin_edges = np.unique(q) # Some quantiles might be the same. + else: + # Uniform + f_range = feature_max - feature_min + bin_edges = feature_min + f_range * np.arange(1, n_bins_ef) / n_bins_ef + # We want: bins[i-1] < x <= bins[i] + f_binned = np.digitize(feature, bins=bin_edges, right=True) + # The full bin edges also include min and max of the feature. + if bin_edges.size == 0: + bin_edges = np.r_[feature_min, feature_max] else: - # Binning - # If we have Null values, we should reserve one bin for it and reduce - # the effective number of bins by 1. - n_bins_ef = max(1, n_bins - (feature.null_count() >= 1)) - # We will need min and max anyway. - feature_min, feature_max = feature.min(), feature.max() - if bin_method == "quantile": - # We use method="inverted_cdf" instead of the default "linear" because - # "linear" produces as many unique values as before. - q = np.nanquantile( + bin_edges = np.r_[feature_min, bin_edges, feature_max] + # This is quite a hack with numpy strides and views. We want to accomplish + # bin_edges = [[value0, value1], [value1, value2], [value2, value3], ..] + bin_edges = np.lib.stride_tricks.as_strided( + bin_edges, (bin_edges.shape[0] - 1, 2), bin_edges.strides * 2 + ) + # Back to the binned feature. + # Now, we insert Null values again at the original places. + f_binned = ( + pl.LazyFrame( + [ feature, - # Improved rounding errors by using integers and dividing at the - # end as opposed to np.linspace with 1/n_bins step size. 
- q=np.arange(1, n_bins_ef) / n_bins_ef, - method="inverted_cdf", - ) - bin_edges = np.unique(q) # Some quantiles might be the same. - else: - # Uniform - f_range = feature_max - feature_min - bin_edges = feature_min + f_range * np.arange(1, n_bins_ef) / n_bins_ef - # We want: bins[i-1] < x <= bins[i] - f_binned = np.digitize(feature, bins=bin_edges, right=True) - # The full bin edges also include min and max of the feature. - if bin_edges.size == 0: - bin_edges = np.r_[feature_min, feature_max] - else: - bin_edges = np.r_[feature_min, bin_edges, feature_max] - # This is quite a hack with numpy strides and views. We want to accomplish - # bin_edges = [[value0, value1], [value1, value2], [value2, value3], ..] - bin_edges = np.lib.stride_tricks.as_strided( - bin_edges, (bin_edges.shape[0] - 1, 2), bin_edges.strides * 2 + pl.Series("__f_binned", f_binned, dtype=feature.dtype), + pl.Series( + "__bin_edges", + bin_edges[f_binned], + dtype=pl.Array(pl.Float64, 2), + ), + ] ) - # Back to the binned feature. - # Now, we insert Null values again at the original places. - f_binned = ( - pl.LazyFrame( - [ - feature, - pl.Series("__f_binned", f_binned, dtype=feature.dtype), - pl.Series( - "__bin_edges", - bin_edges[f_binned], - dtype=pl.Array(pl.Float64, 2), - ), - ] - ) - .select( - pl.when(pl.col(feature_name).is_null()) - .then(None) - .otherwise(pl.col("__f_binned")) - .alias("bin"), - pl.when(pl.col(feature_name).is_null()) - .then(None) - .otherwise(pl.col("__bin_edges")) - .alias("bin_edges"), - ) - .collect() + .select( + pl.when(pl.col(feature_name).is_null()) + .then(None) + .otherwise(pl.col("__f_binned")) + .alias("bin"), + pl.when(pl.col(feature_name).is_null()) + .then(None) + .otherwise(pl.col("__bin_edges")) + .alias("bin_edges"), ) - return feature, feature_name, is_categorical, is_string, n_bins, f_binned + .collect() + ) + return feature, n_bins, f_binned diff --git a/src/model_diagnostics/calibration/identification.py b/src/model_diagnostics/calibration/identification.py index 1d1a80e..a42702a 100644 --- a/src/model_diagnostics/calibration/identification.py +++ b/src/model_diagnostics/calibration/identification.py @@ -9,6 +9,7 @@ from model_diagnostics._utils.array import ( get_second_dimension, get_sorted_array_names, + length_of_first_dimension, length_of_second_dimension, validate_2_arrays, validate_same_first_dimension, @@ -270,17 +271,25 @@ def compute_bias( else: w = np.ones_like(y_obs, dtype=float) + n_obs = length_of_first_dimension(y_pred) df_list = [] with pl.StringCache(): - feature, feature_name, is_categorical, is_string, n_bins, f_binned = ( - bin_feature( + feature_name = None + if feature is not None: + feature, n_bins, f_binned = bin_feature( feature=feature, feature_name=None, - y_obs=y_obs, + n_obs=n_obs, n_bins=n_bins, bin_method=bin_method, ) - ) + feature_name = feature.name + is_cat_or_string = feature.dtype in [ + pl.Categorical, + pl.Enum, + pl.Utf8, + pl.Object, + ] for i in range(len(pred_names)): # Loop over columns of y_pred. @@ -333,7 +342,7 @@ def compute_bias( ).alias("variance"), ] - if is_categorical or is_string: + if is_cat_or_string: groupby_name = feature_name else: df = df.hstack([f_binned.get_column("bin")]) @@ -384,14 +393,6 @@ def compute_bias( ) ).collect() - # if is_categorical: - # # Pyarrow does not yet support sorting dictionary type arrays, - # # see https://github.com/apache/arrow/issues/29887 - # # We resort to pandas instead. 
- # import pyarrow as pa - # df = df.to_pandas().sort_values(feature_name) - # df = pa.Table.from_pandas(df) - # Add column with p-value of 2-sided t-test. # We explicitly convert "to_numpy", because otherwise we get: # RuntimeWarning: A builtin ctypes object gave a PEP3118 format string @@ -591,7 +592,7 @@ def compute_marginal( if feature_name is None: # X is completely ignored. - feature_input = None + feature_input = feature = None elif X is None: msg = ( "X must be a data container like a (polars) dataframe or an (numpy) array." @@ -608,22 +609,24 @@ def compute_marginal( feature_index = X_names.index(feature_name) feature_input = get_second_dimension(X, feature_index) + n_obs = length_of_first_dimension(y_pred) df_list = [] with pl.StringCache(): - ( - feature, - feature_name, - is_categorical, - is_string, - n_bins, - f_binned, - ) = bin_feature( - feature=feature_input, - feature_name=feature_name, - y_obs=y_obs, - n_bins=n_bins, - bin_method=bin_method, - ) + if feature_input is not None: + feature, n_bins, f_binned = bin_feature( + feature=feature_input, + feature_name=feature_name, + n_obs=n_obs, + n_bins=n_bins, + bin_method=bin_method, + ) + feature_name = feature.name + is_cat_or_string = feature.dtype in [ + pl.Categorical, + pl.Enum, + pl.Utf8, + pl.Object, + ] for i in range(len(pred_names)): # Loop over columns of y_pred. @@ -679,7 +682,7 @@ def compute_marginal( ), ] - if is_categorical or is_string: + if is_cat_or_string: groupby_name = feature_name else: # We also add the bin edges. @@ -756,12 +759,12 @@ def compute_marginal( ] + ( [] - if is_categorical or is_string + if is_cat_or_string else [pl.col("bin_edges"), pl.col("__feature_std")] ) ).collect() - if not is_categorical and not is_string: + if not is_cat_or_string: df = df.with_columns( pl.col("bin_edges") .arr.first() @@ -808,7 +811,7 @@ def compute_marginal( "count", "weights", ] - if feature_name in df.columns and not is_categorical and not is_string: + if feature_name in df.columns and not is_cat_or_string: col_selection += ["bin_edges"] if with_pd: col_selection += ["partial_dependence"] diff --git a/src/model_diagnostics/calibration/tests/test_identification.py b/src/model_diagnostics/calibration/tests/test_identification.py index 83691f7..8c63635 100644 --- a/src/model_diagnostics/calibration/tests/test_identification.py +++ b/src/model_diagnostics/calibration/tests/test_identification.py @@ -515,10 +515,10 @@ def test_compute_bias_raises_weights_shape(): def test_compute_bias_raises_bin_method(): - y_obs, y_pred = np.arange(20), np.arange(20) + y_obs, y_pred, feature = np.arange(5), np.arange(5), np.arange(5) msg = "Parameter bin_method must be either 'quantile' or ''uniform'" with pytest.raises(ValueError, match=msg): - compute_bias(y_obs, y_pred, n_bins=5, bin_method=None) + compute_bias(y_obs, y_pred, feature, n_bins=5, bin_method=None) @pytest.mark.parametrize(
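
Editor's usage sketch, not part of the committed patch: after this change,
bin_feature no longer accepts feature=None and validates the length of the
feature against an explicit n_obs instead of y_obs; callers such as
compute_bias and compute_marginal now guard the feature-is-None case
themselves and take feature_name from the returned series. A minimal sketch
of the new internal calling convention follows; the input array is made up
for illustration:

    import numpy as np
    import polars as pl

    from model_diagnostics._utils.binning import bin_feature

    # Hypothetical feature column with a NaN, which bin_feature treats as Null.
    my_feature = np.array([1.0, 2.0, 2.0, 3.0, np.nan])

    # Per the comment in bin_feature, the possible creation of pl.Categorical
    # values must happen under polars' StringCache.
    with pl.StringCache():
        feature, n_bins, f_binned = bin_feature(
            feature=my_feature,     # None is no longer handled inside
            feature_name=None,      # a default name is derived via array_name
            n_obs=len(my_feature),  # replaces the former y_obs argument
            n_bins=3,
            bin_method="quantile",
        )

    # f_binned is a pl.DataFrame with columns "bin" and "bin_edges";
    # rows where the feature is Null get Null in both columns.
    print(feature.name, n_bins, f_binned)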