Support for missing values in categorical columns (#684)

* Delegate column naming to tabmat
* Add tests
* More tests
* Test for dropping complete categories
* Add docstrings for new argument
* Add changelog entry
* Convert to pandas at the correct place
* Reorganize converting from pandas
* Remove xfail from test
* Implement missing categorical support
* Add test
* Solve adding missing category when predicting
* Apply Matthias' suggestions
* Add changelog entry
Quantco · Aug 28, 2023 · 91e0408 · 91e0408
1 parent 003fcec
commit 91e0408
Show file tree

Hide file tree

Showing 6 changed files with 208 additions and 5 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -15,6 +15,7 @@ Changelog
 - Added a formula interface for specifying models.
 - Improved feature name handling. Feature names are now created for non-pandas input matrices, too. Furthermore, the format of categorical features can be specified by the user.
 - Term names are now stored in the model's attributes. This is useful for categorical features, where they refer to the whole variable, not just single levels.
+- Added more options for treating missing values in categorical columns. Missing values can raise a ``ValueError`` (``"fail"``), be treated as all-zero indicators (``"zero"``), or be represented as a new category (``"convert"``).
 - `meth:GeneralizedLinearRegressor.wald_test` can now perform tests based on a formula string and term names.
 
 2.6.0 - UNRELEASED

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
@@ -63,7 +63,7 @@
     _least_squares_solver,
     _trust_constr_solver,
 )
-from ._util import _align_df_categories, _safe_toarray
+from ._util import _add_missing_categories, _align_df_categories, _safe_toarray
 
 _float_itemsize_to_dtype = {8: np.float64, 4: np.float32, 2: np.float16}
 
@@ -777,6 +777,8 @@ def __init__(
         formula: Optional[FormulaSpec] = None,
         interaction_separator: str = ":",
         categorical_format: str = "{name}[{category}]",
+        cat_missing_method: str = "fail",
+        cat_missing_name: str = "(MISSING)",
     ):
         self.l1_ratio = l1_ratio
         self.P1 = P1
@@ -812,6 +814,8 @@ def __init__(
         self.formula = formula
         self.interaction_separator = interaction_separator
         self.categorical_format = categorical_format
+        self.cat_missing_method = cat_missing_method
+        self.cat_missing_name = cat_missing_name
 
     @property
     def family_instance(self) -> ExponentialDispersionModel:
@@ -893,11 +897,20 @@ def _convert_from_pandas(self, df: pd.DataFrame) -> tm.MatrixBase:
 
         if hasattr(self, "feature_dtypes_"):
             df = _align_df_categories(df, self.feature_dtypes_)
+            if self.cat_missing_method == "convert":
+                df = _add_missing_categories(
+                    df=df,
+                    dtypes=self.feature_dtypes_,
+                    feature_names=self.feature_names_,
+                    cat_missing_name=self.cat_missing_name,
+                    categorical_format=self.categorical_format,
+                )
 
         X = tm.from_pandas(
             df,
             drop_first=self.drop_first,
             categorical_format=self.categorical_format,
+            cat_missing_method=self.cat_missing_method,
         )
 
         return X
@@ -2654,6 +2667,8 @@ def _expand_categorical_penalties(penalty, X, drop_first):
                     X,
                     drop_first=self.drop_first,
                     categorical_format=self.categorical_format,
+                    cat_missing_method=self.cat_missing_method,
+                    cat_missing_name=self.cat_missing_name,
                 )
 
         if y is None:
@@ -3032,12 +3047,23 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
         Has to include the placeholders ``{name}`` and ``{category}``.
         Only used if ``formula`` is not ``None``.
 
-    categorical_features : str, optional (default = "{name}[{category}]")
+    categorical_format : str, optional, default='{name}[{category}]'
         Format string for categorical features. The format string should
         contain the placeholder ``{name}`` for the feature name and
         ``{category}`` for the category name. Only used if ``X`` is a pandas
         DataFrame.
 
+    cat_missing_method : {'fail', 'zero', 'convert'}, default='fail'
+        How to handle missing values in categorical columns. Only used if ``X``
+        is a pandas data frame.
+        - if 'fail', raise an error if there are missing values
+        - if 'zero', missing values will represent all-zero indicator columns.
+        - if 'convert', missing values will be converted to the ``cat_missing_name``
+          category.
+    cat_missing_name : str, default='(MISSING)'
+        Name of the category to which missing values will be converted if
+        ``cat_missing_method='convert'``.  Only used if ``X`` is a pandas data frame.
+
     Attributes
     ----------
     coef_ : numpy.array, shape (n_features,)
@@ -3124,6 +3150,8 @@ def __init__(
         formula: Optional[FormulaSpec] = None,
         interaction_separator: str = ":",
         categorical_format: str = "{name}[{category}]",
+        cat_missing_method: str = "fail",
+        cat_missing_name: str = "(MISSING)",
     ):
         self.alphas = alphas
         self.alpha = alpha
@@ -3162,6 +3190,8 @@ def __init__(
             formula=formula,
             interaction_separator=interaction_separator,
             categorical_format=categorical_format,
+            cat_missing_method=cat_missing_method,
+            cat_missing_name=cat_missing_name,
         )
 
     def _validate_hyperparameters(self) -> None:

diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py
@@ -299,11 +299,22 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase):
         If true, then the expected information matrix is computed by default.
         Only relevant when computing robust standard errors.
 
-    categorical_features : str, optional (default = "{name}[{category}]")
+    categorical_format : str, optional (default = "{name}[{category}]")
         Format string for categorical features. The format string should
         contain the placeholder ``{name}`` for the feature name and
         ``{category}`` for the category name. Only used if ``X`` is a pandas
         DataFrame.
+
+    cat_missing_method : {'fail', 'zero', 'convert'}, default='fail'
+        How to handle missing values in categorical columns. Only used if ``X``
+        is a pandas data frame.
+        - if 'fail', raise an error if there are missing values
+        - if 'zero', missing values will represent all-zero indicator columns.
+        - if 'convert', missing values will be converted to the ``cat_missing_name``
+          category.
+    cat_missing_name : str, default='(MISSING)'
+        Name of the category to which missing values will be converted if
+        ``cat_missing_method='convert'``.  Only used if ``X`` is a pandas data frame.
     """
 
     def __init__(
@@ -344,6 +355,8 @@ def __init__(
         formula: Optional[FormulaSpec] = None,
         interaction_separator: str = ":",
         categorical_format: str = "{name}[{category}]",
+        cat_missing_method: str = "fail",
+        cat_missing_name: str = "(MISSING)",
     ):
         self.alphas = alphas
         self.cv = cv
@@ -382,6 +395,8 @@ def __init__(
             formula=formula,
             interaction_separator=interaction_separator,
             categorical_format=categorical_format,
+            cat_missing_method=cat_missing_method,
+            cat_missing_name=cat_missing_name,
         )
 
     def _validate_hyperparameters(self) -> None:

diff --git a/src/glum/_util.py b/src/glum/_util.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Union
+from typing import Sequence, Union
 
 import numpy as np
 import pandas as pd
@@ -53,6 +53,44 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
     return df
 
 
+def _add_missing_categories(
+    df,
+    dtypes,
+    feature_names: Sequence[str],
+    categorical_format: str,
+    cat_missing_name: str,
+) -> pd.DataFrame:
+    """Re-create the explicit "missing" level on categorical columns of ``df``.
+
+    A categorical column listed in ``dtypes`` is touched only if the feature
+    name obtained by formatting ``categorical_format`` with the column name and
+    ``cat_missing_name`` occurs in ``feature_names``. Such a column gets
+    ``cat_missing_name`` appended to its categories, and any missing values in
+    it are replaced by that category. All other columns are left untouched.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Data whose categorical columns should be adjusted.
+    dtypes : dict or pandas.Series
+        Object whose ``items()`` yields ``(column, dtype)`` pairs, e.g.
+        ``DataFrame.dtypes``. Only entries with a categorical dtype whose
+        column is present in ``df`` are considered.
+    feature_names : sequence of str
+        Feature names that are searched for the formatted missing-category
+        name of each categorical column.
+    categorical_format : str
+        Format string with the placeholders ``{name}`` and ``{category}``.
+    cat_missing_name : str
+        Name of the category that represents missing values.
+
+    Returns
+    -------
+    pandas.DataFrame
+        ``df`` itself if no column had to be changed, otherwise a new data
+        frame in which the affected columns are replaced.
+
+    Raises
+    ------
+    TypeError
+        If ``df`` is not a pandas data frame.
+    ValueError
+        If a column that needs the missing category already contains a
+        category called ``cat_missing_name``.
+    """
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError(f"Expected `pandas.DataFrame`; got {type(df)}.")
+
+    # Hash the names once so that the per-column lookup below is O(1).
+    known_features = set(feature_names)
+
+    # ``pd.api.types.is_categorical_dtype`` is deprecated as of pandas 2.1;
+    # the isinstance check is the replacement recommended by pandas.
+    categorical_columns = [
+        column
+        for column, dtype in dtypes.items()
+        if isinstance(dtype, pd.CategoricalDtype) and (column in df)
+    ]
+
+    changed_columns = {}
+
+    for column in categorical_columns:
+        missing_feature = categorical_format.format(
+            name=column, category=cat_missing_name
+        )
+        if missing_feature not in known_features:
+            # No missing-category feature is known for this column.
+            continue
+        if cat_missing_name in df[column].cat.categories:
+            raise ValueError(
+                f"Missing category {cat_missing_name} already exists in {column}."
+            )
+        _logger.info("Adding missing category %s to %s.", cat_missing_name, column)
+        changed_columns[column] = df[column].cat.add_categories(cat_missing_name)
+        if df[column].isnull().any():
+            changed_columns[column] = changed_columns[column].fillna(
+                cat_missing_name
+            )
+
+    if changed_columns:
+        df = df.assign(**changed_columns)
+
+    return df
+
+
 def _safe_lin_pred(
     X: Union[MatrixBase, StandardizedMatrix],
     coef: np.ndarray,

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
@@ -3147,3 +3147,43 @@ def test_formula_predict(get_mixed_data, formula):
     yhat_smf = model_smf.predict(data_unseen)
 
     np.testing.assert_almost_equal(yhat_formula, yhat_smf)
+
+
+@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
+def test_cat_missing(cat_missing_method):
+    """Fit and predict on categoricals with missing values for each method.
+
+    ``"fail"`` has to raise a ``ValueError`` when fitting. The other methods
+    have to fit, expose the expected feature names and matching coefficient
+    count, and be able to predict on unseen data that contains a missing value.
+    """
+    train = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),
+            "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]),
+        }
+    )
+    unseen = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical([1, pd.NA]),
+            "cat_2": pd.Categorical([1, 2]),
+        }
+    )
+    target = np.array([1, 2, 3, 4, 5])
+
+    estimator = GeneralizedLinearRegressor(
+        family="normal",
+        cat_missing_method=cat_missing_method,
+        drop_first=False,
+        fit_intercept=False,
+    )
+
+    if cat_missing_method == "fail":
+        with pytest.raises(ValueError):
+            estimator.fit(train, target)
+        return
+
+    estimator.fit(train, target)
+
+    if cat_missing_method == "convert":
+        # Each column gains a trailing indicator for the missing category.
+        expected_names = [
+            "cat_1[1]",
+            "cat_1[2]",
+            "cat_1[(MISSING)]",
+            "cat_2[1]",
+            "cat_2[2]",
+            "cat_2[(MISSING)]",
+        ]
+    else:
+        expected_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"]
+
+    np.testing.assert_array_equal(estimator.feature_names_, expected_names)
+    assert len(estimator.coef_) == len(expected_names)
+
+    estimator.predict(unseen)
diff --git a/tests/glm/test_utils.py b/tests/glm/test_utils.py
@@ -2,7 +2,7 @@
 import pandas as pd
 import pytest
 
-from glum._util import _align_df_categories
+from glum._util import _add_missing_categories, _align_df_categories
 
 
 @pytest.fixture()
@@ -96,3 +96,82 @@ def test_align_df_categories_missing_columns(df):
 def test_align_df_categories_not_df():
     with pytest.raises(TypeError):
         _align_df_categories(np.array([[0], [1]]), {"x0": np.float64})
+
+
+@pytest.fixture()
+def df_na():
+    """Two-row frame: one float column and three categoricals, one with a NA."""
+    columns = {
+        "num": np.array([0, 1], dtype="float64"),
+        "cat": pd.Categorical(["a", "b"]),
+        "cat_na": pd.Categorical(["a", pd.NA]),
+        "cat2": pd.Categorical(["a", "b"]),
+    }
+    return pd.DataFrame(columns)
+
+
+def test_add_missing_categories(df_na):
+    """Only categoricals with a known missing-level feature gain that level.
+
+    ``num`` is not categorical and ``cat2`` has no ``cat2[(M)]`` feature, so
+    both must come back unchanged; ``cat`` and ``cat_na`` must get ``(M)``.
+    """
+    known_names = [
+        "num",
+        "num[(M)]",
+        "cat[a]",
+        "cat[b]",
+        "cat[(M)]",
+        "cat_na[a]",
+        "cat_na[(M)]",
+        "cat2[a]",
+        "cat2[b]",
+    ]
+
+    result = _add_missing_categories(
+        df=df_na,
+        dtypes=df_na.dtypes,
+        feature_names=known_names,
+        categorical_format="{name}[{category}]",
+        cat_missing_name="(M)",
+    )
+
+    expected = pd.DataFrame(
+        {
+            "num": np.array([0, 1], dtype="float64"),
+            "cat": pd.Categorical(["a", "b"], categories=["a", "b", "(M)"]),
+            "cat_na": pd.Categorical(["a", "(M)"], categories=["a", "(M)"]),
+            "cat2": pd.Categorical(["a", "b"], categories=["a", "b"]),
+        }
+    )
+
+    pd.testing.assert_frame_equal(result, expected)
+
+
+def test_raise_on_existing_missing(df_na):
+    """A column that already carries the missing level triggers a ValueError."""
+    missing = "(M)"
+    # Snapshot the dtypes before ``cat_na`` is modified below.
+    original_dtypes = df_na.dtypes
+    known_names = [
+        "num",
+        "num[(M)]",
+        "cat[a]",
+        "cat[b]",
+        "cat[(M)]",
+        "cat_na[a]",
+        "cat_na[(M)]",
+        "cat2[a]",
+        "cat2[b]",
+    ]
+
+    # Turn the NA in ``cat_na`` into a regular "(M)" category.
+    df_na["cat_na"] = df_na["cat_na"].cat.add_categories(missing)
+    df_na.loc[df_na["cat_na"].isna(), "cat_na"] = missing
+
+    with pytest.raises(ValueError):
+        _add_missing_categories(
+            df=df_na,
+            dtypes=original_dtypes,
+            feature_names=known_names,
+            categorical_format="{name}[{category}]",
+            cat_missing_name=missing,
+        )