Skip to content

Commit

Permalink
Support for missing values in categorical columns (#684)
Browse files Browse the repository at this point in the history
* Delegate column naming to tabmat

* Add tests

* More tests

* Test for dropping complete categories

* Add docstrings for new argument

* Add changelog entry

* Convert to pandas at the correct place

* Reorganize converting from pandas

* Remove xfail from test

* Implement missing categorical support

* Add test

* Solve adding missing category when predicting

* Apply Matthias' suggestions

* Add changelog entry
  • Loading branch information
stanmart authored Aug 28, 2023
1 parent 003fcec commit 91e0408
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Changelog
- Added a formula interface for specifying models.
- Improved feature name handling. Feature names are now created for non-pandas input matrices, too. Furthermore, the format of categorical features can be specified by the user.
- Term names are now stored in the model's attributes. This is useful for categorical features, where they refer to the whole variable, not just single levels.
- Added more options for treating missing values in categorical columns. Missing values can either raise a `ValueError` (`"fail"`), be treated as all-zero indicators (`"zero"`), or be represented as a new category (`"convert"`).
- :meth:`GeneralizedLinearRegressor.wald_test` can now perform tests based on a formula string and term names.

2.6.0 - UNRELEASED
Expand Down
34 changes: 32 additions & 2 deletions src/glum/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
_least_squares_solver,
_trust_constr_solver,
)
from ._util import _align_df_categories, _safe_toarray
from ._util import _add_missing_categories, _align_df_categories, _safe_toarray

_float_itemsize_to_dtype = {8: np.float64, 4: np.float32, 2: np.float16}

Expand Down Expand Up @@ -777,6 +777,8 @@ def __init__(
formula: Optional[FormulaSpec] = None,
interaction_separator: str = ":",
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
):
self.l1_ratio = l1_ratio
self.P1 = P1
Expand Down Expand Up @@ -812,6 +814,8 @@ def __init__(
self.formula = formula
self.interaction_separator = interaction_separator
self.categorical_format = categorical_format
self.cat_missing_method = cat_missing_method
self.cat_missing_name = cat_missing_name

@property
def family_instance(self) -> ExponentialDispersionModel:
Expand Down Expand Up @@ -893,11 +897,20 @@ def _convert_from_pandas(self, df: pd.DataFrame) -> tm.MatrixBase:

if hasattr(self, "feature_dtypes_"):
df = _align_df_categories(df, self.feature_dtypes_)
if self.cat_missing_method == "convert":
df = _add_missing_categories(
df=df,
dtypes=self.feature_dtypes_,
feature_names=self.feature_names_,
cat_missing_name=self.cat_missing_name,
categorical_format=self.categorical_format,
)

X = tm.from_pandas(
df,
drop_first=self.drop_first,
categorical_format=self.categorical_format,
cat_missing_method=self.cat_missing_method,
)

return X
Expand Down Expand Up @@ -2654,6 +2667,8 @@ def _expand_categorical_penalties(penalty, X, drop_first):
X,
drop_first=self.drop_first,
categorical_format=self.categorical_format,
cat_missing_method=self.cat_missing_method,
cat_missing_name=self.cat_missing_name,
)

if y is None:
Expand Down Expand Up @@ -3032,12 +3047,23 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
Has to include the placeholders ``{name}`` and ``{category}``.
Only used if ``formula`` is not ``None``.
categorical_features : str, optional (default = "{name}[{category}]")
categorical_format : str, optional, default='{name}[{category}]'
Format string for categorical features. The format string should
contain the placeholder ``{name}`` for the feature name and
``{category}`` for the category name. Only used if ``X`` is a pandas
DataFrame.
cat_missing_method: str {'fail'|'zero'|'convert'}, default='fail'
How to handle missing values in categorical columns. Only used if ``X``
is a pandas data frame.
- if 'fail', raise an error if there are missing values
- if 'zero', missing values will represent all-zero indicator columns.
- if 'convert', missing values will be converted to the ``cat_missing_name``
category.
cat_missing_name: str, default='(MISSING)'
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame.
Attributes
----------
coef_ : numpy.array, shape (n_features,)
Expand Down Expand Up @@ -3124,6 +3150,8 @@ def __init__(
formula: Optional[FormulaSpec] = None,
interaction_separator: str = ":",
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
):
self.alphas = alphas
self.alpha = alpha
Expand Down Expand Up @@ -3162,6 +3190,8 @@ def __init__(
formula=formula,
interaction_separator=interaction_separator,
categorical_format=categorical_format,
cat_missing_method=cat_missing_method,
cat_missing_name=cat_missing_name,
)

def _validate_hyperparameters(self) -> None:
Expand Down
17 changes: 16 additions & 1 deletion src/glum/_glm_cv.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,11 +299,22 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase):
If true, then the expected information matrix is computed by default.
Only relevant when computing robust standard errors.
categorical_features : str, optional (default = "{name}[{category}]")
categorical_format : str, optional (default = "{name}[{category}]")
Format string for categorical features. The format string should
contain the placeholder ``{name}`` for the feature name and
``{category}`` for the category name. Only used if ``X`` is a pandas
DataFrame.
cat_missing_method: str {'fail'|'zero'|'convert'}, default='fail'
How to handle missing values in categorical columns. Only used if ``X``
is a pandas data frame.
- if 'fail', raise an error if there are missing values
- if 'zero', missing values will represent all-zero indicator columns.
- if 'convert', missing values will be converted to the ``cat_missing_name``
category.
cat_missing_name: str, default='(MISSING)'
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame.
"""

def __init__(
Expand Down Expand Up @@ -344,6 +355,8 @@ def __init__(
formula: Optional[FormulaSpec] = None,
interaction_separator: str = ":",
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
):
self.alphas = alphas
self.cv = cv
Expand Down Expand Up @@ -382,6 +395,8 @@ def __init__(
formula=formula,
interaction_separator=interaction_separator,
categorical_format=categorical_format,
cat_missing_method=cat_missing_method,
cat_missing_name=cat_missing_name,
)

def _validate_hyperparameters(self) -> None:
Expand Down
40 changes: 39 additions & 1 deletion src/glum/_util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from typing import Union
from typing import Sequence, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -53,6 +53,44 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
return df


def _add_missing_categories(
df,
dtypes,
feature_names: Sequence[str],
categorical_format: str,
cat_missing_name: str,
) -> pd.DataFrame:
if not isinstance(df, pd.DataFrame):
raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.")

changed_dtypes = {}

categorical_dtypes = [
column
for column, dtype in dtypes.items()
if pd.api.types.is_categorical_dtype(dtype) and (column in df)
]

for column in categorical_dtypes:
if (
categorical_format.format(name=column, category=cat_missing_name)
in feature_names
):
if cat_missing_name in df[column].cat.categories:
raise ValueError(
f"Missing category {cat_missing_name} already exists in {column}."
)
_logger.info(f"Adding missing category {cat_missing_name} to {column}.")
changed_dtypes[column] = df[column].cat.add_categories(cat_missing_name)
if df[column].isnull().any():
changed_dtypes[column] = changed_dtypes[column].fillna(cat_missing_name)

if changed_dtypes:
df = df.assign(**changed_dtypes)

return df


def _safe_lin_pred(
X: Union[MatrixBase, StandardizedMatrix],
coef: np.ndarray,
Expand Down
40 changes: 40 additions & 0 deletions tests/glm/test_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3147,3 +3147,43 @@ def test_formula_predict(get_mixed_data, formula):
yhat_smf = model_smf.predict(data_unseen)

np.testing.assert_almost_equal(yhat_formula, yhat_smf)


@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
def test_cat_missing(cat_missing_method):
    """Fit and predict on categorical data containing missing values.

    With ``"fail"`` fitting must raise ``ValueError``. With ``"zero"`` only
    the observed levels become features. With ``"convert"`` each column
    additionally gets a ``(MISSING)`` feature. In the non-failing cases the
    fitted model must also be able to predict on unseen data.
    """
    train = pd.DataFrame(
        {
            "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),
            "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]),
        }
    )
    unseen = pd.DataFrame(
        {
            "cat_1": pd.Categorical([1, pd.NA]),
            "cat_2": pd.Categorical([1, 2]),
        }
    )
    target = np.array([1, 2, 3, 4, 5])

    model = GeneralizedLinearRegressor(
        family="normal",
        cat_missing_method=cat_missing_method,
        drop_first=False,
        fit_intercept=False,
    )

    # Guard clause: the failing configuration never yields a fitted model.
    if cat_missing_method == "fail":
        with pytest.raises(ValueError):
            model.fit(train, target)
        return

    model.fit(train, target)

    if cat_missing_method == "convert":
        expected_names = [
            "cat_1[1]",
            "cat_1[2]",
            "cat_1[(MISSING)]",
            "cat_2[1]",
            "cat_2[2]",
            "cat_2[(MISSING)]",
        ]
    else:
        expected_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"]

    np.testing.assert_array_equal(model.feature_names_, expected_names)
    assert len(model.coef_) == len(expected_names)

    model.predict(unseen)
81 changes: 80 additions & 1 deletion tests/glm/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pandas as pd
import pytest

from glum._util import _align_df_categories
from glum._util import _add_missing_categories, _align_df_categories


@pytest.fixture()
Expand Down Expand Up @@ -96,3 +96,82 @@ def test_align_df_categories_missing_columns(df):
def test_align_df_categories_not_df():
    """Passing a NumPy array instead of a DataFrame must raise ``TypeError``."""
    not_a_frame = np.array([[0], [1]])
    dtypes = {"x0": np.float64}

    with pytest.raises(TypeError):
        _align_df_categories(not_a_frame, dtypes)


@pytest.fixture()
def df_na():
    """Two-row frame: one float column and three categoricals, one with a NA."""
    columns = {
        "num": np.array([0, 1], dtype="float64"),
        "cat": pd.Categorical(["a", "b"]),
        "cat_na": pd.Categorical(["a", pd.NA]),
        "cat2": pd.Categorical(["a", "b"]),
    }
    return pd.DataFrame(columns)


def test_add_missing_categories(df_na):
    """Check which columns receive the ``(M)`` category.

    ``cat`` and ``cat_na`` have a ``[(M)]`` entry in the feature names and
    are expected to gain the category (with the NA in ``cat_na`` filled).
    ``num`` is not categorical and ``cat2`` has no ``[(M)]`` entry, so both
    are expected to stay unchanged.
    """
    result = _add_missing_categories(
        df=df_na,
        dtypes=df_na.dtypes,
        feature_names=[
            "num",
            "num[(M)]",
            "cat[a]",
            "cat[b]",
            "cat[(M)]",
            "cat_na[a]",
            "cat_na[(M)]",
            "cat2[a]",
            "cat2[b]",
        ],
        categorical_format="{name}[{category}]",
        cat_missing_name="(M)",
    )

    expected_columns = {
        "num": np.array([0, 1], dtype="float64"),
        "cat": pd.Categorical(["a", "b"], categories=["a", "b", "(M)"]),
        "cat_na": pd.Categorical(["a", "(M)"], categories=["a", "(M)"]),
        "cat2": pd.Categorical(["a", "b"], categories=["a", "b"]),
    }
    expected = pd.DataFrame(expected_columns)

    pd.testing.assert_frame_equal(result, expected)


def test_raise_on_existing_missing(df_na):
    """A column that already holds the ``(M)`` category must raise ``ValueError``."""
    # Capture the dtypes before the frame is modified below.
    dtypes = df_na.dtypes
    call_kwargs = {
        "dtypes": dtypes,
        "feature_names": [
            "num",
            "num[(M)]",
            "cat[a]",
            "cat[b]",
            "cat[(M)]",
            "cat_na[a]",
            "cat_na[(M)]",
            "cat2[a]",
            "cat2[b]",
        ],
        "categorical_format": "{name}[{category}]",
        "cat_missing_name": "(M)",
    }

    # Pre-populate ``cat_na`` with the category the helper would try to add.
    df = df_na
    df["cat_na"] = df["cat_na"].cat.add_categories("(M)")
    df.loc[df.cat_na.isna(), "cat_na"] = "(M)"

    with pytest.raises(ValueError):
        _add_missing_categories(df=df, **call_kwargs)

0 comments on commit 91e0408

Please sign in to comment.