Skip to content

Commit

Permalink
Pandas categorical support (#329)
Browse files Browse the repository at this point in the history
  • Loading branch information
jtilly authored Nov 23, 2020
1 parent 0654ad6 commit 46c8499
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 4 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,18 @@
Changelog
=========

1.1.0 - 2020-11-23
------------------

**New features:**

- Direct support for pandas categorical types in ``fit`` and ``predict``. These will be converted into a ``CategoricalMatrix``.

1.0.1 - 2020-11-12
------------------

This is a maintenance release to be compatible with ``quantcore.matrix>=1.0.0``.


1.0.0 - 2020-11-11
------------------

Expand Down
33 changes: 30 additions & 3 deletions src/quantcore/glm/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

import copy
import warnings
from itertools import chain
from typing import Any, Iterable, List, Optional, Tuple, Union

import numpy as np
Expand Down Expand Up @@ -102,6 +103,9 @@
def check_array_matrix_compliant(mat: ArrayLike, **kwargs):
to_copy = "copy" in kwargs.keys() and kwargs["copy"]

if isinstance(mat, pd.DataFrame) and any(mat.dtypes == "category"):
mat = mx.from_pandas(mat)

if isinstance(mat, mx.SplitMatrix):
kwargs.update({"ensure_min_features": 0})
new_matrices = [check_array_matrix_compliant(m, **kwargs) for m in mat.matrices]
Expand Down Expand Up @@ -1020,7 +1024,9 @@ def linear_predictor(
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Samples.
Samples. This may be a Pandas data frame with categorical dtypes.
In that case the user must ensure that the categories are exactly
the same (including the order) as during fit.
offset: {None, array-like}, shape (n_samples,), optional \
(default=None)
Expand Down Expand Up @@ -1066,7 +1072,9 @@ def predict(
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Samples.
Samples. This may be a Pandas data frame with categorical dtypes.
In that case the user must ensure that the categories are exactly
the same (including the order) as during fit.
sample_weight : {None, array-like}, shape (n_samples,), optional \
(default=None)
Expand Down Expand Up @@ -1335,6 +1343,21 @@ def set_up_and_check_fit_args(

copy_X = self._should_copy_X()

if isinstance(X, pd.DataFrame):

if any(X.dtypes == "category"):
self.feature_names_ = list(
chain.from_iterable(
[f"{column}__{category}" for category in dtype.categories]
if pd.api.types.is_categorical_dtype(dtype)
else [column]
for column, dtype in zip(X.columns, X.dtypes)
)
)
X = mx.from_pandas(X)
else:
self.feature_names_ = X.columns

if not self._is_contiguous(X):
if self.copy_X is not None and not self.copy_X:
raise ValueError(
Expand Down Expand Up @@ -1799,7 +1822,11 @@ def fit(
Training data. Note that a float32 matrix is acceptable and will
result in the entire algorithm being run in 32-bit precision.
However, for problems that are poorly conditioned, this might result
in poor convergence or flawed parameter estimates.
in poor convergence or flawed parameter estimates. If a Pandas data
frame is provided, it may contain categorical columns. In that case,
a separate coefficient will be estimated for each category. No category
is omitted. This means that some regularization is required to fit models
with an intercept or models with several categorical columns.
y : array-like, shape (n_samples,)
Target values.
Expand Down
63 changes: 63 additions & 0 deletions tests/glm/test_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1655,3 +1655,66 @@ def test_passing_noncontiguous_as_X():
)
np.testing.assert_almost_equal(baseline.coef_, np_view.coef_)
np.testing.assert_almost_equal(baseline.coef_, pd_view.coef_)


@pytest.mark.parametrize(
    "X, feature_names",
    [
        (pd.DataFrame({"x1": np.arange(5), "x2": 2}), np.array(["x1", "x2"])),
        (pd.DataFrame({"x1": np.arange(5), "x2": 2}).to_numpy(), None),
        (
            pd.DataFrame({"x1": pd.Categorical(np.arange(5)), "x2": 2}),
            np.array(["x1__0", "x1__1", "x1__2", "x1__3", "x1__4", "x2"]),
        ),
        (
            pd.DataFrame(
                {
                    "x1": pd.Categorical(np.arange(5)),
                    "x2": pd.Categorical([2, 2, 2, 2, 2]),
                }
            ),
            np.array(["x1__0", "x1__1", "x1__2", "x1__3", "x1__4", "x2__2"]),
        ),
    ],
)
def test_feature_names(X, feature_names):
    """Fitting stores expanded feature names for pandas input, none for ndarray."""
    fitted = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5))
    # getattr with a default: the attribute is absent for non-DataFrame input.
    actual = getattr(fitted, "feature_names_", None)
    np.testing.assert_array_equal(actual, feature_names)


@pytest.mark.parametrize(
    "k, n",
    [(5, 5), (10, 5), (100, 5), (500, 50), (500, 100), (500, 500)],
)
def test_categorical_types(k, n):
    """A categorical column and its one-hot encoding must give identical fits."""
    np.random.seed(12345)
    levels = np.arange(k)
    draws = np.random.choice(levels, size=n)
    y = draws / k + np.random.uniform(size=n)

    # Fit directly on the pandas categorical column.
    df_cat = pd.DataFrame({"group": pd.Categorical(draws, categories=levels)})
    glm_cat = GeneralizedLinearRegressor(family="poisson").fit(df_cat, y)
    pred_cat = glm_cat.predict(df_cat)

    # Fit on the equivalent one-hot-encoded design matrix.
    df_oh = pd.get_dummies(df_cat)
    glm_oh = GeneralizedLinearRegressor(family="poisson").fit(df_oh, y)
    pred_oh = glm_oh.predict(df_oh)

    # Both parameterizations must agree on predictions and parameters.
    np.testing.assert_allclose(pred_cat, pred_oh)
    np.testing.assert_allclose(glm_cat.intercept_, glm_oh.intercept_)
    np.testing.assert_allclose(glm_cat.coef_, glm_oh.coef_)

    # Cross-check: each model must accept the other data representation too.
    np.testing.assert_allclose(glm_cat.predict(df_oh), glm_oh.predict(df_cat))

0 comments on commit 46c8499

Please sign in to comment.