Skip to content

Commit

Permalink
Pandas categorical support (#329)
Browse files Browse the repository at this point in the history
  • Loading branch information
jtilly authored Nov 23, 2020
1 parent 0654ad6 commit 46c8499
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 4 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,18 @@
Changelog
=========

1.1.0 - 2020-11-23
------------------

**New features:**

- Direct support for pandas categorical types in ``fit`` and ``predict``. These will be converted into a ``CategoricalMatrix``.

1.0.1 - 2020-11-12
------------------

This is a maintenance release to be compatible with ``quantcore.matrix>=1.0.0``.


1.0.0 - 2020-11-11
------------------

Expand Down
33 changes: 30 additions & 3 deletions src/quantcore/glm/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

import copy
import warnings
from itertools import chain
from typing import Any, Iterable, List, Optional, Tuple, Union

import numpy as np
Expand Down Expand Up @@ -102,6 +103,9 @@
def check_array_matrix_compliant(mat: ArrayLike, **kwargs):
to_copy = "copy" in kwargs.keys() and kwargs["copy"]

if isinstance(mat, pd.DataFrame) and any(mat.dtypes == "category"):
mat = mx.from_pandas(mat)

if isinstance(mat, mx.SplitMatrix):
kwargs.update({"ensure_min_features": 0})
new_matrices = [check_array_matrix_compliant(m, **kwargs) for m in mat.matrices]
Expand Down Expand Up @@ -1020,7 +1024,9 @@ def linear_predictor(
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Samples.
Samples. This may be a Pandas data frame with categorical dtypes.
In that case the user must ensure that the categories are exactly
the same (including the order) as during fit.
offset: {None, array-like}, shape (n_samples,), optional \
(default=None)
Expand Down Expand Up @@ -1066,7 +1072,9 @@ def predict(
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Samples.
Samples. This may be a Pandas data frame with categorical dtypes.
In that case the user must ensure that the categories are exactly
the same (including the order) as during fit.
sample_weight : {None, array-like}, shape (n_samples,), optional \
(default=None)
Expand Down Expand Up @@ -1335,6 +1343,21 @@ def set_up_and_check_fit_args(

copy_X = self._should_copy_X()

if isinstance(X, pd.DataFrame):

if any(X.dtypes == "category"):
self.feature_names_ = list(
chain.from_iterable(
[f"{column}__{category}" for category in dtype.categories]
if pd.api.types.is_categorical_dtype(dtype)
else [column]
for column, dtype in zip(X.columns, X.dtypes)
)
)
X = mx.from_pandas(X)
else:
self.feature_names_ = X.columns

if not self._is_contiguous(X):
if self.copy_X is not None and not self.copy_X:
raise ValueError(
Expand Down Expand Up @@ -1799,7 +1822,11 @@ def fit(
Training data. Note that a float32 matrix is acceptable and will
result in the entire algorithm being run in 32-bit precision.
However, for problems that are poorly conditioned, this might result
in poor convergence or flawed parameter estimates.
in poor convergence or flawed parameter estimates. If a Pandas data
frame is provided, it may contain categorical columns. In that case,
a separate coefficient will be estimated for each category. No category
is omitted. This means that some regularization is required to fit models
with an intercept or models with several categorical columns.
y : array-like, shape (n_samples,)
Target values.
Expand Down
63 changes: 63 additions & 0 deletions tests/glm/test_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1655,3 +1655,66 @@ def test_passing_noncontiguous_as_X():
)
np.testing.assert_almost_equal(baseline.coef_, np_view.coef_)
np.testing.assert_almost_equal(baseline.coef_, pd_view.coef_)


@pytest.mark.parametrize(
    "X, feature_names",
    [
        (pd.DataFrame({"x1": np.arange(5), "x2": 2}), np.array(["x1", "x2"])),
        (pd.DataFrame({"x1": np.arange(5), "x2": 2}).to_numpy(), None),
        (
            pd.DataFrame({"x1": pd.Categorical(np.arange(5)), "x2": 2}),
            np.array(["x1__0", "x1__1", "x1__2", "x1__3", "x1__4", "x2"]),
        ),
        (
            pd.DataFrame(
                {
                    "x1": pd.Categorical(np.arange(5)),
                    "x2": pd.Categorical([2, 2, 2, 2, 2]),
                }
            ),
            np.array(["x1__0", "x1__1", "x1__2", "x1__3", "x1__4", "x2__2"]),
        ),
    ],
)
def test_feature_names(X, feature_names):
    """Fitting stores expanded feature names for pandas input, none for ndarray."""
    fitted = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5))
    # getattr with a default: the attribute is absent for non-DataFrame input.
    actual = getattr(fitted, "feature_names_", None)
    np.testing.assert_array_equal(actual, feature_names)


@pytest.mark.parametrize(
    "k, n",
    [(5, 5), (10, 5), (100, 5), (500, 50), (500, 100), (500, 500)],
)
def test_categorical_types(k, n):
    """A categorical column and its one-hot encoding must give identical fits."""
    np.random.seed(12345)
    levels = np.arange(k)
    draws = np.random.choice(levels, size=n)
    y = draws / k + np.random.uniform(size=n)

    # Fit directly on the pandas categorical column.
    df_cat = pd.DataFrame({"group": pd.Categorical(draws, categories=levels)})
    glm_cat = GeneralizedLinearRegressor(family="poisson").fit(df_cat, y)
    pred_cat = glm_cat.predict(df_cat)

    # Fit on the equivalent one-hot-encoded design matrix.
    df_oh = pd.get_dummies(df_cat)
    glm_oh = GeneralizedLinearRegressor(family="poisson").fit(df_oh, y)
    pred_oh = glm_oh.predict(df_oh)

    # Both parameterizations must agree on predictions and parameters.
    np.testing.assert_allclose(pred_cat, pred_oh)
    np.testing.assert_allclose(glm_cat.intercept_, glm_oh.intercept_)
    np.testing.assert_allclose(glm_cat.coef_, glm_oh.coef_)

    # Cross-check: each model must accept the other data representation too.
    np.testing.assert_allclose(glm_cat.predict(df_oh), glm_oh.predict(df_cat))

0 comments on commit 46c8499

Please sign in to comment.