Skip to content

Commit

Permalink
feat(EstimatorReport): Display the mean decrease impurity (#1368)
Browse files Browse the repository at this point in the history
  • Loading branch information
auguste-probabl committed Mar 7, 2025
1 parent 36a3a62 commit b01ea2c
Show file tree
Hide file tree
Showing 5 changed files with 338 additions and 13 deletions.
87 changes: 76 additions & 11 deletions skore/src/skore/sklearn/_estimator/feature_importance_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from skore.externals._pandas_accessors import DirNamesMixin
from skore.sklearn._base import _BaseAccessor
from skore.sklearn._estimator.report import EstimatorReport
from skore.utils._accessor import _check_has_coef
from skore.utils._accessor import _check_has_coef, _check_has_feature_importances

DataSource = Literal["test", "train", "X_y"]

Expand Down Expand Up @@ -74,27 +74,31 @@ def coefficients(self) -> pd.DataFrame:
Feature #8 250.5...
Feature #9 99.5...
"""
estimator = self._parent.estimator_
parent_estimator = self._parent.estimator_

if isinstance(estimator, Pipeline):
feature_names = estimator[:-1].get_feature_names_out()
if isinstance(parent_estimator, Pipeline):
feature_names = parent_estimator[:-1].get_feature_names_out()
else:
if hasattr(estimator, "feature_names_in_"):
feature_names = estimator.feature_names_in_
if hasattr(parent_estimator, "feature_names_in_"):
feature_names = parent_estimator.feature_names_in_
else:
feature_names = [
f"Feature #{i}" for i in range(estimator.n_features_in_)
f"Feature #{i}" for i in range(parent_estimator.n_features_in_)
]

linear_model = estimator[-1] if isinstance(estimator, Pipeline) else estimator
intercept = np.atleast_2d(linear_model.intercept_)
coef = np.atleast_2d(linear_model.coef_)
estimator = (
parent_estimator[-1]
if isinstance(parent_estimator, Pipeline)
else parent_estimator
)
intercept = np.atleast_2d(estimator.intercept_)
coef = np.atleast_2d(estimator.coef_)

data = np.concatenate([intercept, coef.T])

if data.shape[1] == 1:
columns = ["Coefficient"]
elif is_classifier(estimator):
elif is_classifier(parent_estimator):
columns = [f"Class #{i}" for i in range(data.shape[1])]
else:
columns = [f"Target #{i}" for i in range(data.shape[1])]
Expand All @@ -107,6 +111,67 @@ def coefficients(self) -> pd.DataFrame:

return df

@available_if(_check_has_feature_importances())
def mean_decrease_impurity(self) -> pd.DataFrame:
    """Retrieve the mean decrease impurity (MDI) of a tree-based model.

    This method is available for estimators that expose a
    `feature_importances_` attribute. See for example the
    `sklearn.ensemble.GradientBoostingClassifier documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier.feature_importances_>`_.
    In particular, note that the MDI is computed at fit time, i.e. using the
    training data.

    Returns
    -------
    pandas.DataFrame
        A single-column frame ("Mean decrease impurity") indexed by feature
        name.

    Examples
    --------
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import train_test_split
    >>> from skore import EstimatorReport
    >>> X, y = make_classification(n_features=5, random_state=42)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    >>> forest = RandomForestClassifier(n_estimators=5, random_state=0)
    >>> report = EstimatorReport(
    ...     forest,
    ...     X_train=X_train,
    ...     y_train=y_train,
    ...     X_test=X_test,
    ...     y_test=y_test,
    ... )
    >>> report.feature_importance.mean_decrease_impurity()
               Mean decrease impurity
    Feature #0                 0.06...
    Feature #1                 0.19...
    Feature #2                 0.01...
    Feature #3                 0.69...
    Feature #4                 0.02...
    """
    parent_estimator = self._parent.estimator_
    # As in `coefficients`, the estimator of interest is the final step of a
    # pipeline, or the bare estimator otherwise. `parent_estimator[-1]` is
    # used (rather than `.steps[-1][1]`) for consistency with `coefficients`.
    estimator = (
        parent_estimator[-1]
        if isinstance(parent_estimator, Pipeline)
        else parent_estimator
    )

    data = estimator.feature_importances_

    if isinstance(parent_estimator, Pipeline):
        # Transformers may rename or expand features, so ask the pipeline
        # prefix (everything but the final estimator) for the output names.
        feature_names = parent_estimator[:-1].get_feature_names_out()
    elif hasattr(parent_estimator, "feature_names_in_"):
        feature_names = parent_estimator.feature_names_in_
    else:
        feature_names = [
            f"Feature #{i}" for i in range(parent_estimator.n_features_in_)
        ]

    return pd.DataFrame(
        data=data,
        index=feature_names,
        columns=["Mean decrease impurity"],
    )

def feature_permutation(
self,
*,
Expand Down
21 changes: 20 additions & 1 deletion skore/src/skore/utils/_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,26 @@ def check(accessor: Any) -> bool:
if hasattr(estimator, "coef_"):
return True
raise AttributeError(
f"Estimator {accessor._parent.estimator_} is not a supported estimator by "
f"Estimator {parent_estimator} is not a supported estimator by "
"the function called."
)

return check


def _check_has_feature_importances() -> Callable:
def check(accessor: Any) -> bool:
"""Check if the estimator has a `feature_importances_` attribute."""
parent_estimator = accessor._parent.estimator_
estimator = (
parent_estimator.steps[-1][1]
if isinstance(parent_estimator, Pipeline)
else parent_estimator
)
if hasattr(estimator, "feature_importances_"):
return True
raise AttributeError(
f"Estimator {parent_estimator} is not a supported estimator by "
"the function called."
)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
import pandas as pd
import pytest
import sklearn
from sklearn.base import is_regressor
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from skore import EstimatorReport


@pytest.mark.parametrize(
    "data, estimator, expected_shape",
    [
        # NOTE(review): the original list contained this binary-classification
        # case twice, byte for byte; the duplicate has been removed.
        (
            make_classification(n_features=5, random_state=42),
            RandomForestClassifier(n_estimators=2, random_state=0),
            (5, 1),
        ),
        (
            make_classification(
                n_features=5,
                n_classes=3,
                n_samples=30,
                n_informative=3,
                random_state=42,
            ),
            RandomForestClassifier(n_estimators=2, random_state=0),
            (5, 1),
        ),
        (
            make_classification(
                n_features=5,
                n_classes=3,
                n_samples=30,
                n_informative=3,
                random_state=42,
            ),
            make_pipeline(
                StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
            ),
            (5, 1),
        ),
        (
            make_classification(n_features=5, random_state=42),
            make_pipeline(
                StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
            ),
            (5, 1),
        ),
        (
            make_regression(n_features=5, n_targets=3, random_state=42),
            RandomForestRegressor(n_estimators=2, random_state=0),
            (5, 1),
        ),
    ],
)
def test_numpy_arrays(data, estimator, expected_shape):
    """MDI table has the expected shape, index and column for numpy inputs."""
    X, y = data
    estimator.fit(X, y)
    report = EstimatorReport(estimator)
    result = report.feature_importance.mean_decrease_impurity()

    assert result.shape == expected_shape

    # Pipelines synthesize "x0", "x1", ... via get_feature_names_out();
    # bare estimators fall back to "Feature #i" placeholders.
    expected_index = (
        [f"x{i}" for i in range(X.shape[1])]
        if isinstance(estimator, Pipeline)
        else [f"Feature #{i}" for i in range(X.shape[1])]
    )
    assert result.index.tolist() == expected_index

    expected_columns = ["Mean decrease impurity"]
    assert result.columns.tolist() == expected_columns


@pytest.mark.parametrize(
    "estimator",
    [
        RandomForestClassifier(n_estimators=2, random_state=0),
        make_pipeline(
            StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
        ),
    ],
)
def test_pandas_dataframe(estimator):
    """If provided, the `mean_decrease_impurity` dataframe uses the feature names."""
    X, y = make_classification(n_features=5, random_state=42)
    feature_names = [f"my_feature_{i}" for i in range(X.shape[1])]
    X = pd.DataFrame(X, columns=feature_names)
    estimator.fit(X, y)

    result = EstimatorReport(estimator).feature_importance.mean_decrease_impurity()

    # The index must echo the DataFrame column names, not synthetic labels.
    assert result.shape == (5, 1)
    assert result.index.tolist() == feature_names
    assert result.columns.tolist() == ["Mean decrease impurity"]


def _make_estimator_param(estimator):
    """Wrap *estimator* in a pytest param whose id is the estimator's class name."""
    return pytest.param(estimator, id=type(estimator).__name__)


@pytest.mark.parametrize(
    "estimator",
    [
        _make_estimator_param(estimator)
        for estimator in [
            sklearn.ensemble.AdaBoostClassifier(n_estimators=2),
            sklearn.ensemble.AdaBoostRegressor(n_estimators=2),
            sklearn.ensemble.ExtraTreesClassifier(n_estimators=2),
            sklearn.ensemble.ExtraTreesRegressor(n_estimators=2),
            sklearn.ensemble.GradientBoostingClassifier(n_estimators=2),
            sklearn.ensemble.GradientBoostingRegressor(n_estimators=2),
            sklearn.ensemble.RandomForestClassifier(n_estimators=2),
            sklearn.ensemble.RandomForestRegressor(n_estimators=2),
            sklearn.ensemble.RandomTreesEmbedding(n_estimators=2),
            sklearn.tree.DecisionTreeClassifier(),
            sklearn.tree.DecisionTreeRegressor(),
            sklearn.tree.ExtraTreeClassifier(),
            sklearn.tree.ExtraTreeRegressor(),
        ]
    ],
)
def test_all_sklearn_estimators(estimator, regression_data, classification_data):
    """Check that `mean_decrease_impurity` is supported for every sklearn estimator.

    NOTE(review): the unused `request` fixture parameter was removed from the
    signature; nothing in the body referenced it.
    """
    # Pick the fixture matching the estimator's task type.
    if is_regressor(estimator):
        X, y = regression_data
    else:
        X, y = classification_data

    estimator.fit(X, y)

    report = EstimatorReport(estimator)
    result = report.feature_importance.mean_decrease_impurity()

    assert result.shape == (5, 1)
    assert result.index.tolist() == [
        "Feature #0",
        "Feature #1",
        "Feature #2",
        "Feature #3",
        "Feature #4",
    ]
    assert result.columns.tolist() == ["Mean decrease impurity"]


def test_pipeline_with_transformer(regression_data):
    """If the estimator is a pipeline containing a transformer that changes the
    features, adapt the feature names in the output table."""
    from sklearn.preprocessing import PolynomialFeatures

    X, y = regression_data
    feature_names = [f"my_feature_{i}" for i in range(5)]
    X = pd.DataFrame(X, columns=feature_names)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    model = make_pipeline(
        PolynomialFeatures(degree=2, interaction_only=True),
        RandomForestRegressor(n_estimators=2, random_state=0),
    )
    report = EstimatorReport(
        model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
    )

    result = report.feature_importance.mean_decrease_impurity()

    # PolynomialFeatures(degree=2, interaction_only=True) emits the bias
    # column "1", the 5 original features, then the 10 pairwise interaction
    # names in lexicographic (i < j) order — 16 rows in total.
    pairwise = [
        f"{feature_names[i]} {feature_names[j]}"
        for i in range(len(feature_names))
        for j in range(i + 1, len(feature_names))
    ]
    expected_index = ["1"] + feature_names + pairwise

    assert result.shape == (16, 1)
    assert result.index.tolist() == expected_index
    assert result.columns.tolist() == ["Mean decrease impurity"]
Loading

0 comments on commit b01ea2c

Please sign in to comment.