Skip to content

Commit

Permalink
feat(EstimatorReport): Display the mean decrease impurity (#1368)
Browse files Browse the repository at this point in the history
  • Loading branch information
auguste-probabl committed Mar 7, 2025
1 parent 36a3a62 commit b01ea2c
Show file tree
Hide file tree
Showing 5 changed files with 338 additions and 13 deletions.
87 changes: 76 additions & 11 deletions skore/src/skore/sklearn/_estimator/feature_importance_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from skore.externals._pandas_accessors import DirNamesMixin
from skore.sklearn._base import _BaseAccessor
from skore.sklearn._estimator.report import EstimatorReport
from skore.utils._accessor import _check_has_coef
from skore.utils._accessor import _check_has_coef, _check_has_feature_importances

DataSource = Literal["test", "train", "X_y"]

Expand Down Expand Up @@ -74,27 +74,31 @@ def coefficients(self) -> pd.DataFrame:
Feature #8 250.5...
Feature #9 99.5...
"""
estimator = self._parent.estimator_
parent_estimator = self._parent.estimator_

if isinstance(estimator, Pipeline):
feature_names = estimator[:-1].get_feature_names_out()
if isinstance(parent_estimator, Pipeline):
feature_names = parent_estimator[:-1].get_feature_names_out()
else:
if hasattr(estimator, "feature_names_in_"):
feature_names = estimator.feature_names_in_
if hasattr(parent_estimator, "feature_names_in_"):
feature_names = parent_estimator.feature_names_in_
else:
feature_names = [
f"Feature #{i}" for i in range(estimator.n_features_in_)
f"Feature #{i}" for i in range(parent_estimator.n_features_in_)
]

linear_model = estimator[-1] if isinstance(estimator, Pipeline) else estimator
intercept = np.atleast_2d(linear_model.intercept_)
coef = np.atleast_2d(linear_model.coef_)
estimator = (
parent_estimator[-1]
if isinstance(parent_estimator, Pipeline)
else parent_estimator
)
intercept = np.atleast_2d(estimator.intercept_)
coef = np.atleast_2d(estimator.coef_)

data = np.concatenate([intercept, coef.T])

if data.shape[1] == 1:
columns = ["Coefficient"]
elif is_classifier(estimator):
elif is_classifier(parent_estimator):
columns = [f"Class #{i}" for i in range(data.shape[1])]
else:
columns = [f"Target #{i}" for i in range(data.shape[1])]
Expand All @@ -107,6 +111,67 @@ def coefficients(self) -> pd.DataFrame:

return df

@available_if(_check_has_feature_importances())
def mean_decrease_impurity(self) -> pd.DataFrame:
    """Retrieve the mean decrease impurity (MDI) of a tree-based model.

    This method is available for estimators that expose a
    `feature_importances_` attribute. See for example the
    `sklearn.ensemble.GradientBoostingClassifier documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier.feature_importances_>`_.
    In particular, note that the MDI is computed at fit time, i.e. using the
    training data.

    Returns
    -------
    pandas.DataFrame
        A single-column frame ("Mean decrease impurity") indexed by feature
        name.

    Examples
    --------
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import train_test_split
    >>> from skore import EstimatorReport
    >>> X, y = make_classification(n_features=5, random_state=42)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    >>> forest = RandomForestClassifier(n_estimators=5, random_state=0)
    >>> report = EstimatorReport(
    ...     forest,
    ...     X_train=X_train,
    ...     y_train=y_train,
    ...     X_test=X_test,
    ...     y_test=y_test,
    ... )
    >>> report.feature_importance.mean_decrease_impurity()
               Mean decrease impurity
    Feature #0                 0.06...
    Feature #1                 0.19...
    Feature #2                 0.01...
    Feature #3                 0.69...
    Feature #4                 0.02...
    """
    parent_estimator = self._parent.estimator_
    # As in `coefficients`, the estimator of interest is the final step of a
    # pipeline, or the bare estimator otherwise. `parent_estimator[-1]` is
    # used (rather than `.steps[-1][1]`) for consistency with `coefficients`.
    estimator = (
        parent_estimator[-1]
        if isinstance(parent_estimator, Pipeline)
        else parent_estimator
    )

    data = estimator.feature_importances_

    if isinstance(parent_estimator, Pipeline):
        # Transformers may rename or expand features, so ask the pipeline
        # prefix (everything but the final estimator) for the output names.
        feature_names = parent_estimator[:-1].get_feature_names_out()
    elif hasattr(parent_estimator, "feature_names_in_"):
        feature_names = parent_estimator.feature_names_in_
    else:
        feature_names = [
            f"Feature #{i}" for i in range(parent_estimator.n_features_in_)
        ]

    return pd.DataFrame(
        data=data,
        index=feature_names,
        columns=["Mean decrease impurity"],
    )

def feature_permutation(
self,
*,
Expand Down
21 changes: 20 additions & 1 deletion skore/src/skore/utils/_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,26 @@ def check(accessor: Any) -> bool:
if hasattr(estimator, "coef_"):
return True
raise AttributeError(
f"Estimator {accessor._parent.estimator_} is not a supported estimator by "
f"Estimator {parent_estimator} is not a supported estimator by "
"the function called."
)

return check


def _check_has_feature_importances() -> Callable:
def check(accessor: Any) -> bool:
"""Check if the estimator has a `feature_importances_` attribute."""
parent_estimator = accessor._parent.estimator_
estimator = (
parent_estimator.steps[-1][1]
if isinstance(parent_estimator, Pipeline)
else parent_estimator
)
if hasattr(estimator, "feature_importances_"):
return True
raise AttributeError(
f"Estimator {parent_estimator} is not a supported estimator by "
"the function called."
)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
import pandas as pd
import pytest
import sklearn
from sklearn.base import is_regressor
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from skore import EstimatorReport


@pytest.mark.parametrize(
    "data, estimator, expected_shape",
    [
        # NOTE(review): the original list contained this binary-classification
        # case twice, byte for byte; the duplicate has been removed.
        (
            make_classification(n_features=5, random_state=42),
            RandomForestClassifier(n_estimators=2, random_state=0),
            (5, 1),
        ),
        (
            make_classification(
                n_features=5,
                n_classes=3,
                n_samples=30,
                n_informative=3,
                random_state=42,
            ),
            RandomForestClassifier(n_estimators=2, random_state=0),
            (5, 1),
        ),
        (
            make_classification(
                n_features=5,
                n_classes=3,
                n_samples=30,
                n_informative=3,
                random_state=42,
            ),
            make_pipeline(
                StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
            ),
            (5, 1),
        ),
        (
            make_classification(n_features=5, random_state=42),
            make_pipeline(
                StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
            ),
            (5, 1),
        ),
        (
            make_regression(n_features=5, n_targets=3, random_state=42),
            RandomForestRegressor(n_estimators=2, random_state=0),
            (5, 1),
        ),
    ],
)
def test_numpy_arrays(data, estimator, expected_shape):
    """MDI table has the expected shape, index and column for numpy inputs."""
    X, y = data
    estimator.fit(X, y)
    report = EstimatorReport(estimator)
    result = report.feature_importance.mean_decrease_impurity()

    assert result.shape == expected_shape

    # Pipelines synthesize "x0", "x1", ... via get_feature_names_out();
    # bare estimators fall back to "Feature #i" placeholders.
    expected_index = (
        [f"x{i}" for i in range(X.shape[1])]
        if isinstance(estimator, Pipeline)
        else [f"Feature #{i}" for i in range(X.shape[1])]
    )
    assert result.index.tolist() == expected_index

    expected_columns = ["Mean decrease impurity"]
    assert result.columns.tolist() == expected_columns


@pytest.mark.parametrize(
    "estimator",
    [
        RandomForestClassifier(n_estimators=2, random_state=0),
        make_pipeline(
            StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
        ),
    ],
)
def test_pandas_dataframe(estimator):
    """If provided, the `mean_decrease_impurity` dataframe uses the feature names."""
    X, y = make_classification(n_features=5, random_state=42)
    feature_names = [f"my_feature_{i}" for i in range(X.shape[1])]
    X = pd.DataFrame(X, columns=feature_names)
    estimator.fit(X, y)

    result = EstimatorReport(estimator).feature_importance.mean_decrease_impurity()

    # The index must echo the DataFrame column names, not synthetic labels.
    assert result.shape == (5, 1)
    assert result.index.tolist() == feature_names
    assert result.columns.tolist() == ["Mean decrease impurity"]


def _make_estimator_param(estimator):
    """Wrap *estimator* in a pytest param whose id is the estimator's class name."""
    return pytest.param(estimator, id=type(estimator).__name__)


@pytest.mark.parametrize(
    "estimator",
    [
        _make_estimator_param(estimator)
        for estimator in [
            sklearn.ensemble.AdaBoostClassifier(n_estimators=2),
            sklearn.ensemble.AdaBoostRegressor(n_estimators=2),
            sklearn.ensemble.ExtraTreesClassifier(n_estimators=2),
            sklearn.ensemble.ExtraTreesRegressor(n_estimators=2),
            sklearn.ensemble.GradientBoostingClassifier(n_estimators=2),
            sklearn.ensemble.GradientBoostingRegressor(n_estimators=2),
            sklearn.ensemble.RandomForestClassifier(n_estimators=2),
            sklearn.ensemble.RandomForestRegressor(n_estimators=2),
            sklearn.ensemble.RandomTreesEmbedding(n_estimators=2),
            sklearn.tree.DecisionTreeClassifier(),
            sklearn.tree.DecisionTreeRegressor(),
            sklearn.tree.ExtraTreeClassifier(),
            sklearn.tree.ExtraTreeRegressor(),
        ]
    ],
)
def test_all_sklearn_estimators(estimator, regression_data, classification_data):
    """Check that `mean_decrease_impurity` is supported for every sklearn estimator.

    NOTE(review): the unused `request` fixture parameter was removed from the
    signature; nothing in the body referenced it.
    """
    # Pick the fixture matching the estimator's task type.
    if is_regressor(estimator):
        X, y = regression_data
    else:
        X, y = classification_data

    estimator.fit(X, y)

    report = EstimatorReport(estimator)
    result = report.feature_importance.mean_decrease_impurity()

    assert result.shape == (5, 1)
    assert result.index.tolist() == [
        "Feature #0",
        "Feature #1",
        "Feature #2",
        "Feature #3",
        "Feature #4",
    ]
    assert result.columns.tolist() == ["Mean decrease impurity"]


def test_pipeline_with_transformer(regression_data):
    """If the estimator is a pipeline containing a transformer that changes the
    features, adapt the feature names in the output table."""
    from sklearn.preprocessing import PolynomialFeatures

    X, y = regression_data
    feature_names = [f"my_feature_{i}" for i in range(5)]
    X = pd.DataFrame(X, columns=feature_names)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    model = make_pipeline(
        PolynomialFeatures(degree=2, interaction_only=True),
        RandomForestRegressor(n_estimators=2, random_state=0),
    )
    report = EstimatorReport(
        model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
    )

    result = report.feature_importance.mean_decrease_impurity()

    # PolynomialFeatures(degree=2, interaction_only=True) emits the bias
    # column "1", the 5 original features, then the 10 pairwise interaction
    # names in lexicographic (i < j) order — 16 rows in total.
    pairwise = [
        f"{feature_names[i]} {feature_names[j]}"
        for i in range(len(feature_names))
        for j in range(i + 1, len(feature_names))
    ]
    expected_index = ["1"] + feature_names + pairwise

    assert result.shape == (16, 1)
    assert result.index.tolist() == expected_index
    assert result.columns.tolist() == ["Mean decrease impurity"]
Loading

0 comments on commit b01ea2c

Please sign in to comment.