Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(EstimatorReport): Display the mean decrease impurity #1368

87 changes: 76 additions & 11 deletions skore/src/skore/sklearn/_estimator/feature_importance_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from skore.externals._pandas_accessors import DirNamesMixin
from skore.sklearn._base import _BaseAccessor
from skore.sklearn._estimator.report import EstimatorReport
from skore.utils._accessor import _check_has_coef
from skore.utils._accessor import _check_has_coef, _check_has_feature_importances

DataSource = Literal["test", "train", "X_y"]

Expand Down Expand Up @@ -74,27 +74,31 @@ def coefficients(self) -> pd.DataFrame:
Feature #8 250.5...
Feature #9 99.5...
"""
estimator = self._parent.estimator_
parent_estimator = self._parent.estimator_

if isinstance(estimator, Pipeline):
feature_names = estimator[:-1].get_feature_names_out()
if isinstance(parent_estimator, Pipeline):
feature_names = parent_estimator[:-1].get_feature_names_out()
else:
if hasattr(estimator, "feature_names_in_"):
feature_names = estimator.feature_names_in_
if hasattr(parent_estimator, "feature_names_in_"):
feature_names = parent_estimator.feature_names_in_
else:
feature_names = [
f"Feature #{i}" for i in range(estimator.n_features_in_)
f"Feature #{i}" for i in range(parent_estimator.n_features_in_)
]

linear_model = estimator[-1] if isinstance(estimator, Pipeline) else estimator
intercept = np.atleast_2d(linear_model.intercept_)
coef = np.atleast_2d(linear_model.coef_)
estimator = (
parent_estimator[-1]
if isinstance(parent_estimator, Pipeline)
else parent_estimator
)
intercept = np.atleast_2d(estimator.intercept_)
coef = np.atleast_2d(estimator.coef_)

data = np.concatenate([intercept, coef.T])

if data.shape[1] == 1:
columns = ["Coefficient"]
elif is_classifier(estimator):
elif is_classifier(parent_estimator):
columns = [f"Class #{i}" for i in range(data.shape[1])]
else:
columns = [f"Target #{i}" for i in range(data.shape[1])]
Expand All @@ -107,6 +111,67 @@ def coefficients(self) -> pd.DataFrame:

return df

@available_if(_check_has_feature_importances())
def mean_decrease_impurity(self) -> pd.DataFrame:
    """Retrieve the mean decrease impurity (MDI) of a tree-based model.

    This method is available for estimators that expose a `feature_importances_`
    attribute. See for example the
    `sklearn.ensemble.GradientBoostingClassifier documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier.feature_importances_>`_.
    In particular, note that the MDI is computed at fit time, i.e. using the
    training data.

    Returns
    -------
    pandas.DataFrame
        A single-column ("Mean decrease impurity") dataframe indexed by
        feature name.

    Examples
    --------
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import train_test_split
    >>> from skore import EstimatorReport
    >>> X, y = make_classification(n_features=5, random_state=42)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    >>> forest = RandomForestClassifier(n_estimators=5, random_state=0)
    >>> report = EstimatorReport(
    ...     forest,
    ...     X_train=X_train,
    ...     y_train=y_train,
    ...     X_test=X_test,
    ...     y_test=y_test,
    ... )
    >>> report.feature_importance.mean_decrease_impurity()
                Mean decrease impurity
    Feature #0  0.06...
    Feature #1  0.19...
    Feature #2  0.01...
    Feature #3  0.69...
    Feature #4  0.02...
    """
    parent_estimator = self._parent.estimator_
    # For a pipeline, the MDI lives on the final (predictor) step; integer
    # indexing a Pipeline returns the step's estimator, matching the access
    # pattern used in `coefficients`.
    estimator = (
        parent_estimator[-1]
        if isinstance(parent_estimator, Pipeline)
        else parent_estimator
    )

    data = estimator.feature_importances_

    if isinstance(parent_estimator, Pipeline):
        # Use the feature names produced by the preprocessing steps, since
        # transformers may rename or expand the original input features.
        feature_names = parent_estimator[:-1].get_feature_names_out()
    elif hasattr(parent_estimator, "feature_names_in_"):
        feature_names = parent_estimator.feature_names_in_
    else:
        # Fall back to positional names when the estimator was fitted on a
        # plain array without feature names.
        feature_names = [
            f"Feature #{i}" for i in range(parent_estimator.n_features_in_)
        ]

    df = pd.DataFrame(
        data=data,
        index=feature_names,
        columns=["Mean decrease impurity"],
    )

    return df

def feature_permutation(
self,
*,
Expand Down
21 changes: 20 additions & 1 deletion skore/src/skore/utils/_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,26 @@ def check(accessor: Any) -> bool:
if hasattr(estimator, "coef_"):
return True
raise AttributeError(
f"Estimator {accessor._parent.estimator_} is not a supported estimator by "
f"Estimator {parent_estimator} is not a supported estimator by "
"the function called."
)

return check


def _check_has_feature_importances() -> Callable:
def check(accessor: Any) -> bool:
"""Check if the estimator has a `feature_importances_` attribute."""
parent_estimator = accessor._parent.estimator_
estimator = (
parent_estimator.steps[-1][1]
if isinstance(parent_estimator, Pipeline)
else parent_estimator
)
if hasattr(estimator, "feature_importances_"):
return True
raise AttributeError(
f"Estimator {parent_estimator} is not a supported estimator by "
"the function called."
)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
import pandas as pd
import pytest
import sklearn
from sklearn.base import is_regressor
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

from skore import EstimatorReport


@pytest.mark.parametrize(
    "data, estimator, expected_shape",
    [
        (
            make_classification(n_features=5, random_state=42),
            RandomForestClassifier(n_estimators=2, random_state=0),
            (5, 1),
        ),
        (
            make_classification(
                n_features=5,
                n_classes=3,
                n_samples=30,
                n_informative=3,
                random_state=42,
            ),
            RandomForestClassifier(n_estimators=2, random_state=0),
            (5, 1),
        ),
        (
            make_classification(
                n_features=5,
                n_classes=3,
                n_samples=30,
                n_informative=3,
                random_state=42,
            ),
            make_pipeline(
                StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
            ),
            (5, 1),
        ),
        (
            make_classification(n_features=5, random_state=42),
            make_pipeline(
                StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
            ),
            (5, 1),
        ),
        (
            make_regression(n_features=5, n_targets=3, random_state=42),
            RandomForestRegressor(n_estimators=2, random_state=0),
            (5, 1),
        ),
    ],
)
def test_numpy_arrays(data, estimator, expected_shape):
    """Check `mean_decrease_impurity` on numpy-array inputs.

    Covers binary/multiclass classification, multi-target regression, and
    bare estimators as well as pipelines.
    """
    X, y = data
    estimator.fit(X, y)
    report = EstimatorReport(estimator)
    result = report.feature_importance.mean_decrease_impurity()

    assert result.shape == expected_shape

    # Pipelines expose generated names ("x0", "x1", ...) through
    # `get_feature_names_out`; bare estimators fall back to "Feature #i".
    expected_index = (
        [f"x{i}" for i in range(X.shape[1])]
        if isinstance(estimator, Pipeline)
        else [f"Feature #{i}" for i in range(X.shape[1])]
    )
    assert result.index.tolist() == expected_index

    expected_columns = ["Mean decrease impurity"]
    assert result.columns.tolist() == expected_columns


@pytest.mark.parametrize(
    "estimator",
    [
        RandomForestClassifier(n_estimators=2, random_state=0),
        make_pipeline(
            StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
        ),
    ],
)
def test_pandas_dataframe(estimator):
    """If provided, the `mean_decrease_impurity` dataframe uses the feature names."""
    X, y = make_classification(n_features=5, random_state=42)
    feature_names = [f"my_feature_{i}" for i in range(X.shape[1])]
    X = pd.DataFrame(X, columns=feature_names)
    estimator.fit(X, y)

    report = EstimatorReport(estimator)
    result = report.feature_importance.mean_decrease_impurity()

    # The dataframe index must echo the column names the model was fit with.
    assert result.shape == (5, 1)
    assert result.index.tolist() == feature_names
    assert result.columns.tolist() == ["Mean decrease impurity"]


def _make_estimator_param(estimator):
    """Wrap *estimator* in a pytest param whose id is its class name."""
    class_name = type(estimator).__name__
    return pytest.param(estimator, id=class_name)


@pytest.mark.parametrize(
    "estimator",
    [
        _make_estimator_param(estimator)
        for estimator in [
            sklearn.ensemble.AdaBoostClassifier(n_estimators=2),
            sklearn.ensemble.AdaBoostRegressor(n_estimators=2),
            sklearn.ensemble.ExtraTreesClassifier(n_estimators=2),
            sklearn.ensemble.ExtraTreesRegressor(n_estimators=2),
            sklearn.ensemble.GradientBoostingClassifier(n_estimators=2),
            sklearn.ensemble.GradientBoostingRegressor(n_estimators=2),
            sklearn.ensemble.RandomForestClassifier(n_estimators=2),
            sklearn.ensemble.RandomForestRegressor(n_estimators=2),
            sklearn.ensemble.RandomTreesEmbedding(n_estimators=2),
            sklearn.tree.DecisionTreeClassifier(),
            sklearn.tree.DecisionTreeRegressor(),
            sklearn.tree.ExtraTreeClassifier(),
            sklearn.tree.ExtraTreeRegressor(),
        ]
    ],
)
def test_all_sklearn_estimators(estimator, regression_data, classification_data):
    """Check that `mean_decrease_impurity` is supported for every sklearn estimator."""
    # Non-regressors (classifiers, and transformers such as
    # RandomTreesEmbedding) get the classification dataset.
    if is_regressor(estimator):
        X, y = regression_data
    else:
        X, y = classification_data

    estimator.fit(X, y)

    report = EstimatorReport(estimator)
    result = report.feature_importance.mean_decrease_impurity()

    # Fitting on plain arrays yields the positional fallback names.
    assert result.shape == (5, 1)
    assert result.index.tolist() == [f"Feature #{i}" for i in range(5)]
    assert result.columns.tolist() == ["Mean decrease impurity"]


def test_pipeline_with_transformer(regression_data):
    """If the estimator is a pipeline containing a transformer that changes the
    features, adapt the feature names in the output table."""
    X, y = regression_data
    X = pd.DataFrame(X, columns=[f"my_feature_{i}" for i in range(5)])

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # degree=2 with interaction_only=True expands 5 inputs into 16 features:
    # the bias column "1", the 5 originals, and the 10 pairwise interactions.
    model = make_pipeline(
        PolynomialFeatures(degree=2, interaction_only=True),
        RandomForestRegressor(n_estimators=2, random_state=0),
    )

    report = EstimatorReport(
        model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
    )

    result = report.feature_importance.mean_decrease_impurity()
    assert result.shape == (16, 1)
    assert result.index.tolist() == [
        "1",
        "my_feature_0",
        "my_feature_1",
        "my_feature_2",
        "my_feature_3",
        "my_feature_4",
        "my_feature_0 my_feature_1",
        "my_feature_0 my_feature_2",
        "my_feature_0 my_feature_3",
        "my_feature_0 my_feature_4",
        "my_feature_1 my_feature_2",
        "my_feature_1 my_feature_3",
        "my_feature_1 my_feature_4",
        "my_feature_2 my_feature_3",
        "my_feature_2 my_feature_4",
        "my_feature_3 my_feature_4",
    ]
    assert result.columns.tolist() == ["Mean decrease impurity"]
Loading