diff --git a/skore/src/skore/sklearn/_estimator/feature_importance_accessor.py b/skore/src/skore/sklearn/_estimator/feature_importance_accessor.py
index 0fa37ace0..c314ebae1 100644
--- a/skore/src/skore/sklearn/_estimator/feature_importance_accessor.py
+++ b/skore/src/skore/sklearn/_estimator/feature_importance_accessor.py
@@ -13,7 +13,7 @@
 from skore.externals._pandas_accessors import DirNamesMixin
 from skore.sklearn._base import _BaseAccessor
 from skore.sklearn._estimator.report import EstimatorReport
-from skore.utils._accessor import _check_has_coef
+from skore.utils._accessor import _check_has_coef, _check_has_feature_importances
 
 DataSource = Literal["test", "train", "X_y"]
 
@@ -74,27 +74,31 @@ def coefficients(self) -> pd.DataFrame:
         Feature #8    250.5...
         Feature #9     99.5...
         """
-        estimator = self._parent.estimator_
+        parent_estimator = self._parent.estimator_
 
-        if isinstance(estimator, Pipeline):
-            feature_names = estimator[:-1].get_feature_names_out()
+        if isinstance(parent_estimator, Pipeline):
+            feature_names = parent_estimator[:-1].get_feature_names_out()
         else:
-            if hasattr(estimator, "feature_names_in_"):
-                feature_names = estimator.feature_names_in_
+            if hasattr(parent_estimator, "feature_names_in_"):
+                feature_names = parent_estimator.feature_names_in_
             else:
                 feature_names = [
-                    f"Feature #{i}" for i in range(estimator.n_features_in_)
+                    f"Feature #{i}" for i in range(parent_estimator.n_features_in_)
                 ]
 
-        linear_model = estimator[-1] if isinstance(estimator, Pipeline) else estimator
-        intercept = np.atleast_2d(linear_model.intercept_)
-        coef = np.atleast_2d(linear_model.coef_)
+        estimator = (
+            parent_estimator[-1]
+            if isinstance(parent_estimator, Pipeline)
+            else parent_estimator
+        )
+        intercept = np.atleast_2d(estimator.intercept_)
+        coef = np.atleast_2d(estimator.coef_)
 
         data = np.concatenate([intercept, coef.T])
 
         if data.shape[1] == 1:
             columns = ["Coefficient"]
-        elif is_classifier(estimator):
+        elif is_classifier(parent_estimator):
             columns = [f"Class #{i}" for i in range(data.shape[1])]
         else:
             columns = [f"Target #{i}" for i in range(data.shape[1])]
@@ -107,6 +111,67 @@
 
         return df
 
+    @available_if(_check_has_feature_importances())
+    def mean_decrease_impurity(self) -> pd.DataFrame:
+        """Retrieve the mean decrease impurity (MDI) of a tree-based model.
+
+        This method is available for estimators that expose a
+        `feature_importances_` attribute. See for example the
+        `sklearn.ensemble.GradientBoostingClassifier documentation
+        <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html>`_.
+        In particular, note that the MDI is computed at fit time, i.e. using the
+        training data.
+
+        Examples
+        --------
+        >>> from sklearn.datasets import make_classification
+        >>> from sklearn.ensemble import RandomForestClassifier
+        >>> from sklearn.model_selection import train_test_split
+        >>> from skore import EstimatorReport
+        >>> X, y = make_classification(n_features=5, random_state=42)
+        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+        >>> forest = RandomForestClassifier(n_estimators=5, random_state=0)
+        >>> report = EstimatorReport(
+        ...     forest,
+        ...     X_train=X_train,
+        ...     y_train=y_train,
+        ...     X_test=X_test,
+        ...     y_test=y_test,
+        ... )
+        >>> report.feature_importance.mean_decrease_impurity()
+                    Mean decrease impurity
+        Feature #0                 0.06...
+        Feature #1                 0.19...
+        Feature #2                 0.01...
+        Feature #3                 0.69...
+        Feature #4                 0.02...
+ """ + parent_estimator = self._parent.estimator_ + estimator = ( + parent_estimator.steps[-1][1] + if isinstance(parent_estimator, Pipeline) + else parent_estimator + ) + + data = estimator.feature_importances_ + + if isinstance(parent_estimator, Pipeline): + feature_names = parent_estimator[:-1].get_feature_names_out() + else: + if hasattr(parent_estimator, "feature_names_in_"): + feature_names = parent_estimator.feature_names_in_ + else: + feature_names = [ + f"Feature #{i}" for i in range(parent_estimator.n_features_in_) + ] + + df = pd.DataFrame( + data=data, + index=feature_names, + columns=["Mean decrease impurity"], + ) + + return df + def feature_permutation( self, *, diff --git a/skore/src/skore/utils/_accessor.py b/skore/src/skore/utils/_accessor.py index e8d9f31c5..dcacb1469 100644 --- a/skore/src/skore/utils/_accessor.py +++ b/skore/src/skore/utils/_accessor.py @@ -32,7 +32,26 @@ def check(accessor: Any) -> bool: if hasattr(estimator, "coef_"): return True raise AttributeError( - f"Estimator {accessor._parent.estimator_} is not a supported estimator by " + f"Estimator {parent_estimator} is not a supported estimator by " + "the function called." + ) + + return check + + +def _check_has_feature_importances() -> Callable: + def check(accessor: Any) -> bool: + """Check if the estimator has a `feature_importances_` attribute.""" + parent_estimator = accessor._parent.estimator_ + estimator = ( + parent_estimator.steps[-1][1] + if isinstance(parent_estimator, Pipeline) + else parent_estimator + ) + if hasattr(estimator, "feature_importances_"): + return True + raise AttributeError( + f"Estimator {parent_estimator} is not a supported estimator by " "the function called." ) diff --git a/skore/tests/unit/sklearn/estimator/feature_importance/test_mean_decrease_impurity.py b/skore/tests/unit/sklearn/estimator/feature_importance/test_mean_decrease_impurity.py new file mode 100644 index 000000000..12a9ee3f9 --- /dev/null +++ b/skore/tests/unit/sklearn/estimator/feature_importance/test_mean_decrease_impurity.py @@ -0,0 +1,201 @@ +import pandas as pd +import pytest +import sklearn +from sklearn.base import is_regressor +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import StandardScaler +from skore import EstimatorReport + + +@pytest.mark.parametrize( + "data, estimator, expected_shape", + [ + ( + make_classification(n_features=5, random_state=42), + RandomForestClassifier(n_estimators=2, random_state=0), + (5, 1), + ), + ( + make_classification(n_features=5, random_state=42), + RandomForestClassifier(n_estimators=2, random_state=0), + (5, 1), + ), + ( + make_classification( + n_features=5, + n_classes=3, + n_samples=30, + n_informative=3, + random_state=42, + ), + RandomForestClassifier(n_estimators=2, random_state=0), + (5, 1), + ), + ( + make_classification( + n_features=5, + n_classes=3, + n_samples=30, + n_informative=3, + random_state=42, + ), + make_pipeline( + StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0) + ), + (5, 1), + ), + ( + make_classification(n_features=5, random_state=42), + make_pipeline( + StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0) + ), + (5, 1), + ), + ( + make_regression(n_features=5, n_targets=3, random_state=42), + RandomForestRegressor(n_estimators=2, random_state=0), + (5, 1), + ), + 
], +) +def test_numpy_arrays(data, estimator, expected_shape): + X, y = data + estimator.fit(X, y) + report = EstimatorReport(estimator) + result = report.feature_importance.mean_decrease_impurity() + + assert result.shape == expected_shape + + expected_index = ( + [f"x{i}" for i in range(X.shape[1])] + if isinstance(estimator, Pipeline) + else [f"Feature #{i}" for i in range(X.shape[1])] + ) + assert result.index.tolist() == expected_index + + expected_columns = ["Mean decrease impurity"] + assert result.columns.tolist() == expected_columns + + +@pytest.mark.parametrize( + "estimator", + [ + RandomForestClassifier(n_estimators=2, random_state=0), + make_pipeline( + StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0) + ), + ], +) +def test_pandas_dataframe(estimator): + """If provided, the `mean_decrease_impurity` dataframe uses the feature names.""" + X, y = make_classification(n_features=5, random_state=42) + X = pd.DataFrame(X, columns=[f"my_feature_{i}" for i in range(X.shape[1])]) + estimator.fit(X, y) + + report = EstimatorReport(estimator) + result = report.feature_importance.mean_decrease_impurity() + + assert result.shape == (5, 1) + assert result.index.tolist() == [ + "my_feature_0", + "my_feature_1", + "my_feature_2", + "my_feature_3", + "my_feature_4", + ] + assert result.columns.tolist() == ["Mean decrease impurity"] + + +def _make_estimator_param(estimator): + return pytest.param(estimator, id=estimator.__class__.__name__) + + +@pytest.mark.parametrize( + "estimator", + [ + _make_estimator_param(estimator) + for estimator in [ + sklearn.ensemble.AdaBoostClassifier(n_estimators=2), + sklearn.ensemble.AdaBoostRegressor(n_estimators=2), + sklearn.ensemble.ExtraTreesClassifier(n_estimators=2), + sklearn.ensemble.ExtraTreesRegressor(n_estimators=2), + sklearn.ensemble.GradientBoostingClassifier(n_estimators=2), + sklearn.ensemble.GradientBoostingRegressor(n_estimators=2), + sklearn.ensemble.RandomForestClassifier(n_estimators=2), + sklearn.ensemble.RandomForestRegressor(n_estimators=2), + sklearn.ensemble.RandomTreesEmbedding(n_estimators=2), + sklearn.tree.DecisionTreeClassifier(), + sklearn.tree.DecisionTreeRegressor(), + sklearn.tree.ExtraTreeClassifier(), + sklearn.tree.ExtraTreeRegressor(), + ] + ], +) +def test_all_sklearn_estimators( + request, estimator, regression_data, classification_data +): + """Check that `mean_decrease_impurity` is supported for every sklearn estimator.""" + if is_regressor(estimator): + X, y = regression_data + else: + X, y = classification_data + + estimator.fit(X, y) + + report = EstimatorReport(estimator) + result = report.feature_importance.mean_decrease_impurity() + + assert result.shape == (5, 1) + assert result.index.tolist() == [ + "Feature #0", + "Feature #1", + "Feature #2", + "Feature #3", + "Feature #4", + ] + assert result.columns.tolist() == ["Mean decrease impurity"] + + +def test_pipeline_with_transformer(regression_data): + """If the estimator is a pipeline containing a transformer that changes the + features, adapt the feature names in the output table.""" + from sklearn.preprocessing import PolynomialFeatures + + X, y = regression_data + X = pd.DataFrame(X, columns=[f"my_feature_{i}" for i in range(5)]) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + model = make_pipeline( + PolynomialFeatures(degree=2, interaction_only=True), + RandomForestRegressor(n_estimators=2, random_state=0), + ) + + report = EstimatorReport( + model, X_train=X_train, X_test=X_test, y_train=y_train, 
+    )
+
+    result = report.feature_importance.mean_decrease_impurity()
+    assert result.shape == (16, 1)
+    assert result.index.tolist() == [
+        "1",
+        "my_feature_0",
+        "my_feature_1",
+        "my_feature_2",
+        "my_feature_3",
+        "my_feature_4",
+        "my_feature_0 my_feature_1",
+        "my_feature_0 my_feature_2",
+        "my_feature_0 my_feature_3",
+        "my_feature_0 my_feature_4",
+        "my_feature_1 my_feature_2",
+        "my_feature_1 my_feature_3",
+        "my_feature_1 my_feature_4",
+        "my_feature_2 my_feature_3",
+        "my_feature_2 my_feature_4",
+        "my_feature_3 my_feature_4",
+    ]
+    assert result.columns.tolist() == ["Mean decrease impurity"]
diff --git a/skore/tests/unit/utils/test_accessors.py b/skore/tests/unit/utils/test_accessors.py
index c54044656..98d7c8429 100644
--- a/skore/tests/unit/utils/test_accessors.py
+++ b/skore/tests/unit/utils/test_accessors.py
@@ -1,7 +1,11 @@
 import pytest
 from sklearn.pipeline import make_pipeline
 from skore.externals._pandas_accessors import DirNamesMixin, _register_accessor
-from skore.utils._accessor import _check_has_coef, _check_supported_ml_task
+from skore.utils._accessor import (
+    _check_has_coef,
+    _check_has_feature_importances,
+    _check_supported_ml_task,
+)
 
 
 def test_register_accessor():
@@ -94,3 +98,38 @@ def __init__(self):
     err_msg = "Estimator hello is not a supported estimator by the function called."
     with pytest.raises(AttributeError, match=err_msg):
         assert _check_has_coef()(accessor)
+
+
+def test_check_has_feature_importance():
+    """
+    Test that only estimators with the `feature_importances_` attribute are accepted.
+    """
+
+    class MockParent:
+        def __init__(self, estimator):
+            self.estimator_ = estimator
+
+    class MockAccessor:
+        def __init__(self, parent):
+            self._parent = parent
+
+    class Estimator:
+        def __init__(self):
+            self.feature_importances_ = 0
+
+    parent = MockParent(Estimator())
+    accessor = MockAccessor(parent)
+
+    assert _check_has_feature_importances()(accessor)
+
+    parent = MockParent(make_pipeline(Estimator()))
+    accessor = MockAccessor(parent)
+
+    assert _check_has_feature_importances()(accessor)
+
+    parent = MockParent(estimator="hello")
+    accessor = MockAccessor(parent)
+
+    err_msg = "Estimator hello is not a supported estimator by the function called."
+    with pytest.raises(AttributeError, match=err_msg):
+        assert _check_has_feature_importances()(accessor)
diff --git a/sphinx/reference/report/estimator_report.rst b/sphinx/reference/report/estimator_report.rst
index f2919a49c..f2dd54a50 100644
--- a/sphinx/reference/report/estimator_report.rst
+++ b/sphinx/reference/report/estimator_report.rst
@@ -76,4 +76,5 @@ used to train your estimator.
 
    EstimatorReport.feature_importance.help
    EstimatorReport.feature_importance.coefficients
+   EstimatorReport.feature_importance.mean_decrease_impurity
    EstimatorReport.feature_importance.feature_permutation
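
Reviewer note, not part of the patch: a minimal end-to-end sketch of the new accessor behind a pipeline, mirroring test_pipeline_with_transformer above. It assumes a dev install of skore with this branch checked out; everything else is taken from the diff.

# Sketch only -- mirrors test_pipeline_with_transformer; assumes this branch
# of skore is installed.
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from skore import EstimatorReport

X, y = make_regression(n_features=5, random_state=42)
X = pd.DataFrame(X, columns=[f"my_feature_{i}" for i in range(5)])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# The last pipeline step exposes `feature_importances_`; the preceding steps
# determine the row labels via `get_feature_names_out()`.
model = make_pipeline(
    PolynomialFeatures(degree=2, interaction_only=True),
    RandomForestRegressor(n_estimators=2, random_state=0),
)
report = EstimatorReport(
    model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)

# Returns a DataFrame with one "Mean decrease impurity" column, indexed by the
# 16 expanded feature names (bias term, 5 raw features, 10 interactions).
print(report.feature_importance.mean_decrease_impurity())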