diff --git a/skore/src/skore/sklearn/_estimator/feature_importance_accessor.py b/skore/src/skore/sklearn/_estimator/feature_importance_accessor.py
index d6cb56dc2..47c40e3da 100644
--- a/skore/src/skore/sklearn/_estimator/feature_importance_accessor.py
+++ b/skore/src/skore/sklearn/_estimator/feature_importance_accessor.py
@@ -112,11 +112,13 @@ def coefficients(self) -> pd.DataFrame:
     @available_if(_check_has_feature_importances())
     def mean_decrease_impurity(self):
-        """Retrieve the mean decrease impurity of a forest model.
+        """Retrieve the mean decrease impurity (MDI) of a tree-based model.
 
         This method is available for estimators that expose a
         `feature_importances_` attribute. See for example the
         `sklearn.ensemble.GradientBoostingClassifier documentation
         <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html>`_.
+        In particular, note that the MDI is computed at fit time, i.e. using the
+        training data.
 
         Examples
         --------
@@ -126,7 +128,7 @@ def mean_decrease_impurity(self):
         >>> from skore import EstimatorReport
         >>> X, y = make_classification(n_features=5, random_state=42)
         >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
-        >>> forest = RandomForestClassifier(random_state=0)
+        >>> forest = RandomForestClassifier(n_estimators=5, random_state=0)
         >>> report = EstimatorReport(
         ...     forest,
         ...     X_train=X_train,
diff --git a/skore/tests/unit/sklearn/estimator/feature_importance/test_mean_decrease_impurity.py b/skore/tests/unit/sklearn/estimator/feature_importance/test_mean_decrease_impurity.py
index 1eaca48a9..0e65249d5 100644
--- a/skore/tests/unit/sklearn/estimator/feature_importance/test_mean_decrease_impurity.py
+++ b/skore/tests/unit/sklearn/estimator/feature_importance/test_mean_decrease_impurity.py
@@ -10,39 +10,17 @@
 from skore import EstimatorReport
 
 
-def test(classification_data):
-    X, y = classification_data
-    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
-
-    forest = RandomForestClassifier(random_state=0)
-    report = EstimatorReport(
-        forest, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
-    )
-
-    result = report.feature_importance.mean_decrease_impurity()
-
-    assert result.shape == (5, 1)
-    assert result.index.tolist() == [
-        "Feature #0",
-        "Feature #1",
-        "Feature #2",
-        "Feature #3",
-        "Feature #4",
-    ]
-    assert result.columns.tolist() == ["Mean decrease impurity"]
-
-
 @pytest.mark.parametrize(
     "data, estimator, expected_shape",
     [
         (
             make_classification(n_features=5, random_state=42),
-            RandomForestClassifier(random_state=0),
+            RandomForestClassifier(n_estimators=2, random_state=0),
             (5, 1),
         ),
         (
             make_classification(n_features=5, random_state=42),
-            RandomForestClassifier(random_state=0),
+            RandomForestClassifier(n_estimators=2, random_state=0),
             (5, 1),
         ),
         (
@@ -53,7 +31,7 @@ def test(classification_data):
                 n_informative=3,
                 random_state=42,
             ),
-            RandomForestClassifier(random_state=0),
+            RandomForestClassifier(n_estimators=2, random_state=0),
             (5, 1),
         ),
         (
@@ -64,17 +42,21 @@ def test(classification_data):
                 n_informative=3,
                 random_state=42,
             ),
-            make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0)),
+            make_pipeline(
+                StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
+            ),
             (5, 1),
         ),
         (
             make_classification(n_features=5, random_state=42),
-            make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0)),
+            make_pipeline(
+                StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
+            ),
             (5, 1),
         ),
         (
             make_regression(n_features=5, n_targets=3, random_state=42),
-            RandomForestRegressor(random_state=0),
+            RandomForestRegressor(n_estimators=2, random_state=0),
             (5, 1),
         ),
     ],
 )
@@ -101,8 +83,10 @@ def test_numpy_arrays(data, estimator, expected_shape):
 @pytest.mark.parametrize(
     "estimator",
     [
-        RandomForestClassifier(random_state=0),
-        make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0)),
+        RandomForestClassifier(n_estimators=2, random_state=0),
+        make_pipeline(
+            StandardScaler(), RandomForestClassifier(n_estimators=2, random_state=0)
+        ),
     ],
 )
 def test_pandas_dataframe(estimator):
@@ -134,15 +118,15 @@ def _make_estimator_param(estimator):
     [
         _make_estimator_param(estimator)
         for estimator in [
-            sklearn.ensemble.AdaBoostClassifier(),
-            sklearn.ensemble.AdaBoostRegressor(),
-            sklearn.ensemble.ExtraTreesClassifier(),
-            sklearn.ensemble.ExtraTreesRegressor(),
-            sklearn.ensemble.GradientBoostingClassifier(),
-            sklearn.ensemble.GradientBoostingRegressor(),
-            sklearn.ensemble.RandomForestClassifier(),
-            sklearn.ensemble.RandomForestRegressor(),
-            sklearn.ensemble.RandomTreesEmbedding(),
+            sklearn.ensemble.AdaBoostClassifier(n_estimators=2),
+            sklearn.ensemble.AdaBoostRegressor(n_estimators=2),
+            sklearn.ensemble.ExtraTreesClassifier(n_estimators=2),
+            sklearn.ensemble.ExtraTreesRegressor(n_estimators=2),
+            sklearn.ensemble.GradientBoostingClassifier(n_estimators=2),
+            sklearn.ensemble.GradientBoostingRegressor(n_estimators=2),
+            sklearn.ensemble.RandomForestClassifier(n_estimators=2),
+            sklearn.ensemble.RandomForestRegressor(n_estimators=2),
+            sklearn.ensemble.RandomTreesEmbedding(n_estimators=2),
             sklearn.tree.DecisionTreeClassifier(),
             sklearn.tree.DecisionTreeRegressor(),
             sklearn.tree.ExtraTreeClassifier(),
@@ -188,7 +172,7 @@ def test_pipeline_with_transformer(regression_data):
 
     model = make_pipeline(
         PolynomialFeatures(degree=2, interaction_only=True),
-        RandomForestRegressor(random_state=0),
+        RandomForestRegressor(n_estimators=2, random_state=0),
     )
 
     report = EstimatorReport(
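
A minimal end-to-end sketch of the behaviour the new docstring sentence documents,
mirroring the doctest in the patch above; it assumes EstimatorReport fits the
estimator on the train split, as the added note about fit-time MDI implies:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    from skore import EstimatorReport

    X, y = make_classification(n_features=5, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    forest = RandomForestClassifier(n_estimators=5, random_state=0)
    report = EstimatorReport(
        forest, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
    )

    # MDI comes from the fitted trees, i.e. it is determined at fit time and
    # reflects the training split only; X_test/y_test play no role here.
    mdi = report.feature_importance.mean_decrease_impurity()
    print(mdi)  # one "Mean decrease impurity" value per feature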