From a4f9014ea9795424039ac66ae0b163a42683103b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Feb 2025 23:34:22 +0100 Subject: [PATCH 1/9] feat(api): Allow to flatten index in reports --- .../_cross_validation/metrics_accessor.py | 21 +++- .../sklearn/_estimator/metrics_accessor.py | 21 +++- skore/src/skore/utils/_index.py | 66 +++++++++++ skore/tests/unit/utils/test_index.py | 108 ++++++++++++++++++ 4 files changed, 206 insertions(+), 10 deletions(-) create mode 100644 skore/src/skore/utils/_index.py create mode 100644 skore/tests/unit/utils/test_index.py diff --git a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py index 99219eca8..4ad1c096f 100644 --- a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py +++ b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py @@ -11,6 +11,7 @@ RocCurveDisplay, ) from skore.utils._accessor import _check_supported_ml_task +from skore.utils._index import flatten_multiindex from skore.utils._progress_bar import progress_decorator ############################################################################### @@ -48,9 +49,10 @@ def report_metrics( data_source="test", scoring=None, scoring_names=None, - pos_label=None, scoring_kwargs=None, + pos_label=None, aggregate=None, + flat_index=False, ): """Report a set of metrics for our estimator. @@ -75,15 +77,18 @@ def report_metrics( Used to overwrite the default scoring names in the report. It should be of the same length as the `scoring` parameter. - pos_label : int, float, bool or str, default=None - The positive class. - scoring_kwargs : dict, default=None The keyword arguments to pass to the scoring functions. + pos_label : int, float, bool or str, default=None + The positive class. + aggregate : {"mean", "std"} or list of such str, default=None Function to aggregate the scores across the cross-validation splits. + flat_index : bool, default=False + Whether to flatten the multiindex columns. + Returns ------- pd.DataFrame @@ -104,7 +109,7 @@ def report_metrics( LogisticRegression mean 0.94... 0.96... std 0.02... 0.02... """ - return self._compute_metric_scores( + results = self._compute_metric_scores( report_metric_name="report_metrics", data_source=data_source, aggregate=aggregate, @@ -113,6 +118,12 @@ def report_metrics( scoring_kwargs=scoring_kwargs, scoring_names=scoring_names, ) + if flat_index: + if isinstance(results.columns, pd.MultiIndex): + results.columns = flatten_multiindex(results.columns) + if isinstance(results.index, pd.MultiIndex): + results.index = flatten_multiindex(results.index) + return results @progress_decorator(description="Compute metric for each split") def _compute_metric_scores( diff --git a/skore/src/skore/sklearn/_estimator/metrics_accessor.py b/skore/src/skore/sklearn/_estimator/metrics_accessor.py index b2e5f1f6e..bd2049add 100644 --- a/skore/src/skore/sklearn/_estimator/metrics_accessor.py +++ b/skore/src/skore/sklearn/_estimator/metrics_accessor.py @@ -16,6 +16,7 @@ RocCurveDisplay, ) from skore.utils._accessor import _check_supported_ml_task +from skore.utils._index import flatten_multiindex ############################################################################### # Metrics accessor @@ -53,8 +54,9 @@ def report_metrics( y=None, scoring=None, scoring_names=None, - pos_label=None, scoring_kwargs=None, + pos_label=None, + flat_index=False, ): """Report a set of metrics for our estimator. 
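For context, the flattening that `flat_index=True` enables above joins the non-empty string levels of a pandas `MultiIndex` with underscores, exactly as the `flatten_multiindex` helper added later in this patch does. A minimal, self-contained sketch of that behaviour using plain pandas (the metric/label values are illustrative only):

```python
import pandas as pd

# A MultiIndex similar to the (metric, label) index returned by report_metrics().
index = pd.MultiIndex.from_tuples(
    [("Precision", "0"), ("Precision", "1"), ("ROC AUC", "")],
    names=["Metric", "Label / Average"],
)

# Join the non-empty levels with underscores, mirroring flatten_multiindex().
flat = pd.Index(["_".join(filter(bool, map(str, values))) for values in index])
print(list(flat))  # ['Precision_0', 'Precision_1', 'ROC AUC']
```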
@@ -88,11 +90,14 @@ def report_metrics( Used to overwrite the default scoring names in the report. It should be of the same length as the `scoring` parameter. + scoring_kwargs : dict, default=None + The keyword arguments to pass to the scoring functions. + pos_label : int, float, bool or str, default=None The positive class. - scoring_kwargs : dict, default=None - The keyword arguments to pass to the scoring functions. + flat_index : bool, default=False + Whether to flatten the multiindex columns. Returns ------- @@ -116,7 +121,7 @@ def report_metrics( ... X_test=X_test, ... y_test=y_test, ... ) - >>> report.metrics.report_metrics(pos_label=1) + >>> report.metrics.report_metrics(pos_label=1, flatten_multiindex=False) Metric Precision (↗︎) Recall (↗︎) ROC AUC (↗︎) Brier score (↘︎) LogisticRegression 0.98... 0.93... 0.99... 0.03... """ @@ -265,7 +270,13 @@ def report_metrics( names=name_index, ) - return pd.concat(scores, axis=1) + results = pd.concat(scores, axis=1) + if flat_index: + if isinstance(results.columns, pd.MultiIndex): + results.columns = flatten_multiindex(results.columns) + if isinstance(results.index, pd.MultiIndex): + results.index = flatten_multiindex(results.index) + return results def _compute_metric_scores( self, diff --git a/skore/src/skore/utils/_index.py b/skore/src/skore/utils/_index.py new file mode 100644 index 000000000..c91b7d3f8 --- /dev/null +++ b/skore/src/skore/utils/_index.py @@ -0,0 +1,66 @@ +import pandas as pd + + +def flatten_multiindex(index: pd.MultiIndex) -> pd.Index: + """Flatten a pandas MultiIndex into a single-level Index. + + Flatten a pandas MultiIndex into a single-level Index by joining the levels + with underscores. Empty strings are skipped when joining. + + Parameters + ---------- + index : pandas.MultiIndex + The `MultiIndex` to flatten. + + Returns + ------- + pandas.Index + A flattened `Index` with non-empty levels joined by underscores. + + Examples + -------- + >>> import pandas as pd + >>> mi = pd.MultiIndex.from_tuples( + ... [('a', ''), ('b', '2')], names=['letter', 'number'] + ... ) + >>> flatten_multiindex(mi) + Index(['a', 'b_2'], dtype='object') + """ + if not isinstance(index, pd.MultiIndex): + raise ValueError("`index` must be a MultiIndex.") + + return pd.Index(["_".join(filter(bool, map(str, values))) for values in index]) + + +def unflatten_index(index: pd.Index, names: list[str] | None = None) -> pd.MultiIndex: + """Create a MultiIndex from a flat Index with underscore-separated values. + + Convert a flat `Index` with underscore-separated values into a `MultiIndex`. + + Parameters + ---------- + index : pandas.Index + The flat Index with values separated by underscores. + names : list of str, optional + Names for the levels in the resulting MultiIndex. If None, levels will + be unnamed. + + Returns + ------- + pandas.MultiIndex + A MultiIndex with separate levels for each underscore-separated component. 
+ + Examples + -------- + >>> import pandas as pd + >>> flat_idx = pd.Index(['a_1', 'b_2']) + >>> unflatten_index(flat_idx, names=['letter', 'number']) + MultiIndex([('a', '1'), + ('b', '2')], + names=['letter', 'number']) + """ + if isinstance(index, pd.MultiIndex): + raise ValueError("`index` must be a flat Index.") + + tuples = [tuple(val.split("_")) for val in index] + return pd.MultiIndex.from_tuples(tuples, names=names) diff --git a/skore/tests/unit/utils/test_index.py b/skore/tests/unit/utils/test_index.py new file mode 100644 index 000000000..7559b1b23 --- /dev/null +++ b/skore/tests/unit/utils/test_index.py @@ -0,0 +1,108 @@ +import pandas as pd +import pytest +from skore.utils._index import flatten_multiindex, unflatten_index + + +@pytest.mark.parametrize( + "input_tuples, names, expected_values", + [ + pytest.param( + [("a", 1), ("b", 2)], ["letter", "number"], ["a_1", "b_2"], id="basic" + ), + pytest.param( + [("a", 1, "x"), ("b", 2, "y")], + ["letter", "number", "symbol"], + ["a_1_x", "b_2_y"], + id="multiple_levels", + ), + pytest.param( + [("a", None), (None, 2)], + ["letter", "number"], + ["a_nan", "nan_2.0"], + id="none_values", + ), + pytest.param( + [("a@b", "1#2"), ("c&d", "3$4")], + ["letter", "number"], + ["a@b_1#2", "c&d_3$4"], + id="special_chars", + ), + pytest.param([], ["letter", "number"], [], id="empty"), + ], +) +def test_flatten_multiindex(input_tuples, names, expected_values): + """Test flatten_multiindex with various input cases.""" + mi = pd.MultiIndex.from_tuples(input_tuples, names=names) + result = flatten_multiindex(mi) + expected = pd.Index(expected_values) + pd.testing.assert_index_equal(result, expected) + + +def test_flatten_multiindex_invalid_input(): + """Test that non-MultiIndex input raises ValueError.""" + simple_index = pd.Index(["a", "b"]) + with pytest.raises(ValueError, match="`index` must be a MultiIndex."): + flatten_multiindex(simple_index) + + +@pytest.mark.parametrize( + "input_values, names, expected_tuples", + [ + pytest.param( + ["a_1", "b_2"], ["letter", "number"], [("a", "1"), ("b", "2")], id="basic" + ), + pytest.param( + ["a_1_x", "b_2_y"], + ["letter", "number", "symbol"], + [("a", "1", "x"), ("b", "2", "y")], + id="multiple_components", + ), + pytest.param( + ["a_1", "b_2"], None, [("a", "1"), ("b", "2")], id="without_names" + ), + pytest.param( + ["a@b_1#2", "c&d_3$4"], + ["letter", "number"], + [("a@b", "1#2"), ("c&d", "3$4")], + id="special_chars", + ), + pytest.param([], ["letter", "number"], [], id="empty"), + ], +) +def test_unflatten_index(input_values, names, expected_tuples): + """Test unflatten_index with various input cases.""" + flat_idx = pd.Index(input_values) + result = unflatten_index(flat_idx, names=names) + expected = pd.MultiIndex.from_tuples(expected_tuples, names=names) + pd.testing.assert_index_equal(result, expected) + + +def test_unflatten_index_invalid_input(): + """Test that MultiIndex input raises ValueError.""" + mi = pd.MultiIndex.from_tuples([("a", "1"), ("b", "2")]) + with pytest.raises(ValueError, match="`index` must be a flat Index."): + unflatten_index(mi) + + +@pytest.mark.parametrize( + "input_values, names, expected_names", + [ + pytest.param( + ["a_1", "b_2"], + ["letter", "number"], + ["letter", "number"], + id="matching_names", + ), + pytest.param( + ["a_1_x", "b_2_y"], + ["level0", "level1", "level2"], + ["level0", "level1", "level2"], + id="three_component_names", + ), + ], +) +def test_unflatten_index_mismatched_names(input_values, names, expected_names): + """Test unflatten_index 
with mismatched number of names.""" + flat_idx = pd.Index(input_values) + result = unflatten_index(flat_idx, names=names) + assert result.names == expected_names From 6736c00d579766ad747903174eb0dfcb93b10ec7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 18 Feb 2025 23:26:27 +0100 Subject: [PATCH 2/9] TST add tests --- .../sklearn/_estimator/metrics_accessor.py | 2 +- .../unit/sklearn/test_cross_validation.py | 25 +++++++++++++++++++ skore/tests/unit/sklearn/test_estimator.py | 22 ++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/skore/src/skore/sklearn/_estimator/metrics_accessor.py b/skore/src/skore/sklearn/_estimator/metrics_accessor.py index 2f41980b8..d39f9157e 100644 --- a/skore/src/skore/sklearn/_estimator/metrics_accessor.py +++ b/skore/src/skore/sklearn/_estimator/metrics_accessor.py @@ -331,7 +331,7 @@ def report_metrics( names=name_index, ) - results = pd.concat(scores, axis=1) + results = pd.concat(scores, axis=0) if flat_index: if isinstance(results.columns, pd.MultiIndex): results.columns = flatten_multi_index(results.columns) diff --git a/skore/tests/unit/sklearn/test_cross_validation.py b/skore/tests/unit/sklearn/test_cross_validation.py index 381cc9cc1..532b2e4ac 100644 --- a/skore/tests/unit/sklearn/test_cross_validation.py +++ b/skore/tests/unit/sklearn/test_cross_validation.py @@ -219,6 +219,31 @@ def test_cross_validation_report_pickle(tmp_path, binary_classification_data): joblib.dump(report, tmp_path / "report.joblib") +def test_cross_validation_report_flat_index(binary_classification_data): + """Check that the index is flattened when `flat_index` is True. + + Since `pos_label` is None, then by default a MultiIndex would be returned. + Here, we force to have a single-index by passing `flat_index=True`. + """ + estimator, X, y = binary_classification_data + report = CrossValidationReport(estimator, X=X, y=y, cv_splitter=2) + result = report.metrics.report_metrics(flat_index=True) + assert result.shape == (6, 2) + assert isinstance(result.index, pd.Index) + assert result.index.tolist() == [ + "Precision (↗︎)_0", + "Precision (↗︎)_1", + "Recall (↗︎)_0", + "Recall (↗︎)_1", + "ROC AUC (↗︎)", + "Brier score (↘︎)", + ] + assert result.columns.tolist() == [ + "RandomForestClassifier_Split #0", + "RandomForestClassifier_Split #1", + ] + + ######################################################################################## # Check the plot methods ######################################################################################## diff --git a/skore/tests/unit/sklearn/test_estimator.py b/skore/tests/unit/sklearn/test_estimator.py index 24f0aae6a..4fc422b40 100644 --- a/skore/tests/unit/sklearn/test_estimator.py +++ b/skore/tests/unit/sklearn/test_estimator.py @@ -347,6 +347,28 @@ def test_estimator_report_pickle(tmp_path, binary_classification_data): joblib.dump(report, tmp_path / "report.joblib") +def test_estimator_report_flat_index(binary_classification_data): + """Check that the index is flattened when `flat_index` is True. + + Since `pos_label` is None, then by default a MultiIndex would be returned. + Here, we force to have a single-index by passing `flat_index=True`. 
+ """ + estimator, X_test, y_test = binary_classification_data + report = EstimatorReport(estimator, X_test=X_test, y_test=y_test) + result = report.metrics.report_metrics(flat_index=True) + assert result.shape == (6, 1) + assert isinstance(result.index, pd.Index) + assert result.index.tolist() == [ + "Precision (↗︎)_0", + "Precision (↗︎)_1", + "Recall (↗︎)_0", + "Recall (↗︎)_1", + "ROC AUC (↗︎)", + "Brier score (↘︎)", + ] + assert result.columns.tolist() == ["RandomForestClassifier"] + + ######################################################################################## # Check the plot methods ######################################################################################## From 30ed3353c6eea1b6fa8a6b008b7e842945d2cb6e Mon Sep 17 00:00:00 2001 From: "Thomas S." Date: Wed, 19 Feb 2025 16:17:34 +0100 Subject: [PATCH 3/9] feat: Add `ComparisonReport` to compare instances of `EstimatorReport` (#1286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - [x] Rename to `ComparisonReport` - [x] Rebase on top of #1239 and adapt - [x] Raise if `report.metrics.accuracy(data_source="train")` is called with at least one EstimatorReport that does not have training data - [x] Test - [x] Docstrings - [x] MetricsAccessor - [x] Move index column "#0" in front of each metric - [x] Pass report names in comparator - [ ] ~Update plots legend~ see #1309 - The actual `RocCurveDisplay` needs a full refactor to be splitted by use-case: estimator report, cross-validation report and finally comparison report. In each of these use-cases, there is two scenarios with binary classification and multi-class classification. Otherwise, it will be unmaintainable. - [ ] ~Investigate missing metrics in `report_metrics`~ **(deferred to future PR)** - The logic is split between `report_metrics` and `available_if`; it should be merged (ideally everything in `available_if`?) 
- [ ] ~Refactor to make `CrossValidationReport` depend on it~ **(deferred to future PR)** - [x] ~Change EstimatorReport `repr`?~ Issue https://github.com/probabl-ai/skore/issues/1293 Closes #1245 Co-authored-by: Auguste Co-authored-by: Sylvain Combettes <48064216+sylvaincom@users.noreply.github.com> --- README.md | 8 +- .../plot_skore_getting_started.py | 68 +- examples/use_cases/plot_employee_salaries.py | 5 + skore/src/skore/__init__.py | 2 + skore/src/skore/sklearn/__init__.py | 2 + skore/src/skore/sklearn/_base.py | 2 +- .../src/skore/sklearn/_comparison/__init__.py | 7 + .../sklearn/_comparison/metrics_accessor.py | 1080 +++++++++++++++++ skore/src/skore/sklearn/_comparison/report.py | 170 +++ .../_cross_validation/metrics_accessor.py | 19 +- .../skore/sklearn/_cross_validation/report.py | 3 +- .../sklearn/_estimator/metrics_accessor.py | 22 +- skore/tests/unit/sklearn/test_comparison.py | 536 ++++++++ skore/tests/unit/sklearn/test_estimator.py | 7 +- sphinx/api/skore.config_context.rst | 10 + sphinx/api/skore.get_config.rst | 10 + sphinx/api/skore.set_config.rst | 10 + sphinx/index.rst | 4 +- sphinx/reference/report/comparison_report.rst | 50 + sphinx/reference/report/index.rst | 12 + 20 files changed, 1986 insertions(+), 41 deletions(-) create mode 100644 skore/src/skore/sklearn/_comparison/__init__.py create mode 100644 skore/src/skore/sklearn/_comparison/metrics_accessor.py create mode 100644 skore/src/skore/sklearn/_comparison/report.py create mode 100644 skore/tests/unit/sklearn/test_comparison.py create mode 100644 sphinx/api/skore.config_context.rst create mode 100644 sphinx/api/skore.get_config.rst create mode 100644 sphinx/api/skore.set_config.rst create mode 100644 sphinx/reference/report/comparison_report.rst diff --git a/README.md b/README.md index dfa9fca31..6248eaa3d 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,8 @@ skore is a Python open-source library designed to help data scientists apply rec - `train_test_split` supercharged with methodological guidance: the API is the same as scikit-learn's, but skore displays warnings when applicable. For example, it warns you against shuffling time series data or when you have class imbalance. - **Evaluate**: automated insightful reports. - `EstimatorReport`: feed your scikit-learn compatible estimator and dataset, and it generates recommended metrics and plots to help you analyze your estimator. All these are computed and generated for you in 1 line of code. Under the hood, we use efficient caching to make the computations blazing fast. - - `CrossValidationReport`: Get a skore estimator report for each fold of your cross-validation. + - `CrossValidationReport`: get a skore estimator report for each fold of your cross-validation. + - `ComparisonReport`: benchmark your skore estimator reports. ## What's next? @@ -91,7 +92,7 @@ You can find information on the latest version [here](https://anaconda.org/conda ```python # Display the ROC curve that was generated for you: roc_plot = cv_report.metrics.roc() - roc_plot + roc_plot.plot() ``` 1. Store your results for safe-keeping. @@ -109,7 +110,8 @@ You can find information on the latest version [here](https://anaconda.org/conda ```python # Get your results - df_get = my_project.put("df_cv_report_metrics") + df_get = my_project.get("df_cv_report_metrics") + df_get ``` Learn more in our [documentation](https://skore.probabl.ai). 
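To make the new `ComparisonReport` bullet above concrete, here is a minimal sketch of the intended benchmarking workflow (the dataset and estimator choices are illustrative; the API follows the `ComparisonReport` introduced later in this patch):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from skore import ComparisonReport, EstimatorReport

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# One EstimatorReport per candidate model, all evaluated on the same test set.
reports = [
    EstimatorReport(
        estimator, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
    )
    for estimator in (LogisticRegression(), RandomForestClassifier(random_state=0))
]

# Benchmark the reports: one column of metrics per estimator.
comparison = ComparisonReport(reports)
comparison.metrics.report_metrics()
```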
diff --git a/examples/getting_started/plot_skore_getting_started.py b/examples/getting_started/plot_skore_getting_started.py index ce92bbfab..1be7e1c0b 100644 --- a/examples/getting_started/plot_skore_getting_started.py +++ b/examples/getting_started/plot_skore_getting_started.py @@ -17,6 +17,8 @@ # * :class:`skore.CrossValidationReport`: get an insightful report on your # cross-validation results # +# * :class:`skore.ComparisonReport`: benchmark your skore estimator reports +# # * :func:`skore.train_test_split`: get diagnostics when splitting your data # # #. Track your ML/DS results using skore's :class:`~skore.Project` @@ -50,10 +52,10 @@ X, y = make_classification(n_classes=2, n_samples=100_000, n_informative=4) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) -clf = LogisticRegression(random_state=0) +log_reg = LogisticRegression(random_state=0) -est_report = EstimatorReport( - clf, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test +log_reg_report = EstimatorReport( + log_reg, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test ) # %% @@ -61,14 +63,14 @@ # (skore detected that we are doing binary classification): # %% -est_report.help() +log_reg_report.help() # %% # We can get the report metrics that was computed for us: # %% -df_est_report_metrics = est_report.metrics.report_metrics() -df_est_report_metrics +df_log_reg_report_metrics = log_reg_report.metrics.report_metrics() +df_log_reg_report_metrics # %% # We can also plot the ROC curve that was generated for us: @@ -76,7 +78,7 @@ # %% import matplotlib.pyplot as plt -roc_plot = est_report.metrics.roc() +roc_plot = log_reg_report.metrics.roc() roc_plot.plot() plt.tight_layout() @@ -97,7 +99,7 @@ # %% from skore import CrossValidationReport -cv_report = CrossValidationReport(clf, X, y, cv_splitter=5) +cv_report = CrossValidationReport(log_reg, X, y, cv_splitter=5) # %% # We display the cross-validation report helper: @@ -125,9 +127,9 @@ # for example the first fold: # %% -est_report_fold = cv_report.estimator_reports_[0] -df_report_metrics_fold = est_report_fold.metrics.report_metrics() -df_report_metrics_fold +log_reg_report_fold = cv_report.estimator_reports_[0] +df_log_reg_report_fold_metrics = log_reg_report_fold.metrics.report_metrics() +df_log_reg_report_fold_metrics # %% # .. seealso:: @@ -135,6 +137,50 @@ # For more information about the motivation and usage of # :class:`skore.CrossValidationReport`, see :ref:`example_use_case_employee_salaries`. +# %% +# Comparing estimators reports +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# :class:`skore.ComparisonReport` enables users to compare several estimator reports +# (corresponding to several estimators) on a same test set, as in a benchmark of +# estimators. 
+# +# Apart from the previous ``log_reg_report``, let use define another estimator report: + +# %% +from sklearn.ensemble import RandomForestClassifier + +rf = RandomForestClassifier(max_depth=2, random_state=0) +rf_report = EstimatorReport( + rf, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test +) + +# %% +# Now, let us compare these two estimator reports, that were applied to the exact +# same test set: + +# %% +from skore import ComparisonReport + +comparator = ComparisonReport(reports=[log_reg_report, rf_report]) + +# %% +# As for the :class:`~skore.EstimatorReport` and the +# :class:`~skore.CrossValidationReport`, we have a helper: + +# %% +comparator.help() + +# %% +# Let us display the result of our benchmark: + +# %% +benchmark_metrics = comparator.metrics.report_metrics() +benchmark_metrics + +# %% +# We have the result of our benchmark. + # %% # Train-test split with skore # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/examples/use_cases/plot_employee_salaries.py b/examples/use_cases/plot_employee_salaries.py index 64489cb00..c133270ca 100644 --- a/examples/use_cases/plot_employee_salaries.py +++ b/examples/use_cases/plot_employee_salaries.py @@ -298,6 +298,11 @@ def periodic_spline_transformer(period, n_splines=None, degree=3): ) results +# %% +# .. note:: +# We could have also used the :class:`skore.ComparisonReport` to compare estimator +# reports. + # %% # # Finally, we can even get the individual :class:`~skore.EstimatorReport` for each fold diff --git a/skore/src/skore/__init__.py b/skore/src/skore/__init__.py index c4c7d1c04..b8d06f5cf 100644 --- a/skore/src/skore/__init__.py +++ b/skore/src/skore/__init__.py @@ -8,6 +8,7 @@ from skore._config import config_context, get_config, set_config from skore.project import Project, open from skore.sklearn import ( + ComparisonReport, CrossValidationReport, EstimatorReport, PrecisionRecallCurveDisplay, @@ -20,6 +21,7 @@ __all__ = [ "CrossValidationReport", + "ComparisonReport", "EstimatorReport", "PrecisionRecallCurveDisplay", "PredictionErrorDisplay", diff --git a/skore/src/skore/sklearn/__init__.py b/skore/src/skore/sklearn/__init__.py index 0b5858999..f1abb357c 100644 --- a/skore/src/skore/sklearn/__init__.py +++ b/skore/src/skore/sklearn/__init__.py @@ -1,5 +1,6 @@ """Enhance `sklearn` functions.""" +from skore.sklearn._comparison import ComparisonReport from skore.sklearn._cross_validation import CrossValidationReport from skore.sklearn._estimator import EstimatorReport from skore.sklearn._plot import ( @@ -13,6 +14,7 @@ "train_test_split", "CrossValidationReport", "EstimatorReport", + "ComparisonReport", "RocCurveDisplay", "PrecisionRecallCurveDisplay", "PredictionErrorDisplay", diff --git a/skore/src/skore/sklearn/_base.py b/skore/src/skore/sklearn/_base.py index d62963cb9..c23290b38 100644 --- a/skore/src/skore/sklearn/_base.py +++ b/skore/src/skore/sklearn/_base.py @@ -124,7 +124,7 @@ def _get_attributes_for_help(self): def _create_help_tree(self): """Create a rich Tree with the available tools and accessor methods.""" - tree = Tree("report") + tree = Tree(self.__class__.__name__) # Add accessor methods first for accessor_attr, config in self._ACCESSOR_CONFIG.items(): diff --git a/skore/src/skore/sklearn/_comparison/__init__.py b/skore/src/skore/sklearn/_comparison/__init__.py new file mode 100644 index 000000000..eb72e33f8 --- /dev/null +++ b/skore/src/skore/sklearn/_comparison/__init__.py @@ -0,0 +1,7 @@ +from skore.externals._pandas_accessors import _register_accessor +from 
skore.sklearn._comparison.metrics_accessor import _MetricsAccessor +from skore.sklearn._comparison.report import ComparisonReport + +_register_accessor("metrics", ComparisonReport)(_MetricsAccessor) + +__all__ = ["ComparisonReport"] diff --git a/skore/src/skore/sklearn/_comparison/metrics_accessor.py b/skore/src/skore/sklearn/_comparison/metrics_accessor.py new file mode 100644 index 000000000..7f07a3d0c --- /dev/null +++ b/skore/src/skore/sklearn/_comparison/metrics_accessor.py @@ -0,0 +1,1080 @@ +import joblib +import numpy as np +import pandas as pd +from sklearn.metrics import make_scorer +from sklearn.utils.metaestimators import available_if + +from skore.externals._pandas_accessors import DirNamesMixin +from skore.sklearn._base import _BaseAccessor +from skore.utils._accessor import _check_supported_ml_task +from skore.utils._progress_bar import progress_decorator + + +class _MetricsAccessor(_BaseAccessor, DirNamesMixin): + """Accessor for metrics-related operations. + + You can access this accessor using the `metrics` attribute. + """ + + _SCORE_OR_LOSS_INFO = { + "accuracy": {"name": "Accuracy", "icon": "(↗︎)"}, + "precision": {"name": "Precision", "icon": "(↗︎)"}, + "recall": {"name": "Recall", "icon": "(↗︎)"}, + "brier_score": {"name": "Brier score", "icon": "(↘︎)"}, + "roc_auc": {"name": "ROC AUC", "icon": "(↗︎)"}, + "log_loss": {"name": "Log loss", "icon": "(↘︎)"}, + "r2": {"name": "R²", "icon": "(↗︎)"}, + "rmse": {"name": "RMSE", "icon": "(↘︎)"}, + "custom_metric": {"name": "Custom metric", "icon": ""}, + "report_metrics": {"name": "Report metrics", "icon": ""}, + } + + def __init__(self, parent): + super().__init__(parent) + + self._parent_progress = None + + def report_metrics( + self, + *, + data_source="test", + X=None, + y=None, + scoring=None, + scoring_names=None, + pos_label=None, + scoring_kwargs=None, + ): + """Report a set of metrics for the estimators. + + Parameters + ---------- + data_source : {"test", "train", "X_y"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + scoring : list of str, callable, or scorer, default=None + The metrics to report. You can get the possible list of strings by calling + `report.metrics.help()`. When passing a callable, it should take as + arguments ``y_true``, ``y_pred`` as the two first arguments. Additional + arguments can be passed as keyword arguments and will be forwarded with + `scoring_kwargs`. If the callable API is too restrictive (e.g. need to pass + same parameter name with different values), you can use scikit-learn scorers + as provided by :func:`sklearn.metrics.make_scorer`. + + scoring_names : list of str, default=None + Used to overwrite the default scoring names in the report. It should be of + the same length as the ``scoring`` parameter. + + pos_label : int, float, bool or str, default=None + The positive class. + + scoring_kwargs : dict, default=None + The keyword arguments to pass to the scoring functions. + + Returns + ------- + pd.DataFrame + The statistics for the metrics. 
+ + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.report_metrics( + ... scoring=["precision", "recall"], + ... pos_label=1, + ... ) + Estimator LogisticRegression LogisticRegression + Metric + Precision (↗︎) 0.96... 0.96... + Recall (↗︎) 0.97... 0.97... + """ + return self._compute_metric_scores( + report_metric_name="report_metrics", + data_source=data_source, + X=X, + y=y, + scoring=scoring, + pos_label=pos_label, + scoring_kwargs=scoring_kwargs, + scoring_names=scoring_names, + ) + + @progress_decorator(description="Compute metric for each split") + def _compute_metric_scores( + self, + report_metric_name, + *, + data_source="test", + X=None, + y=None, + **metric_kwargs, + ): + cache_key = (self._parent._hash, report_metric_name, data_source) + + # we need to enforce the order of the parameter for a specific metric + # to make sure that we hit the cache in a consistent way + ordered_metric_kwargs = sorted(metric_kwargs.keys()) + + for key in ordered_metric_kwargs: + if isinstance(metric_kwargs[key], (np.ndarray, list, dict)): + cache_key += (joblib.hash(metric_kwargs[key]),) + else: + cache_key += (metric_kwargs[key],) + + progress = self._progress_info["current_progress"] + main_task = self._progress_info["current_task"] + + total_estimators = len(self._parent.estimator_reports_) + progress.update(main_task, total=total_estimators) + + if cache_key in self._parent._cache: + results = self._parent._cache[cache_key] + else: + parallel = joblib.Parallel( + n_jobs=self._parent.n_jobs, + return_as="generator", + require="sharedmem", + ) + generator = parallel( + joblib.delayed(getattr(report.metrics, report_metric_name))( + data_source=data_source, + X=X, + y=y, + **metric_kwargs, + ) + for report in self._parent.estimator_reports_ + ) + results = [] + for result in generator: + results.append(result) + progress.update(main_task, advance=1, refresh=True) + + results = pd.concat(results, axis=1) + results.columns = pd.Index(self._parent.report_names_, name="Estimator") + + self._parent._cache[cache_key] = results + return results + + @available_if( + _check_supported_ml_task( + supported_ml_tasks=["binary-classification", "multiclass-classification"] + ) + ) + def accuracy(self, *, data_source="test", X=None, y=None): + """Compute the accuracy score. + + Parameters + ---------- + data_source : {"test", "train", "X_y"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. 
+ + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + Returns + ------- + pd.DataFrame + The accuracy score. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.accuracy() + Estimator LogisticRegression LogisticRegression + Metric + Accuracy (↗︎) 0.96... 0.96... + """ + return self.report_metrics( + scoring=["accuracy"], + data_source=data_source, + X=X, + y=y, + ) + + @available_if( + _check_supported_ml_task( + supported_ml_tasks=["binary-classification", "multiclass-classification"] + ) + ) + def precision( + self, + *, + data_source="test", + X=None, + y=None, + average=None, + pos_label=None, + ): + """Compute the precision score. + + Parameters + ---------- + data_source : {"test", "train", "X_y"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + average : {"binary", "macro", "micro", "weighted", "samples"} or None, \ + default=None + Used with multiclass problems. + If `None`, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + - "binary": Only report results for the class specified by `pos_label`. + This is applicable only if targets (`y_{true,pred}`) are binary. + - "micro": Calculate metrics globally by counting the total true positives, + false negatives and false positives. + - "macro": Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + - "weighted": Calculate metrics for each label, and find their average + weighted by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an F-score + that is not between precision and recall. 
+ - "samples": Calculate metrics for each instance, and find their average + (only meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + .. note:: + If `pos_label` is specified and `average` is None, then we report + only the statistics of the positive class (i.e. equivalent to + `average="binary"`). + + pos_label : int, float, bool or str, default=None + The positive class. + + Returns + ------- + pd.DataFrame + The precision score. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.precision() + Estimator LogisticRegression LogisticRegression + Metric Label / Average + Precision (↗︎) 0 0.96... 0.96... + 1 0.96... 0.96... + """ + return self.report_metrics( + scoring=["precision"], + data_source=data_source, + X=X, + y=y, + pos_label=pos_label, + scoring_kwargs={"average": average}, + ) + + @available_if( + _check_supported_ml_task( + supported_ml_tasks=["binary-classification", "multiclass-classification"] + ) + ) + def recall( + self, + *, + data_source="test", + X=None, + y=None, + average=None, + pos_label=None, + ): + """Compute the recall score. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + average : {"binary","macro", "micro", "weighted", "samples"} or None, \ + default=None + Used with multiclass problems. + If `None`, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + - "binary": Only report results for the class specified by `pos_label`. + This is applicable only if targets (`y_{true,pred}`) are binary. + - "micro": Calculate metrics globally by counting the total true positives, + false negatives and false positives. + - "macro": Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + - "weighted": Calculate metrics for each label, and find their average + weighted by support (the number of true instances for each label). 
This + alters 'macro' to account for label imbalance; it can result in an F-score + that is not between precision and recall. Weighted recall is equal to + accuracy. + - "samples": Calculate metrics for each instance, and find their average + (only meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + .. note:: + If `pos_label` is specified and `average` is None, then we report + only the statistics of the positive class (i.e. equivalent to + `average="binary"`). + + pos_label : int, float, bool or str, default=None + The positive class. + + Returns + ------- + pd.DataFrame + The recall score. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.recall() + Estimator LogisticRegression LogisticRegression + Metric Label / Average + Recall (↗︎) 0 0.944... 0.944... + 1 0.977... 0.977... + """ + return self.report_metrics( + scoring=["recall"], + data_source=data_source, + X=X, + y=y, + pos_label=pos_label, + scoring_kwargs={"average": average}, + ) + + @available_if( + _check_supported_ml_task(supported_ml_tasks=["binary-classification"]) + ) + def brier_score( + self, + *, + data_source="test", + X=None, + y=None, + ): + """Compute the Brier score. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + Returns + ------- + pd.DataFrame + The Brier score. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... 
) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.brier_score() + Estimator LogisticRegression LogisticRegression + Metric + Brier score (↘︎) 0.025... 0.025... + """ + return self.report_metrics( + scoring=["brier_score"], + data_source=data_source, + X=X, + y=y, + ) + + @available_if( + _check_supported_ml_task( + supported_ml_tasks=["binary-classification", "multiclass-classification"] + ) + ) + def roc_auc( + self, + *, + data_source="test", + X=None, + y=None, + average=None, + multi_class="ovr", + ): + """Compute the ROC AUC score. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + average : {"auto", "macro", "micro", "weighted", "samples"}, \ + default=None + Average to compute the ROC AUC score in a multiclass setting. By default, + no average is computed. Otherwise, this determines the type of averaging + performed on the data. + + - "micro": Calculate metrics globally by considering each element of + the label indicator matrix as a label. + - "macro": Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + - "weighted": Calculate metrics for each label, and find their average, + weighted by support (the number of true instances for each label). + - "samples": Calculate metrics for each instance, and find their + average. + + .. note:: + Multiclass ROC AUC currently only handles the "macro" and + "weighted" averages. For multiclass targets, `average=None` is only + implemented for `multi_class="ovr"` and `average="micro"` is only + implemented for `multi_class="ovr"`. + + multi_class : {"raise", "ovr", "ovo"}, default="ovr" + The multi-class strategy to use. + + - "raise": Raise an error if the data is multiclass. + - "ovr": Stands for One-vs-rest. Computes the AUC of each class against the + rest. This treats the multiclass case in the same way as the multilabel + case. Sensitive to class imbalance even when `average == "macro"`, + because class imbalance affects the composition of each of the "rest" + groupings. + - "ovo": Stands for One-vs-one. Computes the average AUC of all possible + pairwise combinations of classes. Insensitive to class imbalance when + `average == "macro"`. + + Returns + ------- + pd.DataFrame + The ROC AUC score. 
+ + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.roc_auc() + Estimator LogisticRegression LogisticRegression + Metric + ROC AUC (↗︎) 0.99... 0.99... + """ + return self.report_metrics( + scoring=["roc_auc"], + data_source=data_source, + X=X, + y=y, + scoring_kwargs={"average": average, "multi_class": multi_class}, + ) + + @available_if( + _check_supported_ml_task( + supported_ml_tasks=["binary-classification", "multiclass-classification"] + ) + ) + def log_loss( + self, + *, + data_source="test", + X=None, + y=None, + ): + """Compute the log loss. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + Returns + ------- + pd.DataFrame + The log-loss. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.log_loss() + Estimator LogisticRegression LogisticRegression + Metric + Log loss (↘︎) 0.082... 0.082... 
+ """ + return self.report_metrics( + scoring=["log_loss"], + data_source=data_source, + X=X, + y=y, + ) + + @available_if(_check_supported_ml_task(supported_ml_tasks=["regression"])) + def r2( + self, + *, + data_source="test", + X=None, + y=None, + multioutput="raw_values", + ): + """Compute the R² score. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + multioutput : {"raw_values", "uniform_average"} or array-like of shape \ + (n_outputs,), default="raw_values" + Defines aggregating of multiple output values. Array-like value defines + weights used to average errors. The other possible values are: + + - "raw_values": Returns a full set of errors in case of multioutput input. + - "uniform_average": Errors of all outputs are averaged with uniform weight. + + By default, no averaging is done. + + Returns + ------- + pd.DataFrame + The R² score. + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_diabetes(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = Ridge(random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = Ridge(random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.r2() + Estimator Ridge Ridge + Metric + R² (↗︎) 0.43... 0.43... + """ + return self.report_metrics( + scoring=["r2"], + data_source=data_source, + X=X, + y=y, + scoring_kwargs={"multioutput": multioutput}, + ) + + @available_if(_check_supported_ml_task(supported_ml_tasks=["regression"])) + def rmse( + self, + *, + data_source="test", + X=None, + y=None, + multioutput="raw_values", + ): + """Compute the root mean squared error. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. 
+ + multioutput : {"raw_values", "uniform_average"} or array-like of shape \ + (n_outputs,), default="raw_values" + Defines aggregating of multiple output values. Array-like value defines + weights used to average errors. The other possible values are: + + - "raw_values": Returns a full set of errors in case of multioutput input. + - "uniform_average": Errors of all outputs are averaged with uniform weight. + + By default, no averaging is done. + + Returns + ------- + pd.DataFrame + The root mean squared error. + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_diabetes(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = Ridge(random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = Ridge(random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.rmse() + Estimator Ridge Ridge + Metric + RMSE (↘︎) 55.726... 55.726... + """ + return self.report_metrics( + scoring=["rmse"], + data_source=data_source, + X=X, + y=y, + scoring_kwargs={"multioutput": multioutput}, + ) + + def custom_metric( + self, + metric_function, + response_method, + *, + metric_name=None, + data_source="test", + X=None, + y=None, + **kwargs, + ): + """Compute a custom metric. + + It brings some flexibility to compute any desired metric. However, we need to + follow some rules: + + - `metric_function` should take `y_true` and `y_pred` as the first two + positional arguments. + - `response_method` corresponds to the estimator's method to be invoked to get + the predictions. It can be a string or a list of strings to defined in which + order the methods should be invoked. + + Parameters + ---------- + metric_function : callable + The metric function to be computed. The expected signature is + `metric_function(y_true, y_pred, **kwargs)`. + + response_method : str or list of str + The estimator's method to be invoked to get the predictions. The possible + values are: `predict`, `predict_proba`, `predict_log_proba`, and + `decision_function`. + + metric_name : str, default=None + The name of the metric. If not provided, it will be inferred from the + metric function. + + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + **kwargs : dict + Any additional keyword arguments to be passed to the metric function. + + Returns + ------- + pd.DataFrame + The custom metric. 
+ + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.metrics import mean_absolute_error + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_diabetes(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = Ridge(random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = Ridge(random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.custom_metric( + ... metric_function=mean_absolute_error, + ... response_method="predict", + ... metric_name="MAE (↗︎)", + ... ) + Estimator Ridge Ridge + Metric + MAE (↗︎) 45.91... 45.91... + """ + # create a scorer with `greater_is_better=True` to not alter the output of + # `metric_function` + scorer = make_scorer( + metric_function, + greater_is_better=True, + response_method=response_method, + **kwargs, + ) + return self.report_metrics( + scoring=[scorer], + data_source=data_source, + X=X, + y=y, + scoring_names=[metric_name], + ) + + #################################################################################### + # Methods related to the help tree + #################################################################################### + + def _sort_methods_for_help(self, methods): + """Override sort method for metrics-specific ordering. + + In short, we display the `report_metrics` first and then the `custom_metric`. 
+ """ + + def _sort_key(method): + name = method[0] + if name == "custom_metric": + priority = 1 + elif name == "report_metrics": + priority = 2 + else: + priority = 0 + return priority, name + + return sorted(methods, key=_sort_key) + + def _format_method_name(self, name): + """Override format method for metrics-specific naming.""" + method_name = f"{name}(...)" + method_name = method_name.ljust(22) + if name in self._SCORE_OR_LOSS_INFO and self._SCORE_OR_LOSS_INFO[name][ + "icon" + ] in ("(↗︎)", "(↘︎)"): + if self._SCORE_OR_LOSS_INFO[name]["icon"] == "(↗︎)": + method_name += f"[cyan]{self._SCORE_OR_LOSS_INFO[name]['icon']}[/cyan]" + return method_name.ljust(43) + else: # (↘︎) + method_name += ( + f"[orange1]{self._SCORE_OR_LOSS_INFO[name]['icon']}[/orange1]" + ) + return method_name.ljust(49) + else: + return method_name.ljust(29) + + def _get_methods_for_help(self): + """Override to exclude the plot accessor from methods list.""" + methods = super()._get_methods_for_help() + return [(name, method) for name, method in methods if name != "plot"] + + def _get_help_panel_title(self): + return "[bold cyan]Available metrics methods[/bold cyan]" + + def _get_help_legend(self): + return ( + "[cyan](↗︎)[/cyan] higher is better [orange1](↘︎)[/orange1] lower is better" + ) + + def _get_help_tree_title(self): + return "[bold cyan]report.metrics[/bold cyan]" + + def __repr__(self): + """Return a string representation using rich.""" + return self._rich_repr( + class_name="skore.ComparisonReport.metrics", + help_method_name="report.metrics.help()", + ) diff --git a/skore/src/skore/sklearn/_comparison/report.py b/skore/src/skore/sklearn/_comparison/report.py new file mode 100644 index 000000000..ac4c3a30f --- /dev/null +++ b/skore/src/skore/sklearn/_comparison/report.py @@ -0,0 +1,170 @@ +from __future__ import annotations + +import time +from collections.abc import Iterable +from typing import Optional, Union + +import joblib +import numpy as np + +from skore.externals._pandas_accessors import DirNamesMixin +from skore.sklearn._base import _BaseReport +from skore.sklearn._estimator.report import EstimatorReport + + +class ComparisonReport(_BaseReport, DirNamesMixin): + """Report for comparison of instances of :class:`skore.EstimatorReport`. + + Caution: reports passed to `ComparisonReport` are not copied. If you pass + a report to `ComparisonReport`, and then modify the report outside later, it will + affect the report stored inside the `ComparisonReport` as well, which can lead to + inconsistent results. For this reason, modifying reports after creation is strongly + discouraged. + + Parameters + ---------- + reports : list of :class:`~skore.EstimatorReport` instances or dict + Estimator reports to compare. + + * If `reports` is a list, the class name of each estimator is used. + * If `reports` is a dict, it is expected to have estimator names as keys + and :class:`~skore.EstimatorReport` instances as values. + If the keys are not strings, they will be converted to strings. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimators and computing + the scores are parallelized. + When accessing some methods of the `ComparisonReport`, the `n_jobs` + parameter is used to parallelize the computation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. + + Attributes + ---------- + estimator_reports_ : list of `~skore.EstimatorReport` + The compared estimator reports. 
+ + report_names_ : list of str + The names of the compared estimator reports. + + See Also + -------- + skore.EstimatorReport + Report for a fitted estimator. + + skore.CrossValidationReport + Report for the cross-validation of an estimator. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = make_classification(random_state=42) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression() + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test + ... ) + >>> estimator_2 = LogisticRegression(C=2) # Different regularization + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test + ... ) + >>> report = ComparisonReport([estimator_report_1, estimator_report_2]) + ... + >>> report = ComparisonReport( + ... {"model1": estimator_report_1, "model2": estimator_report_2} + ... ) + ... + """ + + _ACCESSOR_CONFIG = { + "metrics": {"name": "metrics"}, + } + + def __init__( + self, + reports: Union[list[EstimatorReport], dict[str, EstimatorReport]], + *, + n_jobs: Optional[int] = None, + ): + """ + ComparisonReport instance initializer. + + Notes + ----- + We check that the estimator reports can be compared: + - all reports are estimator reports, + - all estimators are in the same ML use case, + - all estimators have non-empty X_test and y_test, + - all estimators have the same X_test and y_test. + """ + if not isinstance(reports, Iterable): + raise TypeError(f"Expected reports to be an iterable; got {type(reports)}") + + if len(reports) < 2: + raise ValueError("At least 2 instances of EstimatorReport are needed") + + report_names = ( + list(map(str, reports.keys())) if isinstance(reports, dict) else None + ) + reports = list(reports.values()) if isinstance(reports, dict) else reports + + if not all(isinstance(report, EstimatorReport) for report in reports): + raise TypeError("Expected instances of EstimatorReport") + + test_dataset_hashes = { + joblib.hash((report.X_test, report.y_test)) + for report in reports + if not ((report.X_test is None) and (report.y_test is None)) + } + if len(test_dataset_hashes) > 1: + raise ValueError("Expected all estimators to have the same testing data.") + + ml_tasks = {report: report._ml_task for report in reports} + if len(set(ml_tasks.values())) > 1: + raise ValueError( + f"Expected all estimators to have the same ML usecase; " + f"got {ml_tasks}" + ) + + if report_names is None: + self.report_names_ = [report.estimator_name_ for report in reports] + else: + self.report_names_ = report_names + + self.estimator_reports_ = reports + + # NEEDED FOR METRICS ACCESSOR + self.n_jobs = n_jobs + self._rng = np.random.default_rng(time.time_ns()) + self._hash = self._rng.integers( + low=np.iinfo(np.int64).min, high=np.iinfo(np.int64).max + ) + self._cache = {} + self._ml_task = self.estimator_reports_[0]._ml_task + + #################################################################################### + # Methods related to the help and repr + #################################################################################### + + def _get_help_panel_title(self): + return "[bold cyan]Tools to compare estimators[/bold cyan]" 
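The identical-test-set check in `__init__` above compares datasets by hashing the `(X_test, y_test)` pair with `joblib.hash` rather than comparing arrays element by element. A small, self-contained sketch of the same idea (illustrative, not part of the patch):

    # Sketch: detect whether several (X, y) test pairs carry the same data by hashing.
    import joblib
    import numpy as np

    X_a, y_a = np.arange(12.0).reshape(6, 2), np.arange(6)
    X_b, y_b = X_a.copy(), y_a.copy()   # equal content, distinct objects
    X_c, y_c = X_a[1:], y_a[1:]         # different content

    same = {joblib.hash((X, y)) for X, y in [(X_a, y_a), (X_b, y_b)]}
    other = {joblib.hash((X, y)) for X, y in [(X_a, y_a), (X_c, y_c)]}
    assert len(same) == 1    # a single hash: the reports share the same test data
    assert len(other) == 2   # several hashes: the test data differ

Because the hash is content-based, copies of the same arrays compare equal while any modification of the data changes the key.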
+ + def _get_help_legend(self): + return ( + "[cyan](↗︎)[/cyan] higher is better [orange1](↘︎)[/orange1] lower is better" + ) + + def __repr__(self): + """Return a string representation.""" + return f"{self.__class__.__name__}(...)" diff --git a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py index 78807b280..87582939b 100644 --- a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py +++ b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py @@ -137,16 +137,15 @@ def _compute_metric_scores( cache_key = (self._parent._hash, report_metric_name, data_source) cache_key += (aggregate,) if aggregate is None else tuple(aggregate) - if metric_kwargs != {}: - # we need to enforce the order of the parameter for a specific metric - # to make sure that we hit the cache in a consistent way - ordered_metric_kwargs = sorted(metric_kwargs.keys()) - - for key in ordered_metric_kwargs: - if isinstance(metric_kwargs[key], (np.ndarray, list, dict)): - cache_key += (joblib.hash(metric_kwargs[key]),) - else: - cache_key += (metric_kwargs[key],) + # we need to enforce the order of the parameter for a specific metric + # to make sure that we hit the cache in a consistent way + ordered_metric_kwargs = sorted(metric_kwargs.keys()) + + for key in ordered_metric_kwargs: + if isinstance(metric_kwargs[key], (np.ndarray, list, dict)): + cache_key += (joblib.hash(metric_kwargs[key]),) + else: + cache_key += (metric_kwargs[key],) progress = self._progress_info["current_progress"] main_task = self._progress_info["current_task"] diff --git a/skore/src/skore/sklearn/_cross_validation/report.py b/skore/src/skore/sklearn/_cross_validation/report.py index a767e2104..978a4a96b 100644 --- a/skore/src/skore/sklearn/_cross_validation/report.py +++ b/skore/src/skore/sklearn/_cross_validation/report.py @@ -84,13 +84,12 @@ class CrossValidationReport(_BaseReport, DirNamesMixin): See Also -------- - skore.sklearn.estimator.report.EstimatorReport : + skore.EstimatorReport Report for a fitted estimator. 
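The cache-key hunks in this patch (the cross-validation one above and the estimator one just below) always fold `metric_kwargs` into the key in a deterministic order, hashing values that are not hashable themselves. A rough, self-contained sketch of that pattern, using a hypothetical helper name:

    # Sketch: fold keyword arguments into a deterministic, hashable cache key.
    import joblib
    import numpy as np

    def key_with_kwargs(base_key, metric_kwargs):
        key = tuple(base_key)
        # Sorting the names makes {"a": 1, "b": 2} and {"b": 2, "a": 1} equivalent.
        for name in sorted(metric_kwargs):
            value = metric_kwargs[name]
            if isinstance(value, (np.ndarray, list, dict)):
                key += (joblib.hash(value),)  # unhashable values become a stable digest
            else:
                key += (value,)
        return key

    k1 = key_with_kwargs(("rmse", "test"), {"multioutput": "raw_values"})
    k2 = key_with_kwargs(("rmse", "test"), {"multioutput": "raw_values"})
    assert k1 == k2  # identical kwargs always hit the same cache entry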
Examples -------- >>> from sklearn.datasets import make_classification - >>> from sklearn.model_selection import train_test_split >>> from sklearn.linear_model import LogisticRegression >>> X, y = make_classification(random_state=42) >>> estimator = LogisticRegression() diff --git a/skore/src/skore/sklearn/_estimator/metrics_accessor.py b/skore/src/skore/sklearn/_estimator/metrics_accessor.py index d39f9157e..546d67d6f 100644 --- a/skore/src/skore/sklearn/_estimator/metrics_accessor.py +++ b/skore/src/skore/sklearn/_estimator/metrics_accessor.py @@ -363,18 +363,18 @@ def _compute_metric_scores( metric_params = inspect.signature(metric_fn).parameters if "pos_label" in metric_params: cache_key += (pos_label,) - if metric_kwargs != {}: - # we need to enforce the order of the parameter for a specific metric - # to make sure that we hit the cache in a consistent way - ordered_metric_kwargs = sorted(metric_kwargs.keys()) - cache_key += tuple( - ( - joblib.hash(metric_kwargs[key]) - if isinstance(metric_kwargs[key], np.ndarray) - else metric_kwargs[key] - ) - for key in ordered_metric_kwargs + + # we need to enforce the order of the parameter for a specific metric + # to make sure that we hit the cache in a consistent way + ordered_metric_kwargs = sorted(metric_kwargs.keys()) + cache_key += tuple( + ( + joblib.hash(metric_kwargs[key]) + if isinstance(metric_kwargs[key], np.ndarray) + else metric_kwargs[key] ) + for key in ordered_metric_kwargs + ) if cache_key in self._parent._cache: score = self._parent._cache[cache_key] diff --git a/skore/tests/unit/sklearn/test_comparison.py b/skore/tests/unit/sklearn/test_comparison.py new file mode 100644 index 000000000..622d06fd4 --- /dev/null +++ b/skore/tests/unit/sklearn/test_comparison.py @@ -0,0 +1,536 @@ +import re +from io import BytesIO + +import joblib +import pandas as pd +import pytest +from sklearn.datasets import make_classification +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.model_selection import train_test_split +from skore import ComparisonReport, EstimatorReport + + +@pytest.fixture +def binary_classification_model(): + """Create a binary classification dataset and return fitted estimator and data.""" + X, y = make_classification(random_state=42) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + + return LogisticRegression(random_state=42), X_train, X_test, y_train, y_test + + +@pytest.fixture +def regression_model(): + """Create a binary classification dataset and return fitted estimator and data.""" + X, y = make_classification(random_state=42) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + + return LinearRegression(), X_train, X_test, y_train, y_test + + +def test_comparison_report_init_wrong_parameters(binary_classification_model): + """If the input is not valid, raise.""" + + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + with pytest.raises(TypeError, match="Expected reports to be an iterable"): + ComparisonReport(estimator_report) + + with pytest.raises( + ValueError, match="At least 2 instances of EstimatorReport are needed" + ): + ComparisonReport([estimator_report]) + + with pytest.raises(TypeError, match="Expected instances of EstimatorReport"): + ComparisonReport([None, estimator_report]) + + +def test_comparison_report_without_testing_data(binary_classification_model): + """If there is no test data (`None`) 
for some estimator report, + initialization works, but computing metrics can fail. + """ + estimator, _, _, _, _ = binary_classification_model + estimator_report = EstimatorReport(estimator, fit=False) + + report = ComparisonReport([estimator_report, estimator_report]) + + with pytest.raises(ValueError, match="No test data"): + report.metrics.report_metrics(data_source="test") + + +def test_comparison_report_different_test_data(binary_classification_model): + """Raise an error if the passed estimators do not have the same testing data.""" + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator.fit(X_train, y_train) + + # The estimators that have testing data, need to have the same testing data + # The estimators that do not have testing data do not count + with pytest.raises( + ValueError, match="Expected all estimators to have the same testing data" + ): + ComparisonReport( + [ + EstimatorReport(estimator, X_test=X_test, y_test=y_test), + EstimatorReport(estimator, X_test=X_test[1:], y_test=y_test[1:]), + ] + ) + + # The estimators without testing data (i.e. no X_test and no y_test) do not count + ComparisonReport( + [ + EstimatorReport(estimator, X_test=X_test, y_test=y_test), + EstimatorReport(estimator, X_test=X_test, y_test=y_test), + EstimatorReport(estimator), + ] + ) + + # If there is an X_test but no y_test, it counts + with pytest.raises( + ValueError, match="Expected all estimators to have the same testing data" + ): + ComparisonReport( + [ + EstimatorReport(estimator, fit=False, X_test=X_test, y_test=y_test), + EstimatorReport(estimator, fit=False, X_test=X_test), + ] + ) + + +def test_comparison_report_init_different_ml_usecases( + binary_classification_model, regression_model +): + """Raise an error if the passed estimators do not have the same ML usecase.""" + linear_regression_estimator, _, X_test, _, y_test = regression_model + linear_regression_report = EstimatorReport( + linear_regression_estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + logistic_regression_estimator, _, X_test, _, y_test = binary_classification_model + logistic_regression_report = EstimatorReport( + logistic_regression_estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + with pytest.raises( + ValueError, match="Expected all estimators to have the same ML usecase" + ): + ComparisonReport([linear_regression_report, logistic_regression_report]) + + +def test_comparison_report_init_with_report_names(binary_classification_model): + """If the estimators are passed as a dict, + then the estimator names are the dict keys.""" + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport({"r1": estimator_report, "r2": estimator_report}) + + pd.testing.assert_index_equal( + comp.metrics.accuracy().columns, + pd.Index(["r1", "r2"], name="Estimator"), + ) + + +def test_comparison_report_init_without_report_names(binary_classification_model): + """If the estimators are passed as a list, + then the estimator names are the estimator class names.""" + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport([estimator_report, estimator_report]) + + pd.testing.assert_index_equal( + comp.metrics.accuracy().columns, + 
pd.Index(["LogisticRegression", "LogisticRegression"], name="Estimator"), + ) + + +def test_comparison_report_non_string_report_names(binary_classification_model): + """If the estimators are passed as a dict with non-string keys, + then the estimator names are the dict keys converted to strings.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + report = ComparisonReport({0: estimator_report, "1": estimator_report}) + assert report.report_names_ == ["0", "1"] + + +def test_comparison_report_help(capsys, binary_classification_model): + """Check the help menu works.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + ComparisonReport([estimator_report, estimator_report]).help() + + captured = capsys.readouterr() + assert "Tools to compare estimators" in captured.out + + # Check that we have a line with accuracy and the arrow associated with it + assert re.search( + r"\.accuracy\([^)]*\).*\(↗︎\).*-.*accuracy", captured.out, re.MULTILINE + ) + + +def test_comparison_report_repr(binary_classification_model): + """Check the `__repr__` works.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + repr_str = repr(ComparisonReport([estimator_report, estimator_report])) + + assert "ComparisonReport" in repr_str + + +def test_comparison_report_pickle(tmp_path, binary_classification_model): + """Check that we can pickle a comparison report.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + with BytesIO() as stream: + joblib.dump(ComparisonReport([estimator_report, estimator_report]), stream) + + +def test_comparison_report_metrics_help(capsys, binary_classification_model): + """Check that the help method writes to the console.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + report = ComparisonReport([estimator_report, estimator_report]) + + report.metrics.help() + captured = capsys.readouterr() + assert "Available metrics methods" in captured.out + + +def test_comparison_report_metrics_repr(binary_classification_model): + """Check the repr method.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + report = ComparisonReport([estimator_report, estimator_report]) + + repr_str = repr(report.metrics) + assert "skore.ComparisonReport.metrics" in repr_str + assert "report.metrics.help()" in repr_str + + +@pytest.mark.parametrize("data_source", ["test", "X_y"]) +@pytest.mark.parametrize( + "metric_name, expected", + [ + ( + "accuracy", + pd.DataFrame( + [[1.0, 1.0]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.Index(["Accuracy (↗︎)"], dtype="object", name="Metric"), + ), + ), + ( + "precision", + pd.DataFrame( + [[1.0, 1.0], [1.0, 1.0]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.MultiIndex.from_tuples( + [("Precision (↗︎)", 0), ("Precision (↗︎)", 1)], + names=["Metric", "Label 
/ Average"], + ), + ), + ), + ( + "recall", + pd.DataFrame( + [[1.0, 1.0], [1.0, 1.0]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.MultiIndex.from_tuples( + [("Recall (↗︎)", 0), ("Recall (↗︎)", 1)], + names=["Metric", "Label / Average"], + ), + ), + ), + ( + "brier_score", + pd.DataFrame( + [[0.026684, 0.026684]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.Index(["Brier score (↘︎)"], dtype="object", name="Metric"), + ), + ), + ( + "roc_auc", + pd.DataFrame( + [[1.0, 1.0]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.Index(["ROC AUC (↗︎)"], dtype="object", name="Metric"), + ), + ), + ( + "log_loss", + pd.DataFrame( + [[0.113233, 0.113233]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.Index(["Log loss (↘︎)"], dtype="object", name="Metric"), + ), + ), + ], +) +def test_comparison_report_metrics_binary_classification( + metric_name, expected, data_source, binary_classification_model +): + """Check the metrics work.""" + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport([estimator_report, estimator_report]) + + # ensure metric is valid + if data_source == "X_y": + result = getattr(comp.metrics, metric_name)( + data_source=data_source, X=X_test, y=y_test + ) + else: + result = getattr(comp.metrics, metric_name)(data_source=data_source) + pd.testing.assert_frame_equal(result, expected) + + # ensure metric is valid even from the cache + if data_source == "X_y": + result = getattr(comp.metrics, metric_name)( + data_source=data_source, X=X_test, y=y_test + ) + else: + result = getattr(comp.metrics, metric_name)(data_source=data_source) + pd.testing.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data_source", ["test", "X_y"]) +@pytest.mark.parametrize( + "metric_name, expected", + [ + ( + "rmse", + pd.DataFrame( + [[0.27699, 0.27699]], + columns=pd.Index( + ["LinearRegression", "LinearRegression"], + name="Estimator", + ), + index=pd.Index(["RMSE (↘︎)"], dtype="object", name="Metric"), + ), + ), + ( + "r2", + pd.DataFrame( + [[0.680319, 0.680319]], + columns=pd.Index( + ["LinearRegression", "LinearRegression"], + name="Estimator", + ), + index=pd.Index(["R² (↗︎)"], dtype="object", name="Metric"), + ), + ), + ], +) +def test_comparison_report_metrics_linear_regression( + metric_name, expected, data_source, regression_model +): + """Check the metrics work.""" + estimator, X_train, X_test, y_train, y_test = regression_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport([estimator_report, estimator_report]) + + # ensure metric is valid + if data_source == "X_y": + result = getattr(comp.metrics, metric_name)( + data_source=data_source, X=X_test, y=y_test + ) + else: + result = getattr(comp.metrics, metric_name)() + pd.testing.assert_frame_equal(result, expected) + + # ensure metric is valid even from the cache + if data_source == "X_y": + result = getattr(comp.metrics, metric_name)( + data_source=data_source, X=X_test, y=y_test + ) + else: + result = getattr(comp.metrics, metric_name)() + pd.testing.assert_frame_equal(result, expected) + + +def 
test_comparison_report_report_metrics_X_y(binary_classification_model): + """Check that `report_metrics` works with an "X_y" data source.""" + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport([estimator_report, estimator_report]) + + result = comp.metrics.report_metrics( + data_source="X_y", + X=X_train[:10], + y=y_train[:10], + ) + + expected = pd.DataFrame( + [ + [1.0, 1.0], + [1.0, 1.0], + [1.0, 1.0], + [1.0, 1.0], + [1.0, 1.0], + [0.01514976, 0.01514976], + ], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.MultiIndex.from_tuples( + [ + ("Precision (↗︎)", 0), + ("Precision (↗︎)", 1), + ("Recall (↗︎)", 0), + ("Recall (↗︎)", 1), + ("ROC AUC (↗︎)", ""), + ("Brier score (↘︎)", ""), + ], + names=["Metric", "Label / Average"], + ), + ) + pd.testing.assert_frame_equal(result, expected) + + assert len(comp._cache) == 1 + cached_result = list(comp._cache.values())[0] + pd.testing.assert_frame_equal(cached_result, expected) + + +def test_comparison_report_custom_metric_X_y(binary_classification_model): + """Check that `custom_metric` works with an "X_y" data source.""" + from sklearn.metrics import mean_absolute_error + + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport([estimator_report, estimator_report]) + + expected = pd.DataFrame( + [[0.0, 0.0]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], name="Estimator" + ), + index=pd.Index(["MAE (↗︎)"], name="Metric"), + ) + + # ensure metric is valid + result = comp.metrics.custom_metric( + metric_function=mean_absolute_error, + response_method="predict", + metric_name="MAE (↗︎)", + data_source="X_y", + X=X_test, + y=y_test, + ) + pd.testing.assert_frame_equal(result, expected) + + # ensure metric is valid even from the cache + result = comp.metrics.custom_metric( + metric_function=mean_absolute_error, + response_method="predict", + metric_name="MAE (↗︎)", + data_source="X_y", + X=X_test, + y=y_test, + ) + pd.testing.assert_frame_equal(result, expected) diff --git a/skore/tests/unit/sklearn/test_estimator.py b/skore/tests/unit/sklearn/test_estimator.py index 4fc422b40..35877b7e3 100644 --- a/skore/tests/unit/sklearn/test_estimator.py +++ b/skore/tests/unit/sklearn/test_estimator.py @@ -1,5 +1,6 @@ import re from copy import deepcopy +from io import BytesIO from numbers import Real import joblib @@ -335,7 +336,7 @@ def test_estimator_report_cache_predictions( assert report._cache.keys() == stored_cache.keys() -def test_estimator_report_pickle(tmp_path, binary_classification_data): +def test_estimator_report_pickle(binary_classification_data): """Check that we can pickle an estimator report. 
In particular, the progress bar from rich are pickable, therefore we trigger @@ -344,7 +345,9 @@ def test_estimator_report_pickle(tmp_path, binary_classification_data): estimator, X_test, y_test = binary_classification_data report = EstimatorReport(estimator, X_test=X_test, y_test=y_test) report.cache_predictions() - joblib.dump(report, tmp_path / "report.joblib") + + with BytesIO() as stream: + joblib.dump(report, stream) def test_estimator_report_flat_index(binary_classification_data): diff --git a/sphinx/api/skore.config_context.rst b/sphinx/api/skore.config_context.rst new file mode 100644 index 000000000..b64a9f9d9 --- /dev/null +++ b/sphinx/api/skore.config_context.rst @@ -0,0 +1,10 @@ +config\_context +=============== + +.. currentmodule:: skore + +.. autofunction:: config_context + +.. minigallery:: skore.config_context + :add-heading: Gallery examples + :heading-level: - \ No newline at end of file diff --git a/sphinx/api/skore.get_config.rst b/sphinx/api/skore.get_config.rst new file mode 100644 index 000000000..0f76d783f --- /dev/null +++ b/sphinx/api/skore.get_config.rst @@ -0,0 +1,10 @@ +get\_config +=========== + +.. currentmodule:: skore + +.. autofunction:: get_config + +.. minigallery:: skore.get_config + :add-heading: Gallery examples + :heading-level: - \ No newline at end of file diff --git a/sphinx/api/skore.set_config.rst b/sphinx/api/skore.set_config.rst new file mode 100644 index 000000000..de1045f07 --- /dev/null +++ b/sphinx/api/skore.set_config.rst @@ -0,0 +1,10 @@ +set\_config +=========== + +.. currentmodule:: skore + +.. autofunction:: set_config + +.. minigallery:: skore.set_config + :add-heading: Gallery examples + :heading-level: - \ No newline at end of file diff --git a/sphinx/index.rst b/sphinx/index.rst index d09f653ec..56a592a80 100644 --- a/sphinx/index.rst +++ b/sphinx/index.rst @@ -41,9 +41,11 @@ Key features All these are computed and generated for you in 1 line of code. Under the hood, we use efficient caching to make the computations blazing fast. - - :class:`skore.CrossValidationReport`: Get a skore estimator report for each fold + - :class:`skore.CrossValidationReport`: get a skore estimator report for each fold of your cross-validation. + - :class:`skore.ComparisonReport`: benchmark your skore estimator reports. + What's next? """""""""""" diff --git a/sphinx/reference/report/comparison_report.rst b/sphinx/reference/report/comparison_report.rst new file mode 100644 index 000000000..7158cc5e1 --- /dev/null +++ b/sphinx/reference/report/comparison_report.rst @@ -0,0 +1,50 @@ +Report for a comparison of :class:`EstimatorReport` +=================================================== + +.. currentmodule:: skore + +The class :class:`ComparisonReport` provides a report allowing to compare :class:`EstimatorReport` instances in an interactive way. The functionalities of the report are accessible through accessors. + +.. autosummary:: + :toctree: ../api/ + :template: base.rst + + ComparisonReport + +.. autosummary:: + :toctree: ../api/ + :nosignatures: + :template: autosummary/accessor_method.rst + + ComparisonReport.help + +.. autosummary:: + :toctree: ../api/ + :nosignatures: + :template: autosummary/accessor.rst + + ComparisonReport.metrics + +Metrics +------- + +The `metrics` accessor helps you to evaluate the statistical performance of the +compared estimators. In addition, we provide a sub-accessor `plot`, to +get the common performance metric representations. + +.. 
autosummary:: + :toctree: ../api/ + :nosignatures: + :template: autosummary/accessor_method.rst + + ComparisonReport.metrics.help + ComparisonReport.metrics.report_metrics + ComparisonReport.metrics.custom_metric + ComparisonReport.metrics.accuracy + ComparisonReport.metrics.brier_score + ComparisonReport.metrics.log_loss + ComparisonReport.metrics.precision + ComparisonReport.metrics.r2 + ComparisonReport.metrics.recall + ComparisonReport.metrics.rmse + ComparisonReport.metrics.roc_auc diff --git a/sphinx/reference/report/index.rst b/sphinx/reference/report/index.rst index 2a87d824e..17fb6330a 100644 --- a/sphinx/reference/report/index.rst +++ b/sphinx/reference/report/index.rst @@ -42,6 +42,18 @@ scikit-learn estimators by cross-validation, and reporting the results. cross_validation_report +Comparison Report +----------------------- + +:class:`skore.ComparisonReport` provides comprehensive capabilities for comparing +:class:`skore.EstimatorReport` instances, and reporting the results. + +.. toctree:: + :maxdepth: 2 + :hidden: + + comparison_report + Visualization Displays ---------------------- From a763269f2a35123078abd4b6da69a2f56d684a1b Mon Sep 17 00:00:00 2001 From: Auguste Baum Date: Wed, 19 Feb 2025 16:22:06 +0100 Subject: [PATCH 4/9] feat(project): Add parameter to `Project.clear` to delete project (#1322) `project.clear(delete_project=True)` now deletes the entire project, while `project.clear(delete_project=False)` (the default) removes every item from the project. Closes #1294 --- skore/src/skore/project/project.py | 60 +++++++++++++++++++++++- skore/tests/conftest.py | 1 + skore/tests/unit/project/test_project.py | 16 +++++++ 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/skore/src/skore/project/project.py b/skore/src/skore/project/project.py index 4ae6b2445..ebd731e30 100644 --- a/skore/src/skore/project/project.py +++ b/skore/src/skore/project/project.py @@ -2,6 +2,8 @@ from __future__ import annotations +import functools +import shutil from collections.abc import Iterator from logging import INFO, NullHandler, getLogger from pathlib import Path @@ -16,6 +18,30 @@ logger.setLevel(INFO) +class ProjectDeletedError(Exception): + """A method of a Project was called but the Project is marked as deleted.""" + + +def _raise_if_deleted(method): + """Raise if the underlying Project has been deleted, otherwise execute `method`. + + This wrapper makes it safe to "delete" a Project, even if the Project instance + still exists. + """ + + @functools.wraps(method) + def wrapper(self, *args, **kwargs): + if self._storage_initialized is not True: + raise ProjectDeletedError( + "This Project instance is marked as deleted. " + "Please re-create a Project and discard the current one." + ) + + return method(self, *args, **kwargs) + + return wrapper + + class Project: """ A collection of items persisted in a storage. @@ -95,16 +121,38 @@ def __init__( # Initialize repositories with dedicated storages self._item_repository = ItemRepository(DiskCacheStorage(item_storage_dirpath)) + self._storage_initialized = True + # Check if the project should rejoin a server from skore.project._launch import ServerInfo # avoid circular import self._server_info = ServerInfo.rejoin(self) - def clear(self): - """Clear the project.""" + @_raise_if_deleted + def clear(self, delete_project=False): + """Remove all items from the project. + + .. warning:: + Clearing the project with `delete_project=True` will invalidate the whole + `Project` instance, making it unusable. 
+ A new Project instance can be created using the :class:`skore.Project` + constructor or the :func:`skore.open` function. + + Parameters + ---------- + delete_project : bool + If set, the project will be deleted entirely. + """ + if delete_project: + self._storage_initialized = False + del self._item_repository + shutil.rmtree(self.path) + return + for item_key in self._item_repository: self._item_repository.delete_item(item_key) + @_raise_if_deleted def put( self, key: str, @@ -150,6 +198,7 @@ def put( ), ) + @_raise_if_deleted def get( self, key: str, @@ -211,6 +260,7 @@ def dto(item): raise ValueError('`version` should be -1, "all", or an integer') + @_raise_if_deleted def keys(self) -> list[str]: """ Get all keys of items stored in the project. @@ -222,6 +272,7 @@ def keys(self) -> list[str]: """ return self._item_repository.keys() + @_raise_if_deleted def __iter__(self) -> Iterator[str]: """ Yield the keys of items stored in the project. @@ -233,6 +284,7 @@ def __iter__(self) -> Iterator[str]: """ yield from self._item_repository + @_raise_if_deleted def delete(self, key: str): """Delete the item corresponding to ``key`` from the Project. @@ -248,6 +300,7 @@ def delete(self, key: str): """ self._item_repository.delete_item(key) + @_raise_if_deleted def set_note(self, key: str, note: str, *, version=-1): """Attach a note to key ``key``. @@ -277,6 +330,7 @@ def set_note(self, key: str, note: str, *, version=-1): """ return self._item_repository.set_item_note(key=key, note=note, version=version) + @_raise_if_deleted def get_note(self, key: str, *, version=-1) -> Union[str, None]: """Retrieve a note previously attached to key ``key``. @@ -306,6 +360,7 @@ def get_note(self, key: str, *, version=-1) -> Union[str, None]: """ return self._item_repository.get_item_note(key=key, version=version) + @_raise_if_deleted def delete_note(self, key: str, *, version=-1): """Delete a note previously attached to key ``key``. 
@@ -333,6 +388,7 @@ def delete_note(self, key: str, *, version=-1): """ return self._item_repository.delete_item_note(key=key, version=version) + @_raise_if_deleted def shutdown_web_ui(self): """Shutdown the web UI server if it is running.""" if self._server_info is None: diff --git a/skore/tests/conftest.py b/skore/tests/conftest.py index bfcce2619..a7e00c375 100644 --- a/skore/tests/conftest.py +++ b/skore/tests/conftest.py @@ -43,6 +43,7 @@ def in_memory_project(monkeypatch): project.path = None project.name = "test" project._item_repository = ItemRepository(storage=InMemoryStorage()) + project._storage_initialized = True return project diff --git a/skore/tests/unit/project/test_project.py b/skore/tests/unit/project/test_project.py index 145a642a2..c5c420259 100644 --- a/skore/tests/unit/project/test_project.py +++ b/skore/tests/unit/project/test_project.py @@ -12,6 +12,7 @@ from PIL import Image from sklearn.ensemble import RandomForestClassifier from skore import Project +from skore.project.project import ProjectDeletedError @pytest.fixture(autouse=True) @@ -49,6 +50,21 @@ def test_clear(tmp_path): assert project.keys() == [] assert project._item_repository.keys() == [] + assert dirpath.exists() + + +def test_clear_delete_project(tmp_path): + dirpath = tmp_path / "my-project.skore" + project = Project(dirpath) + + project.clear(delete_project=True) + assert not dirpath.exists() + + with pytest.raises( + ProjectDeletedError, match="This Project instance is marked as deleted" + ): + project.keys() + def test_put_string_item(in_memory_project): in_memory_project.put("string_item", "Hello, World!") From 5150886230bfbd25551795eff5b18aac3b6a14ac Mon Sep 17 00:00:00 2001 From: Auguste Baum Date: Thu, 20 Feb 2025 09:12:24 +0100 Subject: [PATCH 5/9] feat: Add cache_predictions method to ComparisonReport (#1352) Closes #1346 --- skore/src/skore/sklearn/_comparison/report.py | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/skore/src/skore/sklearn/_comparison/report.py b/skore/src/skore/sklearn/_comparison/report.py index ac4c3a30f..68dd58803 100644 --- a/skore/src/skore/sklearn/_comparison/report.py +++ b/skore/src/skore/sklearn/_comparison/report.py @@ -10,6 +10,7 @@ from skore.externals._pandas_accessors import DirNamesMixin from skore.sklearn._base import _BaseReport from skore.sklearn._estimator.report import EstimatorReport +from skore.utils._progress_bar import progress_decorator class ComparisonReport(_BaseReport, DirNamesMixin): @@ -144,6 +145,9 @@ def __init__( self.estimator_reports_ = reports + # used to know if a parent launches a progress bar manager + self._parent_progress = None + # NEEDED FOR METRICS ACCESSOR self.n_jobs = n_jobs self._rng = np.random.default_rng(time.time_ns()) @@ -153,6 +157,103 @@ def __init__( self._cache = {} self._ml_task = self.estimator_reports_[0]._ml_task + def clear_cache(self): + """Clear the cache. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport + >>> X, y = make_classification(random_state=42) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression() + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test + ... 
)
+        >>> estimator_2 = LogisticRegression(C=2)  # Different regularization
+        >>> estimator_report_2 = EstimatorReport(
+        ...     estimator_2,
+        ...     X_train=X_train,
+        ...     y_train=y_train,
+        ...     X_test=X_test,
+        ...     y_test=y_test
+        ... )
+        >>> report = ComparisonReport([estimator_report_1, estimator_report_2])
+        >>> report.cache_predictions()
+        >>> report.clear_cache()
+        >>> report._cache
+        {}
+        """
+        for report in self.estimator_reports_:
+            report.clear_cache()
+        self._cache = {}
+
+    @progress_decorator(description="Estimator predictions")
+    def cache_predictions(self, response_methods="auto", n_jobs=None):
+        """Cache the predictions for sub-estimator reports.
+
+        Parameters
+        ----------
+        response_methods : {"auto", "predict", "predict_proba", "decision_function"},\
+                default="auto"
+            The methods to use to compute the predictions.
+
+        n_jobs : int, default=None
+            The number of jobs to run in parallel. If `None`, we use the `n_jobs`
+            parameter when initializing the report.
+
+        Examples
+        --------
+        >>> from sklearn.datasets import make_classification
+        >>> from sklearn.linear_model import LogisticRegression
+        >>> from sklearn.model_selection import train_test_split
+        >>> from skore import ComparisonReport, EstimatorReport
+        >>> X, y = make_classification(random_state=42)
+        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+        >>> estimator_1 = LogisticRegression()
+        >>> estimator_report_1 = EstimatorReport(
+        ...     estimator_1,
+        ...     X_train=X_train,
+        ...     y_train=y_train,
+        ...     X_test=X_test,
+        ...     y_test=y_test
+        ... )
+        >>> estimator_2 = LogisticRegression(C=2)  # Different regularization
+        >>> estimator_report_2 = EstimatorReport(
+        ...     estimator_2,
+        ...     X_train=X_train,
+        ...     y_train=y_train,
+        ...     X_test=X_test,
+        ...     y_test=y_test
+        ... )
+        >>> report = ComparisonReport([estimator_report_1, estimator_report_2])
+        >>> report.cache_predictions()
+        >>> report._cache
+        {...}
+        """
+        if n_jobs is None:
+            n_jobs = self.n_jobs
+
+        progress = self._progress_info["current_progress"]
+        main_task = self._progress_info["current_task"]
+
+        total_estimators = len(self.estimator_reports_)
+        progress.update(main_task, total=total_estimators)
+
+        for estimator_report in self.estimator_reports_:
+            # Pass the progress manager to child tasks
+            estimator_report._parent_progress = progress
+            estimator_report.cache_predictions(
+                response_methods=response_methods, n_jobs=n_jobs
+            )
+            progress.update(main_task, advance=1, refresh=True)
+
     ####################################################################################
     # Methods related to the help and repr
     ####################################################################################

From 983106b56bacf292593e91cc350cf196a584776b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 20 Feb 2025 13:15:02 +0100
Subject: [PATCH 6/9] add flat_index to comparison report

---
 .../sklearn/_comparison/metrics_accessor.py | 19 ++++++++++---
 skore/tests/unit/sklearn/test_comparison.py | 28 +++++++++++++++++++
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/skore/src/skore/sklearn/_comparison/metrics_accessor.py b/skore/src/skore/sklearn/_comparison/metrics_accessor.py
index 7f07a3d0c..7ed812d76 100644
--- a/skore/src/skore/sklearn/_comparison/metrics_accessor.py
+++ b/skore/src/skore/sklearn/_comparison/metrics_accessor.py
@@ -7,6 +7,7 @@
 from skore.externals._pandas_accessors import DirNamesMixin
 from skore.sklearn._base import _BaseAccessor
 from skore.utils._accessor import _check_supported_ml_task
+from skore.utils._index import
flatten_multi_index from skore.utils._progress_bar import progress_decorator @@ -42,8 +43,9 @@ def report_metrics( y=None, scoring=None, scoring_names=None, - pos_label=None, scoring_kwargs=None, + pos_label=None, + flat_index=False, ): """Report a set of metrics for the estimators. @@ -77,11 +79,14 @@ def report_metrics( Used to overwrite the default scoring names in the report. It should be of the same length as the ``scoring`` parameter. + scoring_kwargs : dict, default=None + The keyword arguments to pass to the scoring functions. + pos_label : int, float, bool or str, default=None The positive class. - scoring_kwargs : dict, default=None - The keyword arguments to pass to the scoring functions. + flat_index : bool, default=False + Whether to flatten the `MultiIndex` columns. Returns ------- @@ -124,7 +129,7 @@ def report_metrics( Precision (↗︎) 0.96... 0.96... Recall (↗︎) 0.97... 0.97... """ - return self._compute_metric_scores( + results = self._compute_metric_scores( report_metric_name="report_metrics", data_source=data_source, X=X, @@ -134,6 +139,12 @@ def report_metrics( scoring_kwargs=scoring_kwargs, scoring_names=scoring_names, ) + if flat_index: + if isinstance(results.columns, pd.MultiIndex): + results.columns = flatten_multi_index(results.columns) + if isinstance(results.index, pd.MultiIndex): + results.index = flatten_multi_index(results.index) + return results @progress_decorator(description="Compute metric for each split") def _compute_metric_scores( diff --git a/skore/tests/unit/sklearn/test_comparison.py b/skore/tests/unit/sklearn/test_comparison.py index 622d06fd4..a1786e784 100644 --- a/skore/tests/unit/sklearn/test_comparison.py +++ b/skore/tests/unit/sklearn/test_comparison.py @@ -534,3 +534,31 @@ def test_comparison_report_custom_metric_X_y(binary_classification_model): y=y_test, ) pd.testing.assert_frame_equal(result, expected) + + +def test_cross_validation_report_flat_index(binary_classification_model): + """Check that the index is flattened when `flat_index` is True. + + Since `pos_label` is None, then by default a MultiIndex would be returned. + Here, we force to have a single-index by passing `flat_index=True`. 
+ """ + estimator, X_train, X_test, y_train, y_test = binary_classification_model + report_1 = EstimatorReport( + estimator, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test + ) + report_2 = EstimatorReport( + estimator, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test + ) + report = ComparisonReport({"report_1": report_1, "report_2": report_2}) + result = report.metrics.report_metrics(flat_index=True) + assert result.shape == (6, 2) + assert isinstance(result.index, pd.Index) + assert result.index.tolist() == [ + "Precision (↗︎)_0", + "Precision (↗︎)_1", + "Recall (↗︎)_0", + "Recall (↗︎)_1", + "ROC AUC (↗︎)", + "Brier score (↘︎)", + ] + assert result.columns.tolist() == ["report_1", "report_2"] From 45fc8c5b8a903544fd640901b81d64f15baf9dff Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Feb 2025 13:18:50 +0100 Subject: [PATCH 7/9] new rule for space and # --- skore/src/skore/utils/_index.py | 10 ++++++++-- skore/tests/unit/utils/test_index.py | 14 +++++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/skore/src/skore/utils/_index.py b/skore/src/skore/utils/_index.py index 1c4e4ed57..92e6ea81a 100644 --- a/skore/src/skore/utils/_index.py +++ b/skore/src/skore/utils/_index.py @@ -5,7 +5,8 @@ def flatten_multi_index(index: pd.MultiIndex) -> pd.Index: """Flatten a pandas MultiIndex into a single-level Index. Flatten a pandas `MultiIndex` into a single-level Index by joining the levels - with underscores. Empty strings are skipped when joining. + with underscores. Empty strings are skipped when joining. Spaces are replaced by + an underscore and "#" are skipped. Parameters ---------- @@ -29,4 +30,9 @@ def flatten_multi_index(index: pd.MultiIndex) -> pd.Index: if not isinstance(index, pd.MultiIndex): raise ValueError("`index` must be a MultiIndex.") - return pd.Index(["_".join(filter(bool, map(str, values))) for values in index]) + return pd.Index( + [ + "_".join(filter(bool, map(str, values))).replace(" ", "_").replace("#", "") + for values in index + ] + ) diff --git a/skore/tests/unit/utils/test_index.py b/skore/tests/unit/utils/test_index.py index 3969cb593..60e576f7f 100644 --- a/skore/tests/unit/utils/test_index.py +++ b/skore/tests/unit/utils/test_index.py @@ -24,10 +24,22 @@ pytest.param( [("a@b", "1#2"), ("c&d", "3$4")], ["letter", "number"], - ["a@b_1#2", "c&d_3$4"], + ["a@b_12", "c&d_3$4"], id="special_chars", ), pytest.param([], ["letter", "number"], [], id="empty"), + pytest.param( + [("hello world", "a b"), ("space test", "x y")], + ["text", "more"], + ["hello_world_a_b", "space_test_x_y"], + id="spaces", + ), + pytest.param( + [("a#b#c", "1#2#3"), ("x#y", "5#6")], + ["text", "numbers"], + ["abc_123", "xy_56"], + id="hash_symbols", + ), ], ) def test_flatten_multi_index(input_tuples, names, expected_values): From 1ec034bd02e4d79f5ba8d15773bfc0c2147a23b0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Feb 2025 13:26:21 +0100 Subject: [PATCH 8/9] update tests and documentation --- .../sklearn/_comparison/metrics_accessor.py | 3 ++- .../_cross_validation/metrics_accessor.py | 3 ++- .../sklearn/_estimator/metrics_accessor.py | 3 ++- skore/src/skore/utils/_index.py | 5 ++++- skore/tests/unit/sklearn/test_comparison.py | 12 ++++++------ .../unit/sklearn/test_cross_validation.py | 16 ++++++++-------- skore/tests/unit/sklearn/test_estimator.py | 12 ++++++------ skore/tests/unit/utils/test_index.py | 18 ++++++++++++------ 8 files changed, 42 insertions(+), 30 deletions(-) diff --git 
a/skore/src/skore/sklearn/_comparison/metrics_accessor.py b/skore/src/skore/sklearn/_comparison/metrics_accessor.py
index 7ed812d76..6e92a62d0 100644
--- a/skore/src/skore/sklearn/_comparison/metrics_accessor.py
+++ b/skore/src/skore/sklearn/_comparison/metrics_accessor.py
@@ -86,7 +86,8 @@ def report_metrics(
             The positive class.
 
         flat_index : bool, default=False
-            Whether to flatten the `MultiIndex` columns.
+            Whether to flatten the `MultiIndex` columns. The flat index is always
+            lowercase, without spaces or hash symbols, to ease indexing.
 
         Returns
         -------
diff --git a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py
index 87582939b..59b5f6c91 100644
--- a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py
+++ b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py
@@ -85,7 +85,8 @@ def report_metrics(
             Function to aggregate the scores across the cross-validation splits.
 
         flat_index : bool, default=False
-            Whether to flatten the `MultiIndex` columns.
+            Whether to flatten the `MultiIndex` columns. The flat index is always
+            lowercase, without spaces or hash symbols, to ease indexing.
 
         Returns
         -------
diff --git a/skore/src/skore/sklearn/_estimator/metrics_accessor.py b/skore/src/skore/sklearn/_estimator/metrics_accessor.py
index 546d67d6f..9fae10bc6 100644
--- a/skore/src/skore/sklearn/_estimator/metrics_accessor.py
+++ b/skore/src/skore/sklearn/_estimator/metrics_accessor.py
@@ -92,7 +92,8 @@ def report_metrics(
             The positive class.
 
         flat_index : bool, default=False
-            Whether to flatten the multiindex columns.
+            Whether to flatten the multiindex columns. The flat index is always
+            lowercase, without spaces or hash symbols, to ease indexing.
Returns ------- diff --git a/skore/src/skore/utils/_index.py b/skore/src/skore/utils/_index.py index 92e6ea81a..5b6a76f66 100644 --- a/skore/src/skore/utils/_index.py +++ b/skore/src/skore/utils/_index.py @@ -32,7 +32,10 @@ def flatten_multi_index(index: pd.MultiIndex) -> pd.Index: return pd.Index( [ - "_".join(filter(bool, map(str, values))).replace(" ", "_").replace("#", "") + "_".join(filter(bool, map(str, values))) + .replace(" ", "_") + .replace("#", "") + .lower() for values in index ] ) diff --git a/skore/tests/unit/sklearn/test_comparison.py b/skore/tests/unit/sklearn/test_comparison.py index a1786e784..94add3cdf 100644 --- a/skore/tests/unit/sklearn/test_comparison.py +++ b/skore/tests/unit/sklearn/test_comparison.py @@ -554,11 +554,11 @@ def test_cross_validation_report_flat_index(binary_classification_model): assert result.shape == (6, 2) assert isinstance(result.index, pd.Index) assert result.index.tolist() == [ - "Precision (↗︎)_0", - "Precision (↗︎)_1", - "Recall (↗︎)_0", - "Recall (↗︎)_1", - "ROC AUC (↗︎)", - "Brier score (↘︎)", + "precision_(↗︎)_0", + "precision_(↗︎)_1", + "recall_(↗︎)_0", + "recall_(↗︎)_1", + "roc_auc_(↗︎)", + "brier_score_(↘︎)", ] assert result.columns.tolist() == ["report_1", "report_2"] diff --git a/skore/tests/unit/sklearn/test_cross_validation.py b/skore/tests/unit/sklearn/test_cross_validation.py index 532b2e4ac..59f8f7bb2 100644 --- a/skore/tests/unit/sklearn/test_cross_validation.py +++ b/skore/tests/unit/sklearn/test_cross_validation.py @@ -231,16 +231,16 @@ def test_cross_validation_report_flat_index(binary_classification_data): assert result.shape == (6, 2) assert isinstance(result.index, pd.Index) assert result.index.tolist() == [ - "Precision (↗︎)_0", - "Precision (↗︎)_1", - "Recall (↗︎)_0", - "Recall (↗︎)_1", - "ROC AUC (↗︎)", - "Brier score (↘︎)", + "precision_(↗︎)_0", + "precision_(↗︎)_1", + "recall_(↗︎)_0", + "recall_(↗︎)_1", + "roc_auc_(↗︎)", + "brier_score_(↘︎)", ] assert result.columns.tolist() == [ - "RandomForestClassifier_Split #0", - "RandomForestClassifier_Split #1", + "randomforestclassifier_split_0", + "randomforestclassifier_split_1", ] diff --git a/skore/tests/unit/sklearn/test_estimator.py b/skore/tests/unit/sklearn/test_estimator.py index 35877b7e3..949d627f7 100644 --- a/skore/tests/unit/sklearn/test_estimator.py +++ b/skore/tests/unit/sklearn/test_estimator.py @@ -362,12 +362,12 @@ def test_estimator_report_flat_index(binary_classification_data): assert result.shape == (6, 1) assert isinstance(result.index, pd.Index) assert result.index.tolist() == [ - "Precision (↗︎)_0", - "Precision (↗︎)_1", - "Recall (↗︎)_0", - "Recall (↗︎)_1", - "ROC AUC (↗︎)", - "Brier score (↘︎)", + "precision_(↗︎)_0", + "precision_(↗︎)_1", + "recall_(↗︎)_0", + "recall_(↗︎)_1", + "roc_auc_(↗︎)", + "brier_score_(↘︎)", ] assert result.columns.tolist() == ["RandomForestClassifier"] diff --git a/skore/tests/unit/utils/test_index.py b/skore/tests/unit/utils/test_index.py index 60e576f7f..1a305b23b 100644 --- a/skore/tests/unit/utils/test_index.py +++ b/skore/tests/unit/utils/test_index.py @@ -7,39 +7,45 @@ "input_tuples, names, expected_values", [ pytest.param( - [("a", 1), ("b", 2)], ["letter", "number"], ["a_1", "b_2"], id="basic" + [("A", 1), ("B", 2)], ["letter", "number"], ["a_1", "b_2"], id="basic" ), pytest.param( - [("a", 1, "x"), ("b", 2, "y")], + [("A", 1, "X"), ("B", 2, "Y")], ["letter", "number", "symbol"], ["a_1_x", "b_2_y"], id="multiple_levels", ), pytest.param( - [("a", None), (None, 2)], + [("A", None), (None, 2)], ["letter", "number"], 
["a_nan", "nan_2.0"], id="none_values", ), pytest.param( - [("a@b", "1#2"), ("c&d", "3$4")], + [("A@B", "1#2"), ("C&D", "3$4")], ["letter", "number"], ["a@b_12", "c&d_3$4"], id="special_chars", ), pytest.param([], ["letter", "number"], [], id="empty"), pytest.param( - [("hello world", "a b"), ("space test", "x y")], + [("Hello World", "A B"), ("Space Test", "X Y")], ["text", "more"], ["hello_world_a_b", "space_test_x_y"], id="spaces", ), pytest.param( - [("a#b#c", "1#2#3"), ("x#y", "5#6")], + [("A#B#C", "1#2#3"), ("X#Y", "5#6")], ["text", "numbers"], ["abc_123", "xy_56"], id="hash_symbols", ), + pytest.param( + [("UPPER", "CASE"), ("MiXeD", "cAsE")], + ["text", "type"], + ["upper_case", "mixed_case"], + id="case_sensitivity", + ), ], ) def test_flatten_multi_index(input_tuples, names, expected_values): From 7e58ffe9ed3939f2c4ee94d711ead8afcd956057 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Feb 2025 14:40:44 +0100 Subject: [PATCH 9/9] merge conflict --- skore/tests/unit/sklearn/test_cross_validation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/skore/tests/unit/sklearn/test_cross_validation.py b/skore/tests/unit/sklearn/test_cross_validation.py index ecab041b5..c26cae511 100644 --- a/skore/tests/unit/sklearn/test_cross_validation.py +++ b/skore/tests/unit/sklearn/test_cross_validation.py @@ -231,12 +231,12 @@ def test_cross_validation_report_flat_index(binary_classification_data): assert result.shape == (6, 2) assert isinstance(result.index, pd.Index) assert result.index.tolist() == [ - "precision_(↗︎)_0", - "precision_(↗︎)_1", - "recall_(↗︎)_0", - "recall_(↗︎)_1", - "roc_auc_(↗︎)", - "brier_score_(↘︎)", + "precision_0", + "precision_1", + "recall_0", + "recall_1", + "roc_auc", + "brier_score", ] assert result.columns.tolist() == [ "randomforestclassifier_split_0",
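Taken together, the last three patches settle the naming convention these tests encode: index levels joined with underscores, lowercased, spaces replaced by underscores, and `#` dropped. A pandas-only sketch of the resulting labels (illustrative; it re-derives the expected names without going through skore):

    # Sketch: the flattened-label convention the updated tests expect.
    import pandas as pd

    mi = pd.MultiIndex.from_tuples(
        [("RandomForestClassifier", "Split #0"), ("RandomForestClassifier", "Split #1")],
        names=["Estimator", "Split"],
    )
    flat = [
        "_".join(filter(bool, map(str, levels)))
        .replace(" ", "_")
        .replace("#", "")
        .lower()
        for levels in mi
    ]
    assert flat == ["randomforestclassifier_split_0", "randomforestclassifier_split_1"]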