From a4f9014ea9795424039ac66ae0b163a42683103b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Feb 2025 23:34:22 +0100 Subject: [PATCH 1/9] feat(api): Allow to flatten index in reports --- .../_cross_validation/metrics_accessor.py | 21 +++- .../sklearn/_estimator/metrics_accessor.py | 21 +++- skore/src/skore/utils/_index.py | 66 +++++++++++ skore/tests/unit/utils/test_index.py | 108 ++++++++++++++++++ 4 files changed, 206 insertions(+), 10 deletions(-) create mode 100644 skore/src/skore/utils/_index.py create mode 100644 skore/tests/unit/utils/test_index.py diff --git a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py index 99219eca8..4ad1c096f 100644 --- a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py +++ b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py @@ -11,6 +11,7 @@ RocCurveDisplay, ) from skore.utils._accessor import _check_supported_ml_task +from skore.utils._index import flatten_multiindex from skore.utils._progress_bar import progress_decorator ############################################################################### @@ -48,9 +49,10 @@ def report_metrics( data_source="test", scoring=None, scoring_names=None, - pos_label=None, scoring_kwargs=None, + pos_label=None, aggregate=None, + flat_index=False, ): """Report a set of metrics for our estimator. @@ -75,15 +77,18 @@ def report_metrics( Used to overwrite the default scoring names in the report. It should be of the same length as the `scoring` parameter. - pos_label : int, float, bool or str, default=None - The positive class. - scoring_kwargs : dict, default=None The keyword arguments to pass to the scoring functions. + pos_label : int, float, bool or str, default=None + The positive class. + aggregate : {"mean", "std"} or list of such str, default=None Function to aggregate the scores across the cross-validation splits. + flat_index : bool, default=False + Whether to flatten the multiindex columns. + Returns ------- pd.DataFrame @@ -104,7 +109,7 @@ def report_metrics( LogisticRegression mean 0.94... 0.96... std 0.02... 0.02... """ - return self._compute_metric_scores( + results = self._compute_metric_scores( report_metric_name="report_metrics", data_source=data_source, aggregate=aggregate, @@ -113,6 +118,12 @@ def report_metrics( scoring_kwargs=scoring_kwargs, scoring_names=scoring_names, ) + if flat_index: + if isinstance(results.columns, pd.MultiIndex): + results.columns = flatten_multiindex(results.columns) + if isinstance(results.index, pd.MultiIndex): + results.index = flatten_multiindex(results.index) + return results @progress_decorator(description="Compute metric for each split") def _compute_metric_scores( diff --git a/skore/src/skore/sklearn/_estimator/metrics_accessor.py b/skore/src/skore/sklearn/_estimator/metrics_accessor.py index b2e5f1f6e..bd2049add 100644 --- a/skore/src/skore/sklearn/_estimator/metrics_accessor.py +++ b/skore/src/skore/sklearn/_estimator/metrics_accessor.py @@ -16,6 +16,7 @@ RocCurveDisplay, ) from skore.utils._accessor import _check_supported_ml_task +from skore.utils._index import flatten_multiindex ############################################################################### # Metrics accessor @@ -53,8 +54,9 @@ def report_metrics( y=None, scoring=None, scoring_names=None, - pos_label=None, scoring_kwargs=None, + pos_label=None, + flat_index=False, ): """Report a set of metrics for our estimator. 
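For context, the flattening that `flat_index=True` enables above joins the non-empty string levels of a pandas `MultiIndex` with underscores, exactly as the `flatten_multiindex` helper added later in this patch does. A minimal, self-contained sketch of that behaviour using plain pandas (the metric/label values are illustrative only):

```python
import pandas as pd

# A MultiIndex similar to the (metric, label) index returned by report_metrics().
index = pd.MultiIndex.from_tuples(
    [("Precision", "0"), ("Precision", "1"), ("ROC AUC", "")],
    names=["Metric", "Label / Average"],
)

# Join the non-empty levels with underscores, mirroring flatten_multiindex().
flat = pd.Index(["_".join(filter(bool, map(str, values))) for values in index])
print(list(flat))  # ['Precision_0', 'Precision_1', 'ROC AUC']
```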
@@ -88,11 +90,14 @@ def report_metrics( Used to overwrite the default scoring names in the report. It should be of the same length as the `scoring` parameter. + scoring_kwargs : dict, default=None + The keyword arguments to pass to the scoring functions. + pos_label : int, float, bool or str, default=None The positive class. - scoring_kwargs : dict, default=None - The keyword arguments to pass to the scoring functions. + flat_index : bool, default=False + Whether to flatten the multiindex columns. Returns ------- @@ -116,7 +121,7 @@ def report_metrics( ... X_test=X_test, ... y_test=y_test, ... ) - >>> report.metrics.report_metrics(pos_label=1) + >>> report.metrics.report_metrics(pos_label=1, flatten_multiindex=False) Metric Precision (↗︎) Recall (↗︎) ROC AUC (↗︎) Brier score (↘︎) LogisticRegression 0.98... 0.93... 0.99... 0.03... """ @@ -265,7 +270,13 @@ def report_metrics( names=name_index, ) - return pd.concat(scores, axis=1) + results = pd.concat(scores, axis=1) + if flat_index: + if isinstance(results.columns, pd.MultiIndex): + results.columns = flatten_multiindex(results.columns) + if isinstance(results.index, pd.MultiIndex): + results.index = flatten_multiindex(results.index) + return results def _compute_metric_scores( self, diff --git a/skore/src/skore/utils/_index.py b/skore/src/skore/utils/_index.py new file mode 100644 index 000000000..c91b7d3f8 --- /dev/null +++ b/skore/src/skore/utils/_index.py @@ -0,0 +1,66 @@ +import pandas as pd + + +def flatten_multiindex(index: pd.MultiIndex) -> pd.Index: + """Flatten a pandas MultiIndex into a single-level Index. + + Flatten a pandas MultiIndex into a single-level Index by joining the levels + with underscores. Empty strings are skipped when joining. + + Parameters + ---------- + index : pandas.MultiIndex + The `MultiIndex` to flatten. + + Returns + ------- + pandas.Index + A flattened `Index` with non-empty levels joined by underscores. + + Examples + -------- + >>> import pandas as pd + >>> mi = pd.MultiIndex.from_tuples( + ... [('a', ''), ('b', '2')], names=['letter', 'number'] + ... ) + >>> flatten_multiindex(mi) + Index(['a', 'b_2'], dtype='object') + """ + if not isinstance(index, pd.MultiIndex): + raise ValueError("`index` must be a MultiIndex.") + + return pd.Index(["_".join(filter(bool, map(str, values))) for values in index]) + + +def unflatten_index(index: pd.Index, names: list[str] | None = None) -> pd.MultiIndex: + """Create a MultiIndex from a flat Index with underscore-separated values. + + Convert a flat `Index` with underscore-separated values into a `MultiIndex`. + + Parameters + ---------- + index : pandas.Index + The flat Index with values separated by underscores. + names : list of str, optional + Names for the levels in the resulting MultiIndex. If None, levels will + be unnamed. + + Returns + ------- + pandas.MultiIndex + A MultiIndex with separate levels for each underscore-separated component. 
+ + Examples + -------- + >>> import pandas as pd + >>> flat_idx = pd.Index(['a_1', 'b_2']) + >>> unflatten_index(flat_idx, names=['letter', 'number']) + MultiIndex([('a', '1'), + ('b', '2')], + names=['letter', 'number']) + """ + if isinstance(index, pd.MultiIndex): + raise ValueError("`index` must be a flat Index.") + + tuples = [tuple(val.split("_")) for val in index] + return pd.MultiIndex.from_tuples(tuples, names=names) diff --git a/skore/tests/unit/utils/test_index.py b/skore/tests/unit/utils/test_index.py new file mode 100644 index 000000000..7559b1b23 --- /dev/null +++ b/skore/tests/unit/utils/test_index.py @@ -0,0 +1,108 @@ +import pandas as pd +import pytest +from skore.utils._index import flatten_multiindex, unflatten_index + + +@pytest.mark.parametrize( + "input_tuples, names, expected_values", + [ + pytest.param( + [("a", 1), ("b", 2)], ["letter", "number"], ["a_1", "b_2"], id="basic" + ), + pytest.param( + [("a", 1, "x"), ("b", 2, "y")], + ["letter", "number", "symbol"], + ["a_1_x", "b_2_y"], + id="multiple_levels", + ), + pytest.param( + [("a", None), (None, 2)], + ["letter", "number"], + ["a_nan", "nan_2.0"], + id="none_values", + ), + pytest.param( + [("a@b", "1#2"), ("c&d", "3$4")], + ["letter", "number"], + ["a@b_1#2", "c&d_3$4"], + id="special_chars", + ), + pytest.param([], ["letter", "number"], [], id="empty"), + ], +) +def test_flatten_multiindex(input_tuples, names, expected_values): + """Test flatten_multiindex with various input cases.""" + mi = pd.MultiIndex.from_tuples(input_tuples, names=names) + result = flatten_multiindex(mi) + expected = pd.Index(expected_values) + pd.testing.assert_index_equal(result, expected) + + +def test_flatten_multiindex_invalid_input(): + """Test that non-MultiIndex input raises ValueError.""" + simple_index = pd.Index(["a", "b"]) + with pytest.raises(ValueError, match="`index` must be a MultiIndex."): + flatten_multiindex(simple_index) + + +@pytest.mark.parametrize( + "input_values, names, expected_tuples", + [ + pytest.param( + ["a_1", "b_2"], ["letter", "number"], [("a", "1"), ("b", "2")], id="basic" + ), + pytest.param( + ["a_1_x", "b_2_y"], + ["letter", "number", "symbol"], + [("a", "1", "x"), ("b", "2", "y")], + id="multiple_components", + ), + pytest.param( + ["a_1", "b_2"], None, [("a", "1"), ("b", "2")], id="without_names" + ), + pytest.param( + ["a@b_1#2", "c&d_3$4"], + ["letter", "number"], + [("a@b", "1#2"), ("c&d", "3$4")], + id="special_chars", + ), + pytest.param([], ["letter", "number"], [], id="empty"), + ], +) +def test_unflatten_index(input_values, names, expected_tuples): + """Test unflatten_index with various input cases.""" + flat_idx = pd.Index(input_values) + result = unflatten_index(flat_idx, names=names) + expected = pd.MultiIndex.from_tuples(expected_tuples, names=names) + pd.testing.assert_index_equal(result, expected) + + +def test_unflatten_index_invalid_input(): + """Test that MultiIndex input raises ValueError.""" + mi = pd.MultiIndex.from_tuples([("a", "1"), ("b", "2")]) + with pytest.raises(ValueError, match="`index` must be a flat Index."): + unflatten_index(mi) + + +@pytest.mark.parametrize( + "input_values, names, expected_names", + [ + pytest.param( + ["a_1", "b_2"], + ["letter", "number"], + ["letter", "number"], + id="matching_names", + ), + pytest.param( + ["a_1_x", "b_2_y"], + ["level0", "level1", "level2"], + ["level0", "level1", "level2"], + id="three_component_names", + ), + ], +) +def test_unflatten_index_mismatched_names(input_values, names, expected_names): + """Test unflatten_index 
with mismatched number of names.""" + flat_idx = pd.Index(input_values) + result = unflatten_index(flat_idx, names=names) + assert result.names == expected_names From 6736c00d579766ad747903174eb0dfcb93b10ec7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 18 Feb 2025 23:26:27 +0100 Subject: [PATCH 2/9] TST add tests --- .../sklearn/_estimator/metrics_accessor.py | 2 +- .../unit/sklearn/test_cross_validation.py | 25 +++++++++++++++++++ skore/tests/unit/sklearn/test_estimator.py | 22 ++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/skore/src/skore/sklearn/_estimator/metrics_accessor.py b/skore/src/skore/sklearn/_estimator/metrics_accessor.py index 2f41980b8..d39f9157e 100644 --- a/skore/src/skore/sklearn/_estimator/metrics_accessor.py +++ b/skore/src/skore/sklearn/_estimator/metrics_accessor.py @@ -331,7 +331,7 @@ def report_metrics( names=name_index, ) - results = pd.concat(scores, axis=1) + results = pd.concat(scores, axis=0) if flat_index: if isinstance(results.columns, pd.MultiIndex): results.columns = flatten_multi_index(results.columns) diff --git a/skore/tests/unit/sklearn/test_cross_validation.py b/skore/tests/unit/sklearn/test_cross_validation.py index 381cc9cc1..532b2e4ac 100644 --- a/skore/tests/unit/sklearn/test_cross_validation.py +++ b/skore/tests/unit/sklearn/test_cross_validation.py @@ -219,6 +219,31 @@ def test_cross_validation_report_pickle(tmp_path, binary_classification_data): joblib.dump(report, tmp_path / "report.joblib") +def test_cross_validation_report_flat_index(binary_classification_data): + """Check that the index is flattened when `flat_index` is True. + + Since `pos_label` is None, then by default a MultiIndex would be returned. + Here, we force to have a single-index by passing `flat_index=True`. + """ + estimator, X, y = binary_classification_data + report = CrossValidationReport(estimator, X=X, y=y, cv_splitter=2) + result = report.metrics.report_metrics(flat_index=True) + assert result.shape == (6, 2) + assert isinstance(result.index, pd.Index) + assert result.index.tolist() == [ + "Precision (↗︎)_0", + "Precision (↗︎)_1", + "Recall (↗︎)_0", + "Recall (↗︎)_1", + "ROC AUC (↗︎)", + "Brier score (↘︎)", + ] + assert result.columns.tolist() == [ + "RandomForestClassifier_Split #0", + "RandomForestClassifier_Split #1", + ] + + ######################################################################################## # Check the plot methods ######################################################################################## diff --git a/skore/tests/unit/sklearn/test_estimator.py b/skore/tests/unit/sklearn/test_estimator.py index 24f0aae6a..4fc422b40 100644 --- a/skore/tests/unit/sklearn/test_estimator.py +++ b/skore/tests/unit/sklearn/test_estimator.py @@ -347,6 +347,28 @@ def test_estimator_report_pickle(tmp_path, binary_classification_data): joblib.dump(report, tmp_path / "report.joblib") +def test_estimator_report_flat_index(binary_classification_data): + """Check that the index is flattened when `flat_index` is True. + + Since `pos_label` is None, then by default a MultiIndex would be returned. + Here, we force to have a single-index by passing `flat_index=True`. 
+ """ + estimator, X_test, y_test = binary_classification_data + report = EstimatorReport(estimator, X_test=X_test, y_test=y_test) + result = report.metrics.report_metrics(flat_index=True) + assert result.shape == (6, 1) + assert isinstance(result.index, pd.Index) + assert result.index.tolist() == [ + "Precision (↗︎)_0", + "Precision (↗︎)_1", + "Recall (↗︎)_0", + "Recall (↗︎)_1", + "ROC AUC (↗︎)", + "Brier score (↘︎)", + ] + assert result.columns.tolist() == ["RandomForestClassifier"] + + ######################################################################################## # Check the plot methods ######################################################################################## From 30ed3353c6eea1b6fa8a6b008b7e842945d2cb6e Mon Sep 17 00:00:00 2001 From: "Thomas S." Date: Wed, 19 Feb 2025 16:17:34 +0100 Subject: [PATCH 3/9] feat: Add `ComparisonReport` to compare instances of `EstimatorReport` (#1286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - [x] Rename to `ComparisonReport` - [x] Rebase on top of #1239 and adapt - [x] Raise if `report.metrics.accuracy(data_source="train")` is called with at least one EstimatorReport that does not have training data - [x] Test - [x] Docstrings - [x] MetricsAccessor - [x] Move index column "#0" in front of each metric - [x] Pass report names in comparator - [ ] ~Update plots legend~ see #1309 - The actual `RocCurveDisplay` needs a full refactor to be splitted by use-case: estimator report, cross-validation report and finally comparison report. In each of these use-cases, there is two scenarios with binary classification and multi-class classification. Otherwise, it will be unmaintainable. - [ ] ~Investigate missing metrics in `report_metrics`~ **(deferred to future PR)** - The logic is split between `report_metrics` and `available_if`; it should be merged (ideally everything in `available_if`?) 
- [ ] ~Refactor to make `CrossValidationReport` depend on it~ **(deferred to future PR)** - [x] ~Change EstimatorReport `repr`?~ Issue https://github.com/probabl-ai/skore/issues/1293 Closes #1245 Co-authored-by: Auguste Co-authored-by: Sylvain Combettes <48064216+sylvaincom@users.noreply.github.com> --- README.md | 8 +- .../plot_skore_getting_started.py | 68 +- examples/use_cases/plot_employee_salaries.py | 5 + skore/src/skore/__init__.py | 2 + skore/src/skore/sklearn/__init__.py | 2 + skore/src/skore/sklearn/_base.py | 2 +- .../src/skore/sklearn/_comparison/__init__.py | 7 + .../sklearn/_comparison/metrics_accessor.py | 1080 +++++++++++++++++ skore/src/skore/sklearn/_comparison/report.py | 170 +++ .../_cross_validation/metrics_accessor.py | 19 +- .../skore/sklearn/_cross_validation/report.py | 3 +- .../sklearn/_estimator/metrics_accessor.py | 22 +- skore/tests/unit/sklearn/test_comparison.py | 536 ++++++++ skore/tests/unit/sklearn/test_estimator.py | 7 +- sphinx/api/skore.config_context.rst | 10 + sphinx/api/skore.get_config.rst | 10 + sphinx/api/skore.set_config.rst | 10 + sphinx/index.rst | 4 +- sphinx/reference/report/comparison_report.rst | 50 + sphinx/reference/report/index.rst | 12 + 20 files changed, 1986 insertions(+), 41 deletions(-) create mode 100644 skore/src/skore/sklearn/_comparison/__init__.py create mode 100644 skore/src/skore/sklearn/_comparison/metrics_accessor.py create mode 100644 skore/src/skore/sklearn/_comparison/report.py create mode 100644 skore/tests/unit/sklearn/test_comparison.py create mode 100644 sphinx/api/skore.config_context.rst create mode 100644 sphinx/api/skore.get_config.rst create mode 100644 sphinx/api/skore.set_config.rst create mode 100644 sphinx/reference/report/comparison_report.rst diff --git a/README.md b/README.md index dfa9fca31..6248eaa3d 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,8 @@ skore is a Python open-source library designed to help data scientists apply rec - `train_test_split` supercharged with methodological guidance: the API is the same as scikit-learn's, but skore displays warnings when applicable. For example, it warns you against shuffling time series data or when you have class imbalance. - **Evaluate**: automated insightful reports. - `EstimatorReport`: feed your scikit-learn compatible estimator and dataset, and it generates recommended metrics and plots to help you analyze your estimator. All these are computed and generated for you in 1 line of code. Under the hood, we use efficient caching to make the computations blazing fast. - - `CrossValidationReport`: Get a skore estimator report for each fold of your cross-validation. + - `CrossValidationReport`: get a skore estimator report for each fold of your cross-validation. + - `ComparisonReport`: benchmark your skore estimator reports. ## What's next? @@ -91,7 +92,7 @@ You can find information on the latest version [here](https://anaconda.org/conda ```python # Display the ROC curve that was generated for you: roc_plot = cv_report.metrics.roc() - roc_plot + roc_plot.plot() ``` 1. Store your results for safe-keeping. @@ -109,7 +110,8 @@ You can find information on the latest version [here](https://anaconda.org/conda ```python # Get your results - df_get = my_project.put("df_cv_report_metrics") + df_get = my_project.get("df_cv_report_metrics") + df_get ``` Learn more in our [documentation](https://skore.probabl.ai). 
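To make the new `ComparisonReport` bullet above concrete, here is a minimal sketch of the intended benchmarking workflow (the dataset and estimator choices are illustrative; the API follows the `ComparisonReport` introduced later in this patch):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from skore import ComparisonReport, EstimatorReport

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# One EstimatorReport per candidate model, all evaluated on the same test set.
reports = [
    EstimatorReport(
        estimator, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
    )
    for estimator in (LogisticRegression(), RandomForestClassifier(random_state=0))
]

# Benchmark the reports: one column of metrics per estimator.
comparison = ComparisonReport(reports)
comparison.metrics.report_metrics()
```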
diff --git a/examples/getting_started/plot_skore_getting_started.py b/examples/getting_started/plot_skore_getting_started.py index ce92bbfab..1be7e1c0b 100644 --- a/examples/getting_started/plot_skore_getting_started.py +++ b/examples/getting_started/plot_skore_getting_started.py @@ -17,6 +17,8 @@ # * :class:`skore.CrossValidationReport`: get an insightful report on your # cross-validation results # +# * :class:`skore.ComparisonReport`: benchmark your skore estimator reports +# # * :func:`skore.train_test_split`: get diagnostics when splitting your data # # #. Track your ML/DS results using skore's :class:`~skore.Project` @@ -50,10 +52,10 @@ X, y = make_classification(n_classes=2, n_samples=100_000, n_informative=4) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) -clf = LogisticRegression(random_state=0) +log_reg = LogisticRegression(random_state=0) -est_report = EstimatorReport( - clf, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test +log_reg_report = EstimatorReport( + log_reg, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test ) # %% @@ -61,14 +63,14 @@ # (skore detected that we are doing binary classification): # %% -est_report.help() +log_reg_report.help() # %% # We can get the report metrics that was computed for us: # %% -df_est_report_metrics = est_report.metrics.report_metrics() -df_est_report_metrics +df_log_reg_report_metrics = log_reg_report.metrics.report_metrics() +df_log_reg_report_metrics # %% # We can also plot the ROC curve that was generated for us: @@ -76,7 +78,7 @@ # %% import matplotlib.pyplot as plt -roc_plot = est_report.metrics.roc() +roc_plot = log_reg_report.metrics.roc() roc_plot.plot() plt.tight_layout() @@ -97,7 +99,7 @@ # %% from skore import CrossValidationReport -cv_report = CrossValidationReport(clf, X, y, cv_splitter=5) +cv_report = CrossValidationReport(log_reg, X, y, cv_splitter=5) # %% # We display the cross-validation report helper: @@ -125,9 +127,9 @@ # for example the first fold: # %% -est_report_fold = cv_report.estimator_reports_[0] -df_report_metrics_fold = est_report_fold.metrics.report_metrics() -df_report_metrics_fold +log_reg_report_fold = cv_report.estimator_reports_[0] +df_log_reg_report_fold_metrics = log_reg_report_fold.metrics.report_metrics() +df_log_reg_report_fold_metrics # %% # .. seealso:: @@ -135,6 +137,50 @@ # For more information about the motivation and usage of # :class:`skore.CrossValidationReport`, see :ref:`example_use_case_employee_salaries`. +# %% +# Comparing estimators reports +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# :class:`skore.ComparisonReport` enables users to compare several estimator reports +# (corresponding to several estimators) on a same test set, as in a benchmark of +# estimators. 
+# +# Apart from the previous ``log_reg_report``, let use define another estimator report: + +# %% +from sklearn.ensemble import RandomForestClassifier + +rf = RandomForestClassifier(max_depth=2, random_state=0) +rf_report = EstimatorReport( + rf, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test +) + +# %% +# Now, let us compare these two estimator reports, that were applied to the exact +# same test set: + +# %% +from skore import ComparisonReport + +comparator = ComparisonReport(reports=[log_reg_report, rf_report]) + +# %% +# As for the :class:`~skore.EstimatorReport` and the +# :class:`~skore.CrossValidationReport`, we have a helper: + +# %% +comparator.help() + +# %% +# Let us display the result of our benchmark: + +# %% +benchmark_metrics = comparator.metrics.report_metrics() +benchmark_metrics + +# %% +# We have the result of our benchmark. + # %% # Train-test split with skore # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/examples/use_cases/plot_employee_salaries.py b/examples/use_cases/plot_employee_salaries.py index 64489cb00..c133270ca 100644 --- a/examples/use_cases/plot_employee_salaries.py +++ b/examples/use_cases/plot_employee_salaries.py @@ -298,6 +298,11 @@ def periodic_spline_transformer(period, n_splines=None, degree=3): ) results +# %% +# .. note:: +# We could have also used the :class:`skore.ComparisonReport` to compare estimator +# reports. + # %% # # Finally, we can even get the individual :class:`~skore.EstimatorReport` for each fold diff --git a/skore/src/skore/__init__.py b/skore/src/skore/__init__.py index c4c7d1c04..b8d06f5cf 100644 --- a/skore/src/skore/__init__.py +++ b/skore/src/skore/__init__.py @@ -8,6 +8,7 @@ from skore._config import config_context, get_config, set_config from skore.project import Project, open from skore.sklearn import ( + ComparisonReport, CrossValidationReport, EstimatorReport, PrecisionRecallCurveDisplay, @@ -20,6 +21,7 @@ __all__ = [ "CrossValidationReport", + "ComparisonReport", "EstimatorReport", "PrecisionRecallCurveDisplay", "PredictionErrorDisplay", diff --git a/skore/src/skore/sklearn/__init__.py b/skore/src/skore/sklearn/__init__.py index 0b5858999..f1abb357c 100644 --- a/skore/src/skore/sklearn/__init__.py +++ b/skore/src/skore/sklearn/__init__.py @@ -1,5 +1,6 @@ """Enhance `sklearn` functions.""" +from skore.sklearn._comparison import ComparisonReport from skore.sklearn._cross_validation import CrossValidationReport from skore.sklearn._estimator import EstimatorReport from skore.sklearn._plot import ( @@ -13,6 +14,7 @@ "train_test_split", "CrossValidationReport", "EstimatorReport", + "ComparisonReport", "RocCurveDisplay", "PrecisionRecallCurveDisplay", "PredictionErrorDisplay", diff --git a/skore/src/skore/sklearn/_base.py b/skore/src/skore/sklearn/_base.py index d62963cb9..c23290b38 100644 --- a/skore/src/skore/sklearn/_base.py +++ b/skore/src/skore/sklearn/_base.py @@ -124,7 +124,7 @@ def _get_attributes_for_help(self): def _create_help_tree(self): """Create a rich Tree with the available tools and accessor methods.""" - tree = Tree("report") + tree = Tree(self.__class__.__name__) # Add accessor methods first for accessor_attr, config in self._ACCESSOR_CONFIG.items(): diff --git a/skore/src/skore/sklearn/_comparison/__init__.py b/skore/src/skore/sklearn/_comparison/__init__.py new file mode 100644 index 000000000..eb72e33f8 --- /dev/null +++ b/skore/src/skore/sklearn/_comparison/__init__.py @@ -0,0 +1,7 @@ +from skore.externals._pandas_accessors import _register_accessor +from 
skore.sklearn._comparison.metrics_accessor import _MetricsAccessor +from skore.sklearn._comparison.report import ComparisonReport + +_register_accessor("metrics", ComparisonReport)(_MetricsAccessor) + +__all__ = ["ComparisonReport"] diff --git a/skore/src/skore/sklearn/_comparison/metrics_accessor.py b/skore/src/skore/sklearn/_comparison/metrics_accessor.py new file mode 100644 index 000000000..7f07a3d0c --- /dev/null +++ b/skore/src/skore/sklearn/_comparison/metrics_accessor.py @@ -0,0 +1,1080 @@ +import joblib +import numpy as np +import pandas as pd +from sklearn.metrics import make_scorer +from sklearn.utils.metaestimators import available_if + +from skore.externals._pandas_accessors import DirNamesMixin +from skore.sklearn._base import _BaseAccessor +from skore.utils._accessor import _check_supported_ml_task +from skore.utils._progress_bar import progress_decorator + + +class _MetricsAccessor(_BaseAccessor, DirNamesMixin): + """Accessor for metrics-related operations. + + You can access this accessor using the `metrics` attribute. + """ + + _SCORE_OR_LOSS_INFO = { + "accuracy": {"name": "Accuracy", "icon": "(↗︎)"}, + "precision": {"name": "Precision", "icon": "(↗︎)"}, + "recall": {"name": "Recall", "icon": "(↗︎)"}, + "brier_score": {"name": "Brier score", "icon": "(↘︎)"}, + "roc_auc": {"name": "ROC AUC", "icon": "(↗︎)"}, + "log_loss": {"name": "Log loss", "icon": "(↘︎)"}, + "r2": {"name": "R²", "icon": "(↗︎)"}, + "rmse": {"name": "RMSE", "icon": "(↘︎)"}, + "custom_metric": {"name": "Custom metric", "icon": ""}, + "report_metrics": {"name": "Report metrics", "icon": ""}, + } + + def __init__(self, parent): + super().__init__(parent) + + self._parent_progress = None + + def report_metrics( + self, + *, + data_source="test", + X=None, + y=None, + scoring=None, + scoring_names=None, + pos_label=None, + scoring_kwargs=None, + ): + """Report a set of metrics for the estimators. + + Parameters + ---------- + data_source : {"test", "train", "X_y"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + scoring : list of str, callable, or scorer, default=None + The metrics to report. You can get the possible list of strings by calling + `report.metrics.help()`. When passing a callable, it should take as + arguments ``y_true``, ``y_pred`` as the two first arguments. Additional + arguments can be passed as keyword arguments and will be forwarded with + `scoring_kwargs`. If the callable API is too restrictive (e.g. need to pass + same parameter name with different values), you can use scikit-learn scorers + as provided by :func:`sklearn.metrics.make_scorer`. + + scoring_names : list of str, default=None + Used to overwrite the default scoring names in the report. It should be of + the same length as the ``scoring`` parameter. + + pos_label : int, float, bool or str, default=None + The positive class. + + scoring_kwargs : dict, default=None + The keyword arguments to pass to the scoring functions. + + Returns + ------- + pd.DataFrame + The statistics for the metrics. 
+ + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.report_metrics( + ... scoring=["precision", "recall"], + ... pos_label=1, + ... ) + Estimator LogisticRegression LogisticRegression + Metric + Precision (↗︎) 0.96... 0.96... + Recall (↗︎) 0.97... 0.97... + """ + return self._compute_metric_scores( + report_metric_name="report_metrics", + data_source=data_source, + X=X, + y=y, + scoring=scoring, + pos_label=pos_label, + scoring_kwargs=scoring_kwargs, + scoring_names=scoring_names, + ) + + @progress_decorator(description="Compute metric for each split") + def _compute_metric_scores( + self, + report_metric_name, + *, + data_source="test", + X=None, + y=None, + **metric_kwargs, + ): + cache_key = (self._parent._hash, report_metric_name, data_source) + + # we need to enforce the order of the parameter for a specific metric + # to make sure that we hit the cache in a consistent way + ordered_metric_kwargs = sorted(metric_kwargs.keys()) + + for key in ordered_metric_kwargs: + if isinstance(metric_kwargs[key], (np.ndarray, list, dict)): + cache_key += (joblib.hash(metric_kwargs[key]),) + else: + cache_key += (metric_kwargs[key],) + + progress = self._progress_info["current_progress"] + main_task = self._progress_info["current_task"] + + total_estimators = len(self._parent.estimator_reports_) + progress.update(main_task, total=total_estimators) + + if cache_key in self._parent._cache: + results = self._parent._cache[cache_key] + else: + parallel = joblib.Parallel( + n_jobs=self._parent.n_jobs, + return_as="generator", + require="sharedmem", + ) + generator = parallel( + joblib.delayed(getattr(report.metrics, report_metric_name))( + data_source=data_source, + X=X, + y=y, + **metric_kwargs, + ) + for report in self._parent.estimator_reports_ + ) + results = [] + for result in generator: + results.append(result) + progress.update(main_task, advance=1, refresh=True) + + results = pd.concat(results, axis=1) + results.columns = pd.Index(self._parent.report_names_, name="Estimator") + + self._parent._cache[cache_key] = results + return results + + @available_if( + _check_supported_ml_task( + supported_ml_tasks=["binary-classification", "multiclass-classification"] + ) + ) + def accuracy(self, *, data_source="test", X=None, y=None): + """Compute the accuracy score. + + Parameters + ---------- + data_source : {"test", "train", "X_y"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. 
+ + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + Returns + ------- + pd.DataFrame + The accuracy score. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.accuracy() + Estimator LogisticRegression LogisticRegression + Metric + Accuracy (↗︎) 0.96... 0.96... + """ + return self.report_metrics( + scoring=["accuracy"], + data_source=data_source, + X=X, + y=y, + ) + + @available_if( + _check_supported_ml_task( + supported_ml_tasks=["binary-classification", "multiclass-classification"] + ) + ) + def precision( + self, + *, + data_source="test", + X=None, + y=None, + average=None, + pos_label=None, + ): + """Compute the precision score. + + Parameters + ---------- + data_source : {"test", "train", "X_y"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + average : {"binary", "macro", "micro", "weighted", "samples"} or None, \ + default=None + Used with multiclass problems. + If `None`, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + - "binary": Only report results for the class specified by `pos_label`. + This is applicable only if targets (`y_{true,pred}`) are binary. + - "micro": Calculate metrics globally by counting the total true positives, + false negatives and false positives. + - "macro": Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + - "weighted": Calculate metrics for each label, and find their average + weighted by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an F-score + that is not between precision and recall. 
+ - "samples": Calculate metrics for each instance, and find their average + (only meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + .. note:: + If `pos_label` is specified and `average` is None, then we report + only the statistics of the positive class (i.e. equivalent to + `average="binary"`). + + pos_label : int, float, bool or str, default=None + The positive class. + + Returns + ------- + pd.DataFrame + The precision score. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.precision() + Estimator LogisticRegression LogisticRegression + Metric Label / Average + Precision (↗︎) 0 0.96... 0.96... + 1 0.96... 0.96... + """ + return self.report_metrics( + scoring=["precision"], + data_source=data_source, + X=X, + y=y, + pos_label=pos_label, + scoring_kwargs={"average": average}, + ) + + @available_if( + _check_supported_ml_task( + supported_ml_tasks=["binary-classification", "multiclass-classification"] + ) + ) + def recall( + self, + *, + data_source="test", + X=None, + y=None, + average=None, + pos_label=None, + ): + """Compute the recall score. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + average : {"binary","macro", "micro", "weighted", "samples"} or None, \ + default=None + Used with multiclass problems. + If `None`, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + - "binary": Only report results for the class specified by `pos_label`. + This is applicable only if targets (`y_{true,pred}`) are binary. + - "micro": Calculate metrics globally by counting the total true positives, + false negatives and false positives. + - "macro": Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + - "weighted": Calculate metrics for each label, and find their average + weighted by support (the number of true instances for each label). 
This + alters 'macro' to account for label imbalance; it can result in an F-score + that is not between precision and recall. Weighted recall is equal to + accuracy. + - "samples": Calculate metrics for each instance, and find their average + (only meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + .. note:: + If `pos_label` is specified and `average` is None, then we report + only the statistics of the positive class (i.e. equivalent to + `average="binary"`). + + pos_label : int, float, bool or str, default=None + The positive class. + + Returns + ------- + pd.DataFrame + The recall score. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.recall() + Estimator LogisticRegression LogisticRegression + Metric Label / Average + Recall (↗︎) 0 0.944... 0.944... + 1 0.977... 0.977... + """ + return self.report_metrics( + scoring=["recall"], + data_source=data_source, + X=X, + y=y, + pos_label=pos_label, + scoring_kwargs={"average": average}, + ) + + @available_if( + _check_supported_ml_task(supported_ml_tasks=["binary-classification"]) + ) + def brier_score( + self, + *, + data_source="test", + X=None, + y=None, + ): + """Compute the Brier score. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + Returns + ------- + pd.DataFrame + The Brier score. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... 
) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.brier_score() + Estimator LogisticRegression LogisticRegression + Metric + Brier score (↘︎) 0.025... 0.025... + """ + return self.report_metrics( + scoring=["brier_score"], + data_source=data_source, + X=X, + y=y, + ) + + @available_if( + _check_supported_ml_task( + supported_ml_tasks=["binary-classification", "multiclass-classification"] + ) + ) + def roc_auc( + self, + *, + data_source="test", + X=None, + y=None, + average=None, + multi_class="ovr", + ): + """Compute the ROC AUC score. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + average : {"auto", "macro", "micro", "weighted", "samples"}, \ + default=None + Average to compute the ROC AUC score in a multiclass setting. By default, + no average is computed. Otherwise, this determines the type of averaging + performed on the data. + + - "micro": Calculate metrics globally by considering each element of + the label indicator matrix as a label. + - "macro": Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + - "weighted": Calculate metrics for each label, and find their average, + weighted by support (the number of true instances for each label). + - "samples": Calculate metrics for each instance, and find their + average. + + .. note:: + Multiclass ROC AUC currently only handles the "macro" and + "weighted" averages. For multiclass targets, `average=None` is only + implemented for `multi_class="ovr"` and `average="micro"` is only + implemented for `multi_class="ovr"`. + + multi_class : {"raise", "ovr", "ovo"}, default="ovr" + The multi-class strategy to use. + + - "raise": Raise an error if the data is multiclass. + - "ovr": Stands for One-vs-rest. Computes the AUC of each class against the + rest. This treats the multiclass case in the same way as the multilabel + case. Sensitive to class imbalance even when `average == "macro"`, + because class imbalance affects the composition of each of the "rest" + groupings. + - "ovo": Stands for One-vs-one. Computes the average AUC of all possible + pairwise combinations of classes. Insensitive to class imbalance when + `average == "macro"`. + + Returns + ------- + pd.DataFrame + The ROC AUC score. 
+ + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.roc_auc() + Estimator LogisticRegression LogisticRegression + Metric + ROC AUC (↗︎) 0.99... 0.99... + """ + return self.report_metrics( + scoring=["roc_auc"], + data_source=data_source, + X=X, + y=y, + scoring_kwargs={"average": average, "multi_class": multi_class}, + ) + + @available_if( + _check_supported_ml_task( + supported_ml_tasks=["binary-classification", "multiclass-classification"] + ) + ) + def log_loss( + self, + *, + data_source="test", + X=None, + y=None, + ): + """Compute the log loss. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + Returns + ------- + pd.DataFrame + The log-loss. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression(max_iter=10000, random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = LogisticRegression(max_iter=10000, random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.log_loss() + Estimator LogisticRegression LogisticRegression + Metric + Log loss (↘︎) 0.082... 0.082... 
+ """ + return self.report_metrics( + scoring=["log_loss"], + data_source=data_source, + X=X, + y=y, + ) + + @available_if(_check_supported_ml_task(supported_ml_tasks=["regression"])) + def r2( + self, + *, + data_source="test", + X=None, + y=None, + multioutput="raw_values", + ): + """Compute the R² score. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + multioutput : {"raw_values", "uniform_average"} or array-like of shape \ + (n_outputs,), default="raw_values" + Defines aggregating of multiple output values. Array-like value defines + weights used to average errors. The other possible values are: + + - "raw_values": Returns a full set of errors in case of multioutput input. + - "uniform_average": Errors of all outputs are averaged with uniform weight. + + By default, no averaging is done. + + Returns + ------- + pd.DataFrame + The R² score. + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_diabetes(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = Ridge(random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = Ridge(random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.r2() + Estimator Ridge Ridge + Metric + R² (↗︎) 0.43... 0.43... + """ + return self.report_metrics( + scoring=["r2"], + data_source=data_source, + X=X, + y=y, + scoring_kwargs={"multioutput": multioutput}, + ) + + @available_if(_check_supported_ml_task(supported_ml_tasks=["regression"])) + def rmse( + self, + *, + data_source="test", + X=None, + y=None, + multioutput="raw_values", + ): + """Compute the root mean squared error. + + Parameters + ---------- + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. 
+ + multioutput : {"raw_values", "uniform_average"} or array-like of shape \ + (n_outputs,), default="raw_values" + Defines aggregating of multiple output values. Array-like value defines + weights used to average errors. The other possible values are: + + - "raw_values": Returns a full set of errors in case of multioutput input. + - "uniform_average": Errors of all outputs are averaged with uniform weight. + + By default, no averaging is done. + + Returns + ------- + pd.DataFrame + The root mean squared error. + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_diabetes(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = Ridge(random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = Ridge(random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.rmse() + Estimator Ridge Ridge + Metric + RMSE (↘︎) 55.726... 55.726... + """ + return self.report_metrics( + scoring=["rmse"], + data_source=data_source, + X=X, + y=y, + scoring_kwargs={"multioutput": multioutput}, + ) + + def custom_metric( + self, + metric_function, + response_method, + *, + metric_name=None, + data_source="test", + X=None, + y=None, + **kwargs, + ): + """Compute a custom metric. + + It brings some flexibility to compute any desired metric. However, we need to + follow some rules: + + - `metric_function` should take `y_true` and `y_pred` as the first two + positional arguments. + - `response_method` corresponds to the estimator's method to be invoked to get + the predictions. It can be a string or a list of strings to defined in which + order the methods should be invoked. + + Parameters + ---------- + metric_function : callable + The metric function to be computed. The expected signature is + `metric_function(y_true, y_pred, **kwargs)`. + + response_method : str or list of str + The estimator's method to be invoked to get the predictions. The possible + values are: `predict`, `predict_proba`, `predict_log_proba`, and + `decision_function`. + + metric_name : str, default=None + The name of the metric. If not provided, it will be inferred from the + metric function. + + data_source : {"test", "train"}, default="test" + The data source to use. + + - "test" : use the test set provided when creating the report. + - "train" : use the train set provided when creating the report. + - "X_y" : use the provided `X` and `y` to compute the metric. + + X : array-like of shape (n_samples, n_features), default=None + New data on which to compute the metric. By default, we use the validation + set provided when creating the report. + + y : array-like of shape (n_samples,), default=None + New target on which to compute the metric. By default, we use the target + provided when creating the report. + + **kwargs : dict + Any additional keyword arguments to be passed to the metric function. + + Returns + ------- + pd.DataFrame + The custom metric. 
+ + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.metrics import mean_absolute_error + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = load_diabetes(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = Ridge(random_state=42) + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> estimator_2 = Ridge(random_state=43) + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test, + ... ) + >>> comparison_report = ComparisonReport( + ... [estimator_report_1, estimator_report_2] + ... ) + >>> comparison_report.metrics.custom_metric( + ... metric_function=mean_absolute_error, + ... response_method="predict", + ... metric_name="MAE (↗︎)", + ... ) + Estimator Ridge Ridge + Metric + MAE (↗︎) 45.91... 45.91... + """ + # create a scorer with `greater_is_better=True` to not alter the output of + # `metric_function` + scorer = make_scorer( + metric_function, + greater_is_better=True, + response_method=response_method, + **kwargs, + ) + return self.report_metrics( + scoring=[scorer], + data_source=data_source, + X=X, + y=y, + scoring_names=[metric_name], + ) + + #################################################################################### + # Methods related to the help tree + #################################################################################### + + def _sort_methods_for_help(self, methods): + """Override sort method for metrics-specific ordering. + + In short, we display the `report_metrics` first and then the `custom_metric`. 
+ """ + + def _sort_key(method): + name = method[0] + if name == "custom_metric": + priority = 1 + elif name == "report_metrics": + priority = 2 + else: + priority = 0 + return priority, name + + return sorted(methods, key=_sort_key) + + def _format_method_name(self, name): + """Override format method for metrics-specific naming.""" + method_name = f"{name}(...)" + method_name = method_name.ljust(22) + if name in self._SCORE_OR_LOSS_INFO and self._SCORE_OR_LOSS_INFO[name][ + "icon" + ] in ("(↗︎)", "(↘︎)"): + if self._SCORE_OR_LOSS_INFO[name]["icon"] == "(↗︎)": + method_name += f"[cyan]{self._SCORE_OR_LOSS_INFO[name]['icon']}[/cyan]" + return method_name.ljust(43) + else: # (↘︎) + method_name += ( + f"[orange1]{self._SCORE_OR_LOSS_INFO[name]['icon']}[/orange1]" + ) + return method_name.ljust(49) + else: + return method_name.ljust(29) + + def _get_methods_for_help(self): + """Override to exclude the plot accessor from methods list.""" + methods = super()._get_methods_for_help() + return [(name, method) for name, method in methods if name != "plot"] + + def _get_help_panel_title(self): + return "[bold cyan]Available metrics methods[/bold cyan]" + + def _get_help_legend(self): + return ( + "[cyan](↗︎)[/cyan] higher is better [orange1](↘︎)[/orange1] lower is better" + ) + + def _get_help_tree_title(self): + return "[bold cyan]report.metrics[/bold cyan]" + + def __repr__(self): + """Return a string representation using rich.""" + return self._rich_repr( + class_name="skore.ComparisonReport.metrics", + help_method_name="report.metrics.help()", + ) diff --git a/skore/src/skore/sklearn/_comparison/report.py b/skore/src/skore/sklearn/_comparison/report.py new file mode 100644 index 000000000..ac4c3a30f --- /dev/null +++ b/skore/src/skore/sklearn/_comparison/report.py @@ -0,0 +1,170 @@ +from __future__ import annotations + +import time +from collections.abc import Iterable +from typing import Optional, Union + +import joblib +import numpy as np + +from skore.externals._pandas_accessors import DirNamesMixin +from skore.sklearn._base import _BaseReport +from skore.sklearn._estimator.report import EstimatorReport + + +class ComparisonReport(_BaseReport, DirNamesMixin): + """Report for comparison of instances of :class:`skore.EstimatorReport`. + + Caution: reports passed to `ComparisonReport` are not copied. If you pass + a report to `ComparisonReport`, and then modify the report outside later, it will + affect the report stored inside the `ComparisonReport` as well, which can lead to + inconsistent results. For this reason, modifying reports after creation is strongly + discouraged. + + Parameters + ---------- + reports : list of :class:`~skore.EstimatorReport` instances or dict + Estimator reports to compare. + + * If `reports` is a list, the class name of each estimator is used. + * If `reports` is a dict, it is expected to have estimator names as keys + and :class:`~skore.EstimatorReport` instances as values. + If the keys are not strings, they will be converted to strings. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimators and computing + the scores are parallelized. + When accessing some methods of the `ComparisonReport`, the `n_jobs` + parameter is used to parallelize the computation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. + + Attributes + ---------- + estimator_reports_ : list of `~skore.EstimatorReport` + The compared estimator reports. 
+ + report_names_ : list of str + The names of the compared estimator reports. + + See Also + -------- + skore.EstimatorReport + Report for a fitted estimator. + + skore.CrossValidationReport + Report for the cross-validation of an estimator. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> from skore import ComparisonReport, EstimatorReport + >>> X, y = make_classification(random_state=42) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression() + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test + ... ) + >>> estimator_2 = LogisticRegression(C=2) # Different regularization + >>> estimator_report_2 = EstimatorReport( + ... estimator_2, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test + ... ) + >>> report = ComparisonReport([estimator_report_1, estimator_report_2]) + ... + >>> report = ComparisonReport( + ... {"model1": estimator_report_1, "model2": estimator_report_2} + ... ) + ... + """ + + _ACCESSOR_CONFIG = { + "metrics": {"name": "metrics"}, + } + + def __init__( + self, + reports: Union[list[EstimatorReport], dict[str, EstimatorReport]], + *, + n_jobs: Optional[int] = None, + ): + """ + ComparisonReport instance initializer. + + Notes + ----- + We check that the estimator reports can be compared: + - all reports are estimator reports, + - all estimators are in the same ML use case, + - all estimators have non-empty X_test and y_test, + - all estimators have the same X_test and y_test. + """ + if not isinstance(reports, Iterable): + raise TypeError(f"Expected reports to be an iterable; got {type(reports)}") + + if len(reports) < 2: + raise ValueError("At least 2 instances of EstimatorReport are needed") + + report_names = ( + list(map(str, reports.keys())) if isinstance(reports, dict) else None + ) + reports = list(reports.values()) if isinstance(reports, dict) else reports + + if not all(isinstance(report, EstimatorReport) for report in reports): + raise TypeError("Expected instances of EstimatorReport") + + test_dataset_hashes = { + joblib.hash((report.X_test, report.y_test)) + for report in reports + if not ((report.X_test is None) and (report.y_test is None)) + } + if len(test_dataset_hashes) > 1: + raise ValueError("Expected all estimators to have the same testing data.") + + ml_tasks = {report: report._ml_task for report in reports} + if len(set(ml_tasks.values())) > 1: + raise ValueError( + f"Expected all estimators to have the same ML usecase; " + f"got {ml_tasks}" + ) + + if report_names is None: + self.report_names_ = [report.estimator_name_ for report in reports] + else: + self.report_names_ = report_names + + self.estimator_reports_ = reports + + # NEEDED FOR METRICS ACCESSOR + self.n_jobs = n_jobs + self._rng = np.random.default_rng(time.time_ns()) + self._hash = self._rng.integers( + low=np.iinfo(np.int64).min, high=np.iinfo(np.int64).max + ) + self._cache = {} + self._ml_task = self.estimator_reports_[0]._ml_task + + #################################################################################### + # Methods related to the help and repr + #################################################################################### + + def _get_help_panel_title(self): + return "[bold cyan]Tools to compare estimators[/bold cyan]" 
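The identical-test-set check in `__init__` above compares datasets by hashing the `(X_test, y_test)` pair with `joblib.hash` rather than comparing arrays element by element. A small, self-contained sketch of the same idea (illustrative, not part of the patch):

    # Sketch: detect whether several (X, y) test pairs carry the same data by hashing.
    import joblib
    import numpy as np

    X_a, y_a = np.arange(12.0).reshape(6, 2), np.arange(6)
    X_b, y_b = X_a.copy(), y_a.copy()   # equal content, distinct objects
    X_c, y_c = X_a[1:], y_a[1:]         # different content

    same = {joblib.hash((X, y)) for X, y in [(X_a, y_a), (X_b, y_b)]}
    other = {joblib.hash((X, y)) for X, y in [(X_a, y_a), (X_c, y_c)]}
    assert len(same) == 1    # a single hash: the reports share the same test data
    assert len(other) == 2   # several hashes: the test data differ

Because the hash is content-based, copies of the same arrays compare equal while any modification of the data changes the key.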
+ + def _get_help_legend(self): + return ( + "[cyan](↗︎)[/cyan] higher is better [orange1](↘︎)[/orange1] lower is better" + ) + + def __repr__(self): + """Return a string representation.""" + return f"{self.__class__.__name__}(...)" diff --git a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py index 78807b280..87582939b 100644 --- a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py +++ b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py @@ -137,16 +137,15 @@ def _compute_metric_scores( cache_key = (self._parent._hash, report_metric_name, data_source) cache_key += (aggregate,) if aggregate is None else tuple(aggregate) - if metric_kwargs != {}: - # we need to enforce the order of the parameter for a specific metric - # to make sure that we hit the cache in a consistent way - ordered_metric_kwargs = sorted(metric_kwargs.keys()) - - for key in ordered_metric_kwargs: - if isinstance(metric_kwargs[key], (np.ndarray, list, dict)): - cache_key += (joblib.hash(metric_kwargs[key]),) - else: - cache_key += (metric_kwargs[key],) + # we need to enforce the order of the parameter for a specific metric + # to make sure that we hit the cache in a consistent way + ordered_metric_kwargs = sorted(metric_kwargs.keys()) + + for key in ordered_metric_kwargs: + if isinstance(metric_kwargs[key], (np.ndarray, list, dict)): + cache_key += (joblib.hash(metric_kwargs[key]),) + else: + cache_key += (metric_kwargs[key],) progress = self._progress_info["current_progress"] main_task = self._progress_info["current_task"] diff --git a/skore/src/skore/sklearn/_cross_validation/report.py b/skore/src/skore/sklearn/_cross_validation/report.py index a767e2104..978a4a96b 100644 --- a/skore/src/skore/sklearn/_cross_validation/report.py +++ b/skore/src/skore/sklearn/_cross_validation/report.py @@ -84,13 +84,12 @@ class CrossValidationReport(_BaseReport, DirNamesMixin): See Also -------- - skore.sklearn.estimator.report.EstimatorReport : + skore.EstimatorReport Report for a fitted estimator. 
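The cache-key hunks in this patch (the cross-validation one above and the estimator one just below) always fold `metric_kwargs` into the key in a deterministic order, hashing values that are not hashable themselves. A rough, self-contained sketch of that pattern, using a hypothetical helper name:

    # Sketch: fold keyword arguments into a deterministic, hashable cache key.
    import joblib
    import numpy as np

    def key_with_kwargs(base_key, metric_kwargs):
        key = tuple(base_key)
        # Sorting the names makes {"a": 1, "b": 2} and {"b": 2, "a": 1} equivalent.
        for name in sorted(metric_kwargs):
            value = metric_kwargs[name]
            if isinstance(value, (np.ndarray, list, dict)):
                key += (joblib.hash(value),)  # unhashable values become a stable digest
            else:
                key += (value,)
        return key

    k1 = key_with_kwargs(("rmse", "test"), {"multioutput": "raw_values"})
    k2 = key_with_kwargs(("rmse", "test"), {"multioutput": "raw_values"})
    assert k1 == k2  # identical kwargs always hit the same cache entry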
Examples -------- >>> from sklearn.datasets import make_classification - >>> from sklearn.model_selection import train_test_split >>> from sklearn.linear_model import LogisticRegression >>> X, y = make_classification(random_state=42) >>> estimator = LogisticRegression() diff --git a/skore/src/skore/sklearn/_estimator/metrics_accessor.py b/skore/src/skore/sklearn/_estimator/metrics_accessor.py index d39f9157e..546d67d6f 100644 --- a/skore/src/skore/sklearn/_estimator/metrics_accessor.py +++ b/skore/src/skore/sklearn/_estimator/metrics_accessor.py @@ -363,18 +363,18 @@ def _compute_metric_scores( metric_params = inspect.signature(metric_fn).parameters if "pos_label" in metric_params: cache_key += (pos_label,) - if metric_kwargs != {}: - # we need to enforce the order of the parameter for a specific metric - # to make sure that we hit the cache in a consistent way - ordered_metric_kwargs = sorted(metric_kwargs.keys()) - cache_key += tuple( - ( - joblib.hash(metric_kwargs[key]) - if isinstance(metric_kwargs[key], np.ndarray) - else metric_kwargs[key] - ) - for key in ordered_metric_kwargs + + # we need to enforce the order of the parameter for a specific metric + # to make sure that we hit the cache in a consistent way + ordered_metric_kwargs = sorted(metric_kwargs.keys()) + cache_key += tuple( + ( + joblib.hash(metric_kwargs[key]) + if isinstance(metric_kwargs[key], np.ndarray) + else metric_kwargs[key] ) + for key in ordered_metric_kwargs + ) if cache_key in self._parent._cache: score = self._parent._cache[cache_key] diff --git a/skore/tests/unit/sklearn/test_comparison.py b/skore/tests/unit/sklearn/test_comparison.py new file mode 100644 index 000000000..622d06fd4 --- /dev/null +++ b/skore/tests/unit/sklearn/test_comparison.py @@ -0,0 +1,536 @@ +import re +from io import BytesIO + +import joblib +import pandas as pd +import pytest +from sklearn.datasets import make_classification +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.model_selection import train_test_split +from skore import ComparisonReport, EstimatorReport + + +@pytest.fixture +def binary_classification_model(): + """Create a binary classification dataset and return fitted estimator and data.""" + X, y = make_classification(random_state=42) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + + return LogisticRegression(random_state=42), X_train, X_test, y_train, y_test + + +@pytest.fixture +def regression_model(): + """Create a binary classification dataset and return fitted estimator and data.""" + X, y = make_classification(random_state=42) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + + return LinearRegression(), X_train, X_test, y_train, y_test + + +def test_comparison_report_init_wrong_parameters(binary_classification_model): + """If the input is not valid, raise.""" + + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + with pytest.raises(TypeError, match="Expected reports to be an iterable"): + ComparisonReport(estimator_report) + + with pytest.raises( + ValueError, match="At least 2 instances of EstimatorReport are needed" + ): + ComparisonReport([estimator_report]) + + with pytest.raises(TypeError, match="Expected instances of EstimatorReport"): + ComparisonReport([None, estimator_report]) + + +def test_comparison_report_without_testing_data(binary_classification_model): + """If there is no test data (`None`) 
for some estimator report, + initialization works, but computing metrics can fail. + """ + estimator, _, _, _, _ = binary_classification_model + estimator_report = EstimatorReport(estimator, fit=False) + + report = ComparisonReport([estimator_report, estimator_report]) + + with pytest.raises(ValueError, match="No test data"): + report.metrics.report_metrics(data_source="test") + + +def test_comparison_report_different_test_data(binary_classification_model): + """Raise an error if the passed estimators do not have the same testing data.""" + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator.fit(X_train, y_train) + + # The estimators that have testing data, need to have the same testing data + # The estimators that do not have testing data do not count + with pytest.raises( + ValueError, match="Expected all estimators to have the same testing data" + ): + ComparisonReport( + [ + EstimatorReport(estimator, X_test=X_test, y_test=y_test), + EstimatorReport(estimator, X_test=X_test[1:], y_test=y_test[1:]), + ] + ) + + # The estimators without testing data (i.e. no X_test and no y_test) do not count + ComparisonReport( + [ + EstimatorReport(estimator, X_test=X_test, y_test=y_test), + EstimatorReport(estimator, X_test=X_test, y_test=y_test), + EstimatorReport(estimator), + ] + ) + + # If there is an X_test but no y_test, it counts + with pytest.raises( + ValueError, match="Expected all estimators to have the same testing data" + ): + ComparisonReport( + [ + EstimatorReport(estimator, fit=False, X_test=X_test, y_test=y_test), + EstimatorReport(estimator, fit=False, X_test=X_test), + ] + ) + + +def test_comparison_report_init_different_ml_usecases( + binary_classification_model, regression_model +): + """Raise an error if the passed estimators do not have the same ML usecase.""" + linear_regression_estimator, _, X_test, _, y_test = regression_model + linear_regression_report = EstimatorReport( + linear_regression_estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + logistic_regression_estimator, _, X_test, _, y_test = binary_classification_model + logistic_regression_report = EstimatorReport( + logistic_regression_estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + with pytest.raises( + ValueError, match="Expected all estimators to have the same ML usecase" + ): + ComparisonReport([linear_regression_report, logistic_regression_report]) + + +def test_comparison_report_init_with_report_names(binary_classification_model): + """If the estimators are passed as a dict, + then the estimator names are the dict keys.""" + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport({"r1": estimator_report, "r2": estimator_report}) + + pd.testing.assert_index_equal( + comp.metrics.accuracy().columns, + pd.Index(["r1", "r2"], name="Estimator"), + ) + + +def test_comparison_report_init_without_report_names(binary_classification_model): + """If the estimators are passed as a list, + then the estimator names are the estimator class names.""" + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport([estimator_report, estimator_report]) + + pd.testing.assert_index_equal( + comp.metrics.accuracy().columns, + 
pd.Index(["LogisticRegression", "LogisticRegression"], name="Estimator"), + ) + + +def test_comparison_report_non_string_report_names(binary_classification_model): + """If the estimators are passed as a dict with non-string keys, + then the estimator names are the dict keys converted to strings.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + report = ComparisonReport({0: estimator_report, "1": estimator_report}) + assert report.report_names_ == ["0", "1"] + + +def test_comparison_report_help(capsys, binary_classification_model): + """Check the help menu works.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + ComparisonReport([estimator_report, estimator_report]).help() + + captured = capsys.readouterr() + assert "Tools to compare estimators" in captured.out + + # Check that we have a line with accuracy and the arrow associated with it + assert re.search( + r"\.accuracy\([^)]*\).*\(↗︎\).*-.*accuracy", captured.out, re.MULTILINE + ) + + +def test_comparison_report_repr(binary_classification_model): + """Check the `__repr__` works.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + repr_str = repr(ComparisonReport([estimator_report, estimator_report])) + + assert "ComparisonReport" in repr_str + + +def test_comparison_report_pickle(tmp_path, binary_classification_model): + """Check that we can pickle a comparison report.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + + with BytesIO() as stream: + joblib.dump(ComparisonReport([estimator_report, estimator_report]), stream) + + +def test_comparison_report_metrics_help(capsys, binary_classification_model): + """Check that the help method writes to the console.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + report = ComparisonReport([estimator_report, estimator_report]) + + report.metrics.help() + captured = capsys.readouterr() + assert "Available metrics methods" in captured.out + + +def test_comparison_report_metrics_repr(binary_classification_model): + """Check the repr method.""" + estimator, _, X_test, _, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + fit=False, + X_test=X_test, + y_test=y_test, + ) + report = ComparisonReport([estimator_report, estimator_report]) + + repr_str = repr(report.metrics) + assert "skore.ComparisonReport.metrics" in repr_str + assert "report.metrics.help()" in repr_str + + +@pytest.mark.parametrize("data_source", ["test", "X_y"]) +@pytest.mark.parametrize( + "metric_name, expected", + [ + ( + "accuracy", + pd.DataFrame( + [[1.0, 1.0]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.Index(["Accuracy (↗︎)"], dtype="object", name="Metric"), + ), + ), + ( + "precision", + pd.DataFrame( + [[1.0, 1.0], [1.0, 1.0]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.MultiIndex.from_tuples( + [("Precision (↗︎)", 0), ("Precision (↗︎)", 1)], + names=["Metric", "Label 
/ Average"], + ), + ), + ), + ( + "recall", + pd.DataFrame( + [[1.0, 1.0], [1.0, 1.0]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.MultiIndex.from_tuples( + [("Recall (↗︎)", 0), ("Recall (↗︎)", 1)], + names=["Metric", "Label / Average"], + ), + ), + ), + ( + "brier_score", + pd.DataFrame( + [[0.026684, 0.026684]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.Index(["Brier score (↘︎)"], dtype="object", name="Metric"), + ), + ), + ( + "roc_auc", + pd.DataFrame( + [[1.0, 1.0]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.Index(["ROC AUC (↗︎)"], dtype="object", name="Metric"), + ), + ), + ( + "log_loss", + pd.DataFrame( + [[0.113233, 0.113233]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.Index(["Log loss (↘︎)"], dtype="object", name="Metric"), + ), + ), + ], +) +def test_comparison_report_metrics_binary_classification( + metric_name, expected, data_source, binary_classification_model +): + """Check the metrics work.""" + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport([estimator_report, estimator_report]) + + # ensure metric is valid + if data_source == "X_y": + result = getattr(comp.metrics, metric_name)( + data_source=data_source, X=X_test, y=y_test + ) + else: + result = getattr(comp.metrics, metric_name)(data_source=data_source) + pd.testing.assert_frame_equal(result, expected) + + # ensure metric is valid even from the cache + if data_source == "X_y": + result = getattr(comp.metrics, metric_name)( + data_source=data_source, X=X_test, y=y_test + ) + else: + result = getattr(comp.metrics, metric_name)(data_source=data_source) + pd.testing.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data_source", ["test", "X_y"]) +@pytest.mark.parametrize( + "metric_name, expected", + [ + ( + "rmse", + pd.DataFrame( + [[0.27699, 0.27699]], + columns=pd.Index( + ["LinearRegression", "LinearRegression"], + name="Estimator", + ), + index=pd.Index(["RMSE (↘︎)"], dtype="object", name="Metric"), + ), + ), + ( + "r2", + pd.DataFrame( + [[0.680319, 0.680319]], + columns=pd.Index( + ["LinearRegression", "LinearRegression"], + name="Estimator", + ), + index=pd.Index(["R² (↗︎)"], dtype="object", name="Metric"), + ), + ), + ], +) +def test_comparison_report_metrics_linear_regression( + metric_name, expected, data_source, regression_model +): + """Check the metrics work.""" + estimator, X_train, X_test, y_train, y_test = regression_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport([estimator_report, estimator_report]) + + # ensure metric is valid + if data_source == "X_y": + result = getattr(comp.metrics, metric_name)( + data_source=data_source, X=X_test, y=y_test + ) + else: + result = getattr(comp.metrics, metric_name)() + pd.testing.assert_frame_equal(result, expected) + + # ensure metric is valid even from the cache + if data_source == "X_y": + result = getattr(comp.metrics, metric_name)( + data_source=data_source, X=X_test, y=y_test + ) + else: + result = getattr(comp.metrics, metric_name)() + pd.testing.assert_frame_equal(result, expected) + + +def 
test_comparison_report_report_metrics_X_y(binary_classification_model): + """Check that `report_metrics` works with an "X_y" data source.""" + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport([estimator_report, estimator_report]) + + result = comp.metrics.report_metrics( + data_source="X_y", + X=X_train[:10], + y=y_train[:10], + ) + + expected = pd.DataFrame( + [ + [1.0, 1.0], + [1.0, 1.0], + [1.0, 1.0], + [1.0, 1.0], + [1.0, 1.0], + [0.01514976, 0.01514976], + ], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], + name="Estimator", + ), + index=pd.MultiIndex.from_tuples( + [ + ("Precision (↗︎)", 0), + ("Precision (↗︎)", 1), + ("Recall (↗︎)", 0), + ("Recall (↗︎)", 1), + ("ROC AUC (↗︎)", ""), + ("Brier score (↘︎)", ""), + ], + names=["Metric", "Label / Average"], + ), + ) + pd.testing.assert_frame_equal(result, expected) + + assert len(comp._cache) == 1 + cached_result = list(comp._cache.values())[0] + pd.testing.assert_frame_equal(cached_result, expected) + + +def test_comparison_report_custom_metric_X_y(binary_classification_model): + """Check that `custom_metric` works with an "X_y" data source.""" + from sklearn.metrics import mean_absolute_error + + estimator, X_train, X_test, y_train, y_test = binary_classification_model + estimator_report = EstimatorReport( + estimator, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + + comp = ComparisonReport([estimator_report, estimator_report]) + + expected = pd.DataFrame( + [[0.0, 0.0]], + columns=pd.Index( + ["LogisticRegression", "LogisticRegression"], name="Estimator" + ), + index=pd.Index(["MAE (↗︎)"], name="Metric"), + ) + + # ensure metric is valid + result = comp.metrics.custom_metric( + metric_function=mean_absolute_error, + response_method="predict", + metric_name="MAE (↗︎)", + data_source="X_y", + X=X_test, + y=y_test, + ) + pd.testing.assert_frame_equal(result, expected) + + # ensure metric is valid even from the cache + result = comp.metrics.custom_metric( + metric_function=mean_absolute_error, + response_method="predict", + metric_name="MAE (↗︎)", + data_source="X_y", + X=X_test, + y=y_test, + ) + pd.testing.assert_frame_equal(result, expected) diff --git a/skore/tests/unit/sklearn/test_estimator.py b/skore/tests/unit/sklearn/test_estimator.py index 4fc422b40..35877b7e3 100644 --- a/skore/tests/unit/sklearn/test_estimator.py +++ b/skore/tests/unit/sklearn/test_estimator.py @@ -1,5 +1,6 @@ import re from copy import deepcopy +from io import BytesIO from numbers import Real import joblib @@ -335,7 +336,7 @@ def test_estimator_report_cache_predictions( assert report._cache.keys() == stored_cache.keys() -def test_estimator_report_pickle(tmp_path, binary_classification_data): +def test_estimator_report_pickle(binary_classification_data): """Check that we can pickle an estimator report. 
In particular, the progress bar from rich are pickable, therefore we trigger @@ -344,7 +345,9 @@ def test_estimator_report_pickle(tmp_path, binary_classification_data): estimator, X_test, y_test = binary_classification_data report = EstimatorReport(estimator, X_test=X_test, y_test=y_test) report.cache_predictions() - joblib.dump(report, tmp_path / "report.joblib") + + with BytesIO() as stream: + joblib.dump(report, stream) def test_estimator_report_flat_index(binary_classification_data): diff --git a/sphinx/api/skore.config_context.rst b/sphinx/api/skore.config_context.rst new file mode 100644 index 000000000..b64a9f9d9 --- /dev/null +++ b/sphinx/api/skore.config_context.rst @@ -0,0 +1,10 @@ +config\_context +=============== + +.. currentmodule:: skore + +.. autofunction:: config_context + +.. minigallery:: skore.config_context + :add-heading: Gallery examples + :heading-level: - \ No newline at end of file diff --git a/sphinx/api/skore.get_config.rst b/sphinx/api/skore.get_config.rst new file mode 100644 index 000000000..0f76d783f --- /dev/null +++ b/sphinx/api/skore.get_config.rst @@ -0,0 +1,10 @@ +get\_config +=========== + +.. currentmodule:: skore + +.. autofunction:: get_config + +.. minigallery:: skore.get_config + :add-heading: Gallery examples + :heading-level: - \ No newline at end of file diff --git a/sphinx/api/skore.set_config.rst b/sphinx/api/skore.set_config.rst new file mode 100644 index 000000000..de1045f07 --- /dev/null +++ b/sphinx/api/skore.set_config.rst @@ -0,0 +1,10 @@ +set\_config +=========== + +.. currentmodule:: skore + +.. autofunction:: set_config + +.. minigallery:: skore.set_config + :add-heading: Gallery examples + :heading-level: - \ No newline at end of file diff --git a/sphinx/index.rst b/sphinx/index.rst index d09f653ec..56a592a80 100644 --- a/sphinx/index.rst +++ b/sphinx/index.rst @@ -41,9 +41,11 @@ Key features All these are computed and generated for you in 1 line of code. Under the hood, we use efficient caching to make the computations blazing fast. - - :class:`skore.CrossValidationReport`: Get a skore estimator report for each fold + - :class:`skore.CrossValidationReport`: get a skore estimator report for each fold of your cross-validation. + - :class:`skore.ComparisonReport`: benchmark your skore estimator reports. + What's next? """""""""""" diff --git a/sphinx/reference/report/comparison_report.rst b/sphinx/reference/report/comparison_report.rst new file mode 100644 index 000000000..7158cc5e1 --- /dev/null +++ b/sphinx/reference/report/comparison_report.rst @@ -0,0 +1,50 @@ +Report for a comparison of :class:`EstimatorReport` +=================================================== + +.. currentmodule:: skore + +The class :class:`ComparisonReport` provides a report allowing to compare :class:`EstimatorReport` instances in an interactive way. The functionalities of the report are accessible through accessors. + +.. autosummary:: + :toctree: ../api/ + :template: base.rst + + ComparisonReport + +.. autosummary:: + :toctree: ../api/ + :nosignatures: + :template: autosummary/accessor_method.rst + + ComparisonReport.help + +.. autosummary:: + :toctree: ../api/ + :nosignatures: + :template: autosummary/accessor.rst + + ComparisonReport.metrics + +Metrics +------- + +The `metrics` accessor helps you to evaluate the statistical performance of the +compared estimators. In addition, we provide a sub-accessor `plot`, to +get the common performance metric representations. + +.. 
autosummary:: + :toctree: ../api/ + :nosignatures: + :template: autosummary/accessor_method.rst + + ComparisonReport.metrics.help + ComparisonReport.metrics.report_metrics + ComparisonReport.metrics.custom_metric + ComparisonReport.metrics.accuracy + ComparisonReport.metrics.brier_score + ComparisonReport.metrics.log_loss + ComparisonReport.metrics.precision + ComparisonReport.metrics.r2 + ComparisonReport.metrics.recall + ComparisonReport.metrics.rmse + ComparisonReport.metrics.roc_auc diff --git a/sphinx/reference/report/index.rst b/sphinx/reference/report/index.rst index 2a87d824e..17fb6330a 100644 --- a/sphinx/reference/report/index.rst +++ b/sphinx/reference/report/index.rst @@ -42,6 +42,18 @@ scikit-learn estimators by cross-validation, and reporting the results. cross_validation_report +Comparison Report +----------------------- + +:class:`skore.ComparisonReport` provides comprehensive capabilities for comparing +:class:`skore.EstimatorReport` instances, and reporting the results. + +.. toctree:: + :maxdepth: 2 + :hidden: + + comparison_report + Visualization Displays ---------------------- From a763269f2a35123078abd4b6da69a2f56d684a1b Mon Sep 17 00:00:00 2001 From: Auguste Baum Date: Wed, 19 Feb 2025 16:22:06 +0100 Subject: [PATCH 4/9] feat(project): Add parameter to `Project.clear` to delete project (#1322) `project.clear(delete_project=True)` now deletes the entire project, while `project.clear(delete_project=False)` (the default) removes every item from the project. Closes #1294 --- skore/src/skore/project/project.py | 60 +++++++++++++++++++++++- skore/tests/conftest.py | 1 + skore/tests/unit/project/test_project.py | 16 +++++++ 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/skore/src/skore/project/project.py b/skore/src/skore/project/project.py index 4ae6b2445..ebd731e30 100644 --- a/skore/src/skore/project/project.py +++ b/skore/src/skore/project/project.py @@ -2,6 +2,8 @@ from __future__ import annotations +import functools +import shutil from collections.abc import Iterator from logging import INFO, NullHandler, getLogger from pathlib import Path @@ -16,6 +18,30 @@ logger.setLevel(INFO) +class ProjectDeletedError(Exception): + """A method of a Project was called but the Project is marked as deleted.""" + + +def _raise_if_deleted(method): + """Raise if the underlying Project has been deleted, otherwise execute `method`. + + This wrapper makes it safe to "delete" a Project, even if the Project instance + still exists. + """ + + @functools.wraps(method) + def wrapper(self, *args, **kwargs): + if self._storage_initialized is not True: + raise ProjectDeletedError( + "This Project instance is marked as deleted. " + "Please re-create a Project and discard the current one." + ) + + return method(self, *args, **kwargs) + + return wrapper + + class Project: """ A collection of items persisted in a storage. @@ -95,16 +121,38 @@ def __init__( # Initialize repositories with dedicated storages self._item_repository = ItemRepository(DiskCacheStorage(item_storage_dirpath)) + self._storage_initialized = True + # Check if the project should rejoin a server from skore.project._launch import ServerInfo # avoid circular import self._server_info = ServerInfo.rejoin(self) - def clear(self): - """Clear the project.""" + @_raise_if_deleted + def clear(self, delete_project=False): + """Remove all items from the project. + + .. warning:: + Clearing the project with `delete_project=True` will invalidate the whole + `Project` instance, making it unusable. 
+ A new Project instance can be created using the :class:`skore.Project` + constructor or the :func:`skore.open` function. + + Parameters + ---------- + delete_project : bool + If set, the project will be deleted entirely. + """ + if delete_project: + self._storage_initialized = False + del self._item_repository + shutil.rmtree(self.path) + return + for item_key in self._item_repository: self._item_repository.delete_item(item_key) + @_raise_if_deleted def put( self, key: str, @@ -150,6 +198,7 @@ def put( ), ) + @_raise_if_deleted def get( self, key: str, @@ -211,6 +260,7 @@ def dto(item): raise ValueError('`version` should be -1, "all", or an integer') + @_raise_if_deleted def keys(self) -> list[str]: """ Get all keys of items stored in the project. @@ -222,6 +272,7 @@ def keys(self) -> list[str]: """ return self._item_repository.keys() + @_raise_if_deleted def __iter__(self) -> Iterator[str]: """ Yield the keys of items stored in the project. @@ -233,6 +284,7 @@ def __iter__(self) -> Iterator[str]: """ yield from self._item_repository + @_raise_if_deleted def delete(self, key: str): """Delete the item corresponding to ``key`` from the Project. @@ -248,6 +300,7 @@ def delete(self, key: str): """ self._item_repository.delete_item(key) + @_raise_if_deleted def set_note(self, key: str, note: str, *, version=-1): """Attach a note to key ``key``. @@ -277,6 +330,7 @@ def set_note(self, key: str, note: str, *, version=-1): """ return self._item_repository.set_item_note(key=key, note=note, version=version) + @_raise_if_deleted def get_note(self, key: str, *, version=-1) -> Union[str, None]: """Retrieve a note previously attached to key ``key``. @@ -306,6 +360,7 @@ def get_note(self, key: str, *, version=-1) -> Union[str, None]: """ return self._item_repository.get_item_note(key=key, version=version) + @_raise_if_deleted def delete_note(self, key: str, *, version=-1): """Delete a note previously attached to key ``key``. 
@@ -333,6 +388,7 @@ def delete_note(self, key: str, *, version=-1): """ return self._item_repository.delete_item_note(key=key, version=version) + @_raise_if_deleted def shutdown_web_ui(self): """Shutdown the web UI server if it is running.""" if self._server_info is None: diff --git a/skore/tests/conftest.py b/skore/tests/conftest.py index bfcce2619..a7e00c375 100644 --- a/skore/tests/conftest.py +++ b/skore/tests/conftest.py @@ -43,6 +43,7 @@ def in_memory_project(monkeypatch): project.path = None project.name = "test" project._item_repository = ItemRepository(storage=InMemoryStorage()) + project._storage_initialized = True return project diff --git a/skore/tests/unit/project/test_project.py b/skore/tests/unit/project/test_project.py index 145a642a2..c5c420259 100644 --- a/skore/tests/unit/project/test_project.py +++ b/skore/tests/unit/project/test_project.py @@ -12,6 +12,7 @@ from PIL import Image from sklearn.ensemble import RandomForestClassifier from skore import Project +from skore.project.project import ProjectDeletedError @pytest.fixture(autouse=True) @@ -49,6 +50,21 @@ def test_clear(tmp_path): assert project.keys() == [] assert project._item_repository.keys() == [] + assert dirpath.exists() + + +def test_clear_delete_project(tmp_path): + dirpath = tmp_path / "my-project.skore" + project = Project(dirpath) + + project.clear(delete_project=True) + assert not dirpath.exists() + + with pytest.raises( + ProjectDeletedError, match="This Project instance is marked as deleted" + ): + project.keys() + def test_put_string_item(in_memory_project): in_memory_project.put("string_item", "Hello, World!") From 5150886230bfbd25551795eff5b18aac3b6a14ac Mon Sep 17 00:00:00 2001 From: Auguste Baum Date: Thu, 20 Feb 2025 09:12:24 +0100 Subject: [PATCH 5/9] feat: Add cache_predictions method to ComparisonReport (#1352) Closes #1346 --- skore/src/skore/sklearn/_comparison/report.py | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/skore/src/skore/sklearn/_comparison/report.py b/skore/src/skore/sklearn/_comparison/report.py index ac4c3a30f..68dd58803 100644 --- a/skore/src/skore/sklearn/_comparison/report.py +++ b/skore/src/skore/sklearn/_comparison/report.py @@ -10,6 +10,7 @@ from skore.externals._pandas_accessors import DirNamesMixin from skore.sklearn._base import _BaseReport from skore.sklearn._estimator.report import EstimatorReport +from skore.utils._progress_bar import progress_decorator class ComparisonReport(_BaseReport, DirNamesMixin): @@ -144,6 +145,9 @@ def __init__( self.estimator_reports_ = reports + # used to know if a parent launches a progress bar manager + self._parent_progress = None + # NEEDED FOR METRICS ACCESSOR self.n_jobs = n_jobs self._rng = np.random.default_rng(time.time_ns()) @@ -153,6 +157,103 @@ def __init__( self._cache = {} self._ml_task = self.estimator_reports_[0]._ml_task + def clear_cache(self): + """Clear the cache. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from skore import ComparisonReport + >>> X, y = make_classification(random_state=42) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + >>> estimator_1 = LogisticRegression() + >>> estimator_report_1 = EstimatorReport( + ... estimator_1, + ... X_train=X_train, + ... y_train=y_train, + ... X_test=X_test, + ... y_test=y_test + ... 
)
+        >>> estimator_2 = LogisticRegression(C=2)  # Different regularization
+        >>> estimator_report_2 = EstimatorReport(
+        ...     estimator_2,
+        ...     X_train=X_train,
+        ...     y_train=y_train,
+        ...     X_test=X_test,
+        ...     y_test=y_test
+        ... )
+        >>> report = ComparisonReport([estimator_report_1, estimator_report_2])
+        >>> report.cache_predictions()
+        >>> report.clear_cache()
+        >>> report._cache
+        {}
+        """
+        for report in self.estimator_reports_:
+            report.clear_cache()
+        self._cache = {}
+
+    @progress_decorator(description="Estimator predictions")
+    def cache_predictions(self, response_methods="auto", n_jobs=None):
+        """Cache the predictions for sub-estimator reports.
+
+        Parameters
+        ----------
+        response_methods : {"auto", "predict", "predict_proba", "decision_function"},\
+                default="auto"
+            The methods to use to compute the predictions.
+
+        n_jobs : int, default=None
+            The number of jobs to run in parallel. If `None`, we use the `n_jobs`
+            parameter when initializing the report.
+
+        Examples
+        --------
+        >>> from sklearn.datasets import make_classification
+        >>> from sklearn.linear_model import LogisticRegression
+        >>> from sklearn.model_selection import train_test_split
+        >>> from skore import ComparisonReport, EstimatorReport
+        >>> X, y = make_classification(random_state=42)
+        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+        >>> estimator_1 = LogisticRegression()
+        >>> estimator_report_1 = EstimatorReport(
+        ...     estimator_1,
+        ...     X_train=X_train,
+        ...     y_train=y_train,
+        ...     X_test=X_test,
+        ...     y_test=y_test
+        ... )
+        >>> estimator_2 = LogisticRegression(C=2)  # Different regularization
+        >>> estimator_report_2 = EstimatorReport(
+        ...     estimator_2,
+        ...     X_train=X_train,
+        ...     y_train=y_train,
+        ...     X_test=X_test,
+        ...     y_test=y_test
+        ... )
+        >>> report = ComparisonReport([estimator_report_1, estimator_report_2])
+        >>> report.cache_predictions()
+        >>> report._cache
+        {...}
+        """
+        if n_jobs is None:
+            n_jobs = self.n_jobs
+
+        progress = self._progress_info["current_progress"]
+        main_task = self._progress_info["current_task"]
+
+        total_estimators = len(self.estimator_reports_)
+        progress.update(main_task, total=total_estimators)
+
+        for estimator_report in self.estimator_reports_:
+            # Pass the progress manager to child tasks
+            estimator_report._parent_progress = progress
+            estimator_report.cache_predictions(
+                response_methods=response_methods, n_jobs=n_jobs
+            )
+            progress.update(main_task, advance=1, refresh=True)
+
     ####################################################################################
     # Methods related to the help and repr
     ####################################################################################

From 983106b56bacf292593e91cc350cf196a584776b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 20 Feb 2025 13:15:02 +0100
Subject: [PATCH 6/9] add flat_index to comparison report

---
 .../sklearn/_comparison/metrics_accessor.py | 19 ++++++++++---
 skore/tests/unit/sklearn/test_comparison.py | 28 +++++++++++++++++++
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/skore/src/skore/sklearn/_comparison/metrics_accessor.py b/skore/src/skore/sklearn/_comparison/metrics_accessor.py
index 7f07a3d0c..7ed812d76 100644
--- a/skore/src/skore/sklearn/_comparison/metrics_accessor.py
+++ b/skore/src/skore/sklearn/_comparison/metrics_accessor.py
@@ -7,6 +7,7 @@
 from skore.externals._pandas_accessors import DirNamesMixin
 from skore.sklearn._base import _BaseAccessor
 from skore.utils._accessor import _check_supported_ml_task
+from skore.utils._index import
flatten_multi_index from skore.utils._progress_bar import progress_decorator @@ -42,8 +43,9 @@ def report_metrics( y=None, scoring=None, scoring_names=None, - pos_label=None, scoring_kwargs=None, + pos_label=None, + flat_index=False, ): """Report a set of metrics for the estimators. @@ -77,11 +79,14 @@ def report_metrics( Used to overwrite the default scoring names in the report. It should be of the same length as the ``scoring`` parameter. + scoring_kwargs : dict, default=None + The keyword arguments to pass to the scoring functions. + pos_label : int, float, bool or str, default=None The positive class. - scoring_kwargs : dict, default=None - The keyword arguments to pass to the scoring functions. + flat_index : bool, default=False + Whether to flatten the `MultiIndex` columns. Returns ------- @@ -124,7 +129,7 @@ def report_metrics( Precision (↗︎) 0.96... 0.96... Recall (↗︎) 0.97... 0.97... """ - return self._compute_metric_scores( + results = self._compute_metric_scores( report_metric_name="report_metrics", data_source=data_source, X=X, @@ -134,6 +139,12 @@ def report_metrics( scoring_kwargs=scoring_kwargs, scoring_names=scoring_names, ) + if flat_index: + if isinstance(results.columns, pd.MultiIndex): + results.columns = flatten_multi_index(results.columns) + if isinstance(results.index, pd.MultiIndex): + results.index = flatten_multi_index(results.index) + return results @progress_decorator(description="Compute metric for each split") def _compute_metric_scores( diff --git a/skore/tests/unit/sklearn/test_comparison.py b/skore/tests/unit/sklearn/test_comparison.py index 622d06fd4..a1786e784 100644 --- a/skore/tests/unit/sklearn/test_comparison.py +++ b/skore/tests/unit/sklearn/test_comparison.py @@ -534,3 +534,31 @@ def test_comparison_report_custom_metric_X_y(binary_classification_model): y=y_test, ) pd.testing.assert_frame_equal(result, expected) + + +def test_cross_validation_report_flat_index(binary_classification_model): + """Check that the index is flattened when `flat_index` is True. + + Since `pos_label` is None, then by default a MultiIndex would be returned. + Here, we force to have a single-index by passing `flat_index=True`. 
+ """ + estimator, X_train, X_test, y_train, y_test = binary_classification_model + report_1 = EstimatorReport( + estimator, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test + ) + report_2 = EstimatorReport( + estimator, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test + ) + report = ComparisonReport({"report_1": report_1, "report_2": report_2}) + result = report.metrics.report_metrics(flat_index=True) + assert result.shape == (6, 2) + assert isinstance(result.index, pd.Index) + assert result.index.tolist() == [ + "Precision (↗︎)_0", + "Precision (↗︎)_1", + "Recall (↗︎)_0", + "Recall (↗︎)_1", + "ROC AUC (↗︎)", + "Brier score (↘︎)", + ] + assert result.columns.tolist() == ["report_1", "report_2"] From 45fc8c5b8a903544fd640901b81d64f15baf9dff Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Feb 2025 13:18:50 +0100 Subject: [PATCH 7/9] new rule for space and # --- skore/src/skore/utils/_index.py | 10 ++++++++-- skore/tests/unit/utils/test_index.py | 14 +++++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/skore/src/skore/utils/_index.py b/skore/src/skore/utils/_index.py index 1c4e4ed57..92e6ea81a 100644 --- a/skore/src/skore/utils/_index.py +++ b/skore/src/skore/utils/_index.py @@ -5,7 +5,8 @@ def flatten_multi_index(index: pd.MultiIndex) -> pd.Index: """Flatten a pandas MultiIndex into a single-level Index. Flatten a pandas `MultiIndex` into a single-level Index by joining the levels - with underscores. Empty strings are skipped when joining. + with underscores. Empty strings are skipped when joining. Spaces are replaced by + an underscore and "#" are skipped. Parameters ---------- @@ -29,4 +30,9 @@ def flatten_multi_index(index: pd.MultiIndex) -> pd.Index: if not isinstance(index, pd.MultiIndex): raise ValueError("`index` must be a MultiIndex.") - return pd.Index(["_".join(filter(bool, map(str, values))) for values in index]) + return pd.Index( + [ + "_".join(filter(bool, map(str, values))).replace(" ", "_").replace("#", "") + for values in index + ] + ) diff --git a/skore/tests/unit/utils/test_index.py b/skore/tests/unit/utils/test_index.py index 3969cb593..60e576f7f 100644 --- a/skore/tests/unit/utils/test_index.py +++ b/skore/tests/unit/utils/test_index.py @@ -24,10 +24,22 @@ pytest.param( [("a@b", "1#2"), ("c&d", "3$4")], ["letter", "number"], - ["a@b_1#2", "c&d_3$4"], + ["a@b_12", "c&d_3$4"], id="special_chars", ), pytest.param([], ["letter", "number"], [], id="empty"), + pytest.param( + [("hello world", "a b"), ("space test", "x y")], + ["text", "more"], + ["hello_world_a_b", "space_test_x_y"], + id="spaces", + ), + pytest.param( + [("a#b#c", "1#2#3"), ("x#y", "5#6")], + ["text", "numbers"], + ["abc_123", "xy_56"], + id="hash_symbols", + ), ], ) def test_flatten_multi_index(input_tuples, names, expected_values): From 1ec034bd02e4d79f5ba8d15773bfc0c2147a23b0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Feb 2025 13:26:21 +0100 Subject: [PATCH 8/9] update tests and documentation --- .../sklearn/_comparison/metrics_accessor.py | 3 ++- .../_cross_validation/metrics_accessor.py | 3 ++- .../sklearn/_estimator/metrics_accessor.py | 3 ++- skore/src/skore/utils/_index.py | 5 ++++- skore/tests/unit/sklearn/test_comparison.py | 12 ++++++------ .../unit/sklearn/test_cross_validation.py | 16 ++++++++-------- skore/tests/unit/sklearn/test_estimator.py | 12 ++++++------ skore/tests/unit/utils/test_index.py | 18 ++++++++++++------ 8 files changed, 42 insertions(+), 30 deletions(-) diff --git 
a/skore/src/skore/sklearn/_comparison/metrics_accessor.py b/skore/src/skore/sklearn/_comparison/metrics_accessor.py
index 7ed812d76..6e92a62d0 100644
--- a/skore/src/skore/sklearn/_comparison/metrics_accessor.py
+++ b/skore/src/skore/sklearn/_comparison/metrics_accessor.py
@@ -86,7 +86,8 @@ def report_metrics(
             The positive class.
 
         flat_index : bool, default=False
-            Whether to flatten the `MultiIndex` columns.
+            Whether to flatten the `MultiIndex` columns. The flat index is always
+            lowercase, without spaces or hash symbols, to ease indexing.
 
         Returns
         -------
diff --git a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py
index 87582939b..59b5f6c91 100644
--- a/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py
+++ b/skore/src/skore/sklearn/_cross_validation/metrics_accessor.py
@@ -85,7 +85,8 @@ def report_metrics(
             Function to aggregate the scores across the cross-validation splits.
 
         flat_index : bool, default=False
-            Whether to flatten the `MultiIndex` columns.
+            Whether to flatten the `MultiIndex` columns. The flat index is always
+            lowercase, without spaces or hash symbols, to ease indexing.
 
         Returns
         -------
diff --git a/skore/src/skore/sklearn/_estimator/metrics_accessor.py b/skore/src/skore/sklearn/_estimator/metrics_accessor.py
index 546d67d6f..9fae10bc6 100644
--- a/skore/src/skore/sklearn/_estimator/metrics_accessor.py
+++ b/skore/src/skore/sklearn/_estimator/metrics_accessor.py
@@ -92,7 +92,8 @@ def report_metrics(
             The positive class.
 
         flat_index : bool, default=False
-            Whether to flatten the multiindex columns.
+            Whether to flatten the multiindex columns. The flat index is always
+            lowercase, without spaces or hash symbols, to ease indexing.
Returns ------- diff --git a/skore/src/skore/utils/_index.py b/skore/src/skore/utils/_index.py index 92e6ea81a..5b6a76f66 100644 --- a/skore/src/skore/utils/_index.py +++ b/skore/src/skore/utils/_index.py @@ -32,7 +32,10 @@ def flatten_multi_index(index: pd.MultiIndex) -> pd.Index: return pd.Index( [ - "_".join(filter(bool, map(str, values))).replace(" ", "_").replace("#", "") + "_".join(filter(bool, map(str, values))) + .replace(" ", "_") + .replace("#", "") + .lower() for values in index ] ) diff --git a/skore/tests/unit/sklearn/test_comparison.py b/skore/tests/unit/sklearn/test_comparison.py index a1786e784..94add3cdf 100644 --- a/skore/tests/unit/sklearn/test_comparison.py +++ b/skore/tests/unit/sklearn/test_comparison.py @@ -554,11 +554,11 @@ def test_cross_validation_report_flat_index(binary_classification_model): assert result.shape == (6, 2) assert isinstance(result.index, pd.Index) assert result.index.tolist() == [ - "Precision (↗︎)_0", - "Precision (↗︎)_1", - "Recall (↗︎)_0", - "Recall (↗︎)_1", - "ROC AUC (↗︎)", - "Brier score (↘︎)", + "precision_(↗︎)_0", + "precision_(↗︎)_1", + "recall_(↗︎)_0", + "recall_(↗︎)_1", + "roc_auc_(↗︎)", + "brier_score_(↘︎)", ] assert result.columns.tolist() == ["report_1", "report_2"] diff --git a/skore/tests/unit/sklearn/test_cross_validation.py b/skore/tests/unit/sklearn/test_cross_validation.py index 532b2e4ac..59f8f7bb2 100644 --- a/skore/tests/unit/sklearn/test_cross_validation.py +++ b/skore/tests/unit/sklearn/test_cross_validation.py @@ -231,16 +231,16 @@ def test_cross_validation_report_flat_index(binary_classification_data): assert result.shape == (6, 2) assert isinstance(result.index, pd.Index) assert result.index.tolist() == [ - "Precision (↗︎)_0", - "Precision (↗︎)_1", - "Recall (↗︎)_0", - "Recall (↗︎)_1", - "ROC AUC (↗︎)", - "Brier score (↘︎)", + "precision_(↗︎)_0", + "precision_(↗︎)_1", + "recall_(↗︎)_0", + "recall_(↗︎)_1", + "roc_auc_(↗︎)", + "brier_score_(↘︎)", ] assert result.columns.tolist() == [ - "RandomForestClassifier_Split #0", - "RandomForestClassifier_Split #1", + "randomforestclassifier_split_0", + "randomforestclassifier_split_1", ] diff --git a/skore/tests/unit/sklearn/test_estimator.py b/skore/tests/unit/sklearn/test_estimator.py index 35877b7e3..949d627f7 100644 --- a/skore/tests/unit/sklearn/test_estimator.py +++ b/skore/tests/unit/sklearn/test_estimator.py @@ -362,12 +362,12 @@ def test_estimator_report_flat_index(binary_classification_data): assert result.shape == (6, 1) assert isinstance(result.index, pd.Index) assert result.index.tolist() == [ - "Precision (↗︎)_0", - "Precision (↗︎)_1", - "Recall (↗︎)_0", - "Recall (↗︎)_1", - "ROC AUC (↗︎)", - "Brier score (↘︎)", + "precision_(↗︎)_0", + "precision_(↗︎)_1", + "recall_(↗︎)_0", + "recall_(↗︎)_1", + "roc_auc_(↗︎)", + "brier_score_(↘︎)", ] assert result.columns.tolist() == ["RandomForestClassifier"] diff --git a/skore/tests/unit/utils/test_index.py b/skore/tests/unit/utils/test_index.py index 60e576f7f..1a305b23b 100644 --- a/skore/tests/unit/utils/test_index.py +++ b/skore/tests/unit/utils/test_index.py @@ -7,39 +7,45 @@ "input_tuples, names, expected_values", [ pytest.param( - [("a", 1), ("b", 2)], ["letter", "number"], ["a_1", "b_2"], id="basic" + [("A", 1), ("B", 2)], ["letter", "number"], ["a_1", "b_2"], id="basic" ), pytest.param( - [("a", 1, "x"), ("b", 2, "y")], + [("A", 1, "X"), ("B", 2, "Y")], ["letter", "number", "symbol"], ["a_1_x", "b_2_y"], id="multiple_levels", ), pytest.param( - [("a", None), (None, 2)], + [("A", None), (None, 2)], ["letter", "number"], 
["a_nan", "nan_2.0"], id="none_values", ), pytest.param( - [("a@b", "1#2"), ("c&d", "3$4")], + [("A@B", "1#2"), ("C&D", "3$4")], ["letter", "number"], ["a@b_12", "c&d_3$4"], id="special_chars", ), pytest.param([], ["letter", "number"], [], id="empty"), pytest.param( - [("hello world", "a b"), ("space test", "x y")], + [("Hello World", "A B"), ("Space Test", "X Y")], ["text", "more"], ["hello_world_a_b", "space_test_x_y"], id="spaces", ), pytest.param( - [("a#b#c", "1#2#3"), ("x#y", "5#6")], + [("A#B#C", "1#2#3"), ("X#Y", "5#6")], ["text", "numbers"], ["abc_123", "xy_56"], id="hash_symbols", ), + pytest.param( + [("UPPER", "CASE"), ("MiXeD", "cAsE")], + ["text", "type"], + ["upper_case", "mixed_case"], + id="case_sensitivity", + ), ], ) def test_flatten_multi_index(input_tuples, names, expected_values): From 7e58ffe9ed3939f2c4ee94d711ead8afcd956057 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Feb 2025 14:40:44 +0100 Subject: [PATCH 9/9] merge conflict --- skore/tests/unit/sklearn/test_cross_validation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/skore/tests/unit/sklearn/test_cross_validation.py b/skore/tests/unit/sklearn/test_cross_validation.py index ecab041b5..c26cae511 100644 --- a/skore/tests/unit/sklearn/test_cross_validation.py +++ b/skore/tests/unit/sklearn/test_cross_validation.py @@ -231,12 +231,12 @@ def test_cross_validation_report_flat_index(binary_classification_data): assert result.shape == (6, 2) assert isinstance(result.index, pd.Index) assert result.index.tolist() == [ - "precision_(↗︎)_0", - "precision_(↗︎)_1", - "recall_(↗︎)_0", - "recall_(↗︎)_1", - "roc_auc_(↗︎)", - "brier_score_(↘︎)", + "precision_0", + "precision_1", + "recall_0", + "recall_1", + "roc_auc", + "brier_score", ] assert result.columns.tolist() == [ "randomforestclassifier_split_0",
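Taken together, the last three patches settle the naming convention these tests encode: index levels joined with underscores, lowercased, spaces replaced by underscores, and `#` dropped. A pandas-only sketch of the resulting labels (illustrative; it re-derives the expected names without going through skore):

    # Sketch: the flattened-label convention the updated tests expect.
    import pandas as pd

    mi = pd.MultiIndex.from_tuples(
        [("RandomForestClassifier", "Split #0"), ("RandomForestClassifier", "Split #1")],
        names=["Estimator", "Split"],
    )
    flat = [
        "_".join(filter(bool, map(str, levels)))
        .replace(" ", "_")
        .replace("#", "")
        .lower()
        for levels in mi
    ]
    assert flat == ["randomforestclassifier_split_0", "randomforestclassifier_split_1"]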