Commit dc422c5: Merge remote-tracking branch 'upstream/main' into dark

DeaMariaLeon committed Feb 17, 2025
2 parents adec057 + 033a46c
Showing 14 changed files with 57 additions and 16 deletions.
6 changes: 3 additions & 3 deletions .binder/postBuild
100644 → 100755 (file mode: now executable)

@@ -6,9 +6,9 @@ set -e
 # inside a git checkout of the scikit-learn/scikit-learn repo. This script is
 # generating notebooks from the scikit-learn python examples.
 
-if [[ ! -f /.dockerenv ]]; then
-    echo "This script was written for repo2docker and is supposed to run inside a docker container."
-    echo "Exiting because this script can delete data if run outside of a docker container."
+if [[ -z "${REPO_DIR}" ]]; then
+    echo "This script was written for repo2docker and the REPO_DIR environment variable is supposed to be set."
+    echo "Exiting because this script can delete data if run outside of a repo2docker context."
     exit 1
 fi
5 changes: 5 additions & 0 deletions doc/modules/clustering.rst

@@ -140,6 +140,11 @@ model with equal covariance per component.
 :term:`inductive` clustering methods) are not designed to be applied to new,
 unseen data.
 
+.. rubric:: Examples
+
+* :ref:`sphx_glr_auto_examples_cluster_plot_inductive_clustering.py`: An example
+  of an inductive clustering model for handling new data.
+
 .. _k_means:
 
 K-means
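The example referenced above makes a non-inductive clusterer handle unseen data by training a classifier on the cluster labels. A minimal sketch of that pattern (the estimator choices below are illustrative, not taken from the example itself):

# Inductive clustering sketch: fit a clusterer, then train a classifier
# on its labels so new points can be assigned to clusters.
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels = AgglomerativeClustering(n_clusters=3).fit_predict(X)

# The classifier generalizes the clustering to data it has never seen.
inductive = RandomForestClassifier(random_state=0).fit(X, labels)
print(inductive.predict(np.array([[0.0, 0.0], [5.0, 5.0]])))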
9 changes: 6 additions & 3 deletions sklearn/cluster/_affinity_propagation.py

@@ -398,9 +398,6 @@ class AffinityPropagation(ClusterMixin, BaseEstimator):
     Notes
     -----
-    For an example usage,
-    see :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`.
-
     The algorithmic complexity of affinity propagation is quadratic
     in the number of points.

@@ -442,6 +439,12 @@ class AffinityPropagation(ClusterMixin, BaseEstimator):
     >>> clustering.cluster_centers_
     array([[1, 2],
            [4, 2]])
+
+    For an example usage,
+    see :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`.
+
+    For a comparison of Affinity Propagation with other clustering algorithms, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
     """

     _parameter_constraints: dict = {

3 changes: 3 additions & 0 deletions sklearn/cluster/_agglomerative.py

@@ -925,6 +925,9 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
     AgglomerativeClustering()
     >>> clustering.labels_
     array([1, 1, 1, 0, 0, 0])
+
+    For a comparison of Agglomerative clustering with other clustering algorithms, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
     """

     _parameter_constraints: dict = {

3 changes: 3 additions & 0 deletions sklearn/cluster/_birch.py

@@ -483,6 +483,9 @@ class Birch(
     Birch(n_clusters=None)
     >>> brc.predict(X)
     array([0, 0, 0, 1, 1, 1])
+
+    For a comparison of the BIRCH clustering algorithm with other clustering algorithms,
+    see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
     """

     _parameter_constraints: dict = {

9 changes: 6 additions & 3 deletions sklearn/cluster/_dbscan.py

@@ -277,9 +277,6 @@ class DBSCAN(ClusterMixin, BaseEstimator):
     Notes
     -----
-    For an example, see
-    :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py`.
-
     This implementation bulk-computes all neighborhood queries, which increases
     the memory complexity to O(n.d) where d is the average number of neighbors,
     while original DBSCAN had memory complexity O(n). It may attract a higher

@@ -322,6 +319,12 @@ class DBSCAN(ClusterMixin, BaseEstimator):
     array([ 0,  0,  0,  1,  1, -1])
     >>> clustering
     DBSCAN(eps=3, min_samples=2)
+
+    For an example, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py`.
+
+    For a comparison of DBSCAN with other clustering algorithms, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
     """

     _parameter_constraints: dict = {

4 changes: 0 additions & 4 deletions sklearn/cluster/_hdbscan/hdbscan.py

@@ -427,10 +427,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
     :class:`~sklearn.cluster.DBSCAN`), and be more robust to parameter selection.
 
     Read more in the :ref:`User Guide <hdbscan>`.
 
-    For an example of how to use HDBSCAN, as well as a comparison to
-    :class:`~sklearn.cluster.DBSCAN`, please see the :ref:`plotting demo
-    <sphx_glr_auto_examples_cluster_plot_hdbscan.py>`.
-
     .. versionadded:: 1.3
 
     Parameters
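For readers following the link this hunk removes, a minimal HDBSCAN usage sketch (data and parameters here are illustrative; assumes scikit-learn >= 1.3, where HDBSCAN was added):

from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
# min_cluster_size is the main tuning knob; noise points get label -1.
labels = HDBSCAN(min_cluster_size=10).fit_predict(X)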
3 changes: 3 additions & 0 deletions sklearn/cluster/_kmeans.py

@@ -1873,6 +1873,9 @@ class MiniBatchKMeans(_BaseKMeans):
            [1.06896552, 1.        ]])
     >>> kmeans.predict([[0, 0], [4, 4]])
     array([1, 0], dtype=int32)
+
+    For a comparison of Mini-Batch K-Means clustering with other clustering algorithms,
+    see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
     """

     _parameter_constraints: dict = {

3 changes: 3 additions & 0 deletions sklearn/cluster/_mean_shift.py

@@ -432,6 +432,9 @@ class MeanShift(ClusterMixin, BaseEstimator):
     array([1, 0])
     >>> clustering
     MeanShift(bandwidth=2)
+
+    For a comparison of Mean Shift clustering with other clustering algorithms, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
     """

     _parameter_constraints: dict = {

3 changes: 3 additions & 0 deletions sklearn/cluster/_optics.py

@@ -234,6 +234,9 @@ class OPTICS(ClusterMixin, BaseEstimator):
     For a more detailed example see
     :ref:`sphx_glr_auto_examples_cluster_plot_optics.py`.
+
+    For a comparison of OPTICS with other clustering algorithms, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
     """

     _parameter_constraints: dict = {

3 changes: 3 additions & 0 deletions sklearn/cluster/_spectral.py

@@ -601,6 +601,9 @@ class SpectralClustering(ClusterMixin, BaseEstimator):
     >>> clustering
     SpectralClustering(assign_labels='discretize', n_clusters=2,
         random_state=0)
+
+    For a comparison of Spectral clustering with other clustering algorithms, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
     """

     _parameter_constraints: dict = {

4 changes: 2 additions & 2 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

@@ -1492,7 +1492,7 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
         ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin
         is always reserved for missing values. Must be no larger than 255.
 
     categorical_features : array-like of {bool, int, str} of shape (n_features) \
-            or shape (n_categorical_features,), default=None
+            or shape (n_categorical_features,), default='from_dtype'
         Indicates the categorical features.
 
         - None : no feature will be considered categorical.

@@ -1880,7 +1880,7 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
         ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin
         is always reserved for missing values. Must be no larger than 255.
 
     categorical_features : array-like of {bool, int, str} of shape (n_features) \
-            or shape (n_categorical_features,), default=None
+            or shape (n_categorical_features,), default='from_dtype'
         Indicates the categorical features.
 
         - None : no feature will be considered categorical.
15 changes: 14 additions & 1 deletion sklearn/metrics/tests/test_common.py

@@ -1611,7 +1611,7 @@ def test_multiclass_sample_weight_invariance(name):
 @pytest.mark.parametrize(
     "name",
     sorted(
-        (MULTILABELS_METRICS | THRESHOLDED_MULTILABEL_METRICS | MULTIOUTPUT_METRICS)
+        (MULTILABELS_METRICS | THRESHOLDED_MULTILABEL_METRICS)
         - METRICS_WITHOUT_SAMPLE_WEIGHT
     ),
 )

@@ -1638,6 +1638,19 @@ def test_multilabel_sample_weight_invariance(name):
     check_sample_weight_invariance(name, metric, y_true, y_pred)
 
 
+@pytest.mark.parametrize(
+    "name",
+    sorted(MULTIOUTPUT_METRICS - METRICS_WITHOUT_SAMPLE_WEIGHT),
+)
+def test_multioutput_sample_weight_invariance(name):
+    random_state = check_random_state(0)
+    y_true = random_state.uniform(0, 2, size=(20, 5))
+    y_pred = random_state.uniform(0, 2, size=(20, 5))
+
+    metric = ALL_METRICS[name]
+    check_sample_weight_invariance(name, metric, y_true, y_pred)
+
+
 def test_no_averaging_labels():
     # test labels argument when not using averaging
     # in multi-class and multi-label cases
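The property exercised by check_sample_weight_invariance is that integer weights behave like sample repetition; a self-contained illustration for one multioutput metric (the metric choice here is illustrative, not taken from the test):

import numpy as np
from sklearn.metrics import mean_squared_error

rng = np.random.RandomState(0)
y_true = rng.uniform(0, 2, size=(20, 5))
y_pred = rng.uniform(0, 2, size=(20, 5))
w = rng.randint(1, 4, size=20)

# Weighting sample i by the integer w[i] should match repeating it w[i] times.
weighted = mean_squared_error(y_true, y_pred, sample_weight=w)
repeated = mean_squared_error(
    np.repeat(y_true, w, axis=0), np.repeat(y_pred, w, axis=0)
)
assert np.isclose(weighted, repeated)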
3 changes: 3 additions & 0 deletions sklearn/mixture/_gaussian_mixture.py

@@ -693,6 +693,9 @@ class GaussianMixture(BaseMixture):
            [ 1.,  2.]])
     >>> gm.predict([[0, 0], [12, 3]])
     array([1, 0])
+
+    For a comparison of Gaussian Mixture with other clustering algorithms, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py`
     """

     _parameter_constraints: dict = {

