From 80e682609bcce6cb01c6a547b6ed7471cf6c1199 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 17 Jan 2022 22:16:10 +0100 Subject: [PATCH 1/2] API duck-typing for n_neighbors in CNN and deprecate estimator_ --- .../_condensed_nearest_neighbour.py | 66 ++++++++++++------- .../tests/test_condensed_nearest_neighbour.py | 12 +++- 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index df4afce76..e58c4ba4f 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -6,14 +6,16 @@ # License: MIT from collections import Counter +from numbers import Integral import numpy as np from scipy.sparse import issparse -from sklearn.base import clone +from sklearn.base import clone, is_classifier from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import check_random_state, _safe_indexing +from sklearn.utils.deprecation import deprecated from ..base import BaseCleaningSampler from ...utils import Substitution @@ -58,9 +60,16 @@ class CondensedNearestNeighbour(BaseCleaningSampler): corresponds to the class labels from which to sample and the values are the number of samples to sample. + n_neighbors_ : estimator object + The validated K-nearest neighbor estimator created from `n_neighbors` parameter. + estimator_ : estimator object The validated K-nearest neighbor estimator created from `n_neighbors` parameter. + .. deprecated:: 0.10 + `estimator_` is deprecated in 0.10 and will be removed in 0.12. + Use `n_neighbors_` instead. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. 
@@ -94,18 +103,17 @@ class CondensedNearestNeighbour(BaseCleaningSampler): Examples -------- - >>> from collections import Counter # doctest: +SKIP - >>> from sklearn.datasets import fetch_mldata # doctest: +SKIP + >>> from collections import Counter + >>> from sklearn.datasets import load_breast_cancer >>> from imblearn.under_sampling import \ -CondensedNearestNeighbour # doctest: +SKIP - >>> pima = fetch_mldata('diabetes_scale') # doctest: +SKIP - >>> X, y = pima['data'], pima['target'] # doctest: +SKIP - >>> print('Original dataset shape %s' % Counter(y)) # doctest: +SKIP - Original dataset shape Counter({{1: 500, -1: 268}}) # doctest: +SKIP - >>> cnn = CondensedNearestNeighbour(random_state=42) # doctest: +SKIP - >>> X_res, y_res = cnn.fit_resample(X, y) #doctest: +SKIP - >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +SKIP - Resampled dataset shape Counter({{-1: 268, 1: 227}}) # doctest: +SKIP +CondensedNearestNeighbour + >>> X, y = load_breast_cancer(return_X_y=True) + >>> print('Original dataset shape %s' % Counter(y)) + Original dataset shape Counter({{1: 357, 0: 212}}) + >>> cnn = CondensedNearestNeighbour(random_state=42) + >>> X_res, y_res = cnn.fit_resample(X, y) + >>> print('Resampled dataset shape %s' % Counter(y_res)) + Resampled dataset shape Counter({{0: 212, 1: 50}}) """ @_deprecate_positional_args @@ -125,20 +133,20 @@ def __init__( self.n_jobs = n_jobs def _validate_estimator(self): - """Private function to create the NN estimator""" if self.n_neighbors is None: - self.estimator_ = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs) - elif isinstance(self.n_neighbors, int): - self.estimator_ = KNeighborsClassifier( + self.n_neighbors_ = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs) + elif isinstance(self.n_neighbors, Integral): + self.n_neighbors_ = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) - elif isinstance(self.n_neighbors, KNeighborsClassifier): - self.estimator_ = 
clone(self.n_neighbors) + elif is_classifier(self.n_neighbors) and hasattr( + self.n_neighbors, "n_neighbors" + ): + self.n_neighbors_ = clone(self.n_neighbors) else: raise ValueError( - f"`n_neighbors` has to be a int or an object" - f" inhereited from KNeighborsClassifier." - f" Got {type(self.n_neighbors)} instead." + "`n_neighbors` must be an integer or a KNN classifier having an " + f"attribute `n_neighbors`. Got {self.n_neighbors!r} instead." ) def _fit_resample(self, X, y): @@ -175,7 +183,7 @@ def _fit_resample(self, X, y): S_y = _safe_indexing(y, S_indices) # fit knn on C - self.estimator_.fit(C_x, C_y) + self.n_neighbors_.fit(C_x, C_y) good_classif_label = idx_maj_sample.copy() # Check each sample in S if we keep it or drop it @@ -188,7 +196,7 @@ def _fit_resample(self, X, y): # Classify on S if not issparse(x_sam): x_sam = x_sam.reshape(1, -1) - pred_y = self.estimator_.predict(x_sam) + pred_y = self.n_neighbors_.predict(x_sam) # If the prediction do not agree with the true label # append it in C_x @@ -202,12 +210,12 @@ def _fit_resample(self, X, y): C_y = _safe_indexing(y, C_indices) # fit a knn on C - self.estimator_.fit(C_x, C_y) + self.n_neighbors_.fit(C_x, C_y) # This experimental to speed up the search # Classify all the element in S and avoid to test the # well classified elements - pred_S_y = self.estimator_.predict(S_x) + pred_S_y = self.n_neighbors_.predict(S_x) good_classif_label = np.unique( np.append(idx_maj_sample, np.flatnonzero(pred_S_y == S_y)) ) @@ -224,3 +232,11 @@ def _fit_resample(self, X, y): def _more_tags(self): return {"sample_indices": True} + + @deprecated( + "`estimator_` is deprecated in version 0.10 and will be removed in version " + "0.12. Use `n_neighbors_` instead." 
+ ) + @property + def estimator_(self): + return self.n_neighbors_ diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py index 1c683e5fd..be4fa4d38 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py @@ -101,5 +101,15 @@ def test_cnn_fit_resample_with_object(): def test_cnn_fit_resample_with_wrong_object(): knn = "rnd" cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn) - with pytest.raises(ValueError, match="has to be a int or an "): + msg = "`n_neighbors` must be an integer or a KNN classifier" + with pytest.raises(ValueError, match=msg): cnn.fit_resample(X, Y) + + +def test_cnn_estimator_deprecation(): + cnn = CondensedNearestNeighbour(random_state=RND_SEED) + cnn.fit_resample(X, Y) + + msg = "`estimator_` is deprecated in version 0.10" + with pytest.warns(FutureWarning, match=msg): + assert cnn.estimator_ == cnn.n_neighbors_ From ba898fbaddb9c1ab39708d5fb4d4da09e109ff1a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 17 Jan 2022 22:25:07 +0100 Subject: [PATCH 2/2] iter --- doc/whats_new/v0.10.rst | 10 ++++++++++ .../_condensed_nearest_neighbour.py | 6 +++--- .../tests/test_condensed_nearest_neighbour.py | 5 +++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.10.rst b/doc/whats_new/v0.10.rst index 96bbd5671..a990b0b65 100644 --- a/doc/whats_new/v0.10.rst +++ b/doc/whats_new/v0.10.rst @@ -19,6 +19,12 @@ Deprecation estimator where `n_jobs` is set. :pr:`887` by :user:`Guillaume Lemaitre `. +- The fitted attribute `estimator_` in + :class:`~imblearn.under_sampling.CondensedNearestNeighbour` + has been deprecated and will be removed in 0.12. Instead, use the + `n_neighbors_` fitted attribute. 
+ :pr:`891` by :user:`Guillaume Lemaitre `. + Enhancements ............ @@ -26,3 +32,7 @@ Enhancements duck-typing. For instance, it allows to accept cuML instances. :pr:`858` by :user:`NV-jpt ` and :user:`Guillaume Lemaitre `. + +- Add support to accept compatible `KNearestNeighbors` objects that can be + cloned and have an attribute `n_neighbors`. + :pr:`891` by :user:`Guillaume Lemaitre `. diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index e58c4ba4f..ad89a7d81 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -233,9 +233,9 @@ def _fit_resample(self, X, y): def _more_tags(self): return {"sample_indices": True} - @deprecated( - "`estimator_` is deprecated in version 0.10 and will be removed in version " - "0.12. Use `n_neighbors_` instead." + @deprecated( # type: ignore + "`estimator_` is deprecated in version 0.10 and will be " + "removed in version 0.12. Use `n_neighbors_` instead." ) @property def estimator_(self): diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py index be4fa4d38..0ddb3dacf 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py @@ -113,3 +113,8 @@ def test_cnn_estimator_deprecation(): msg = "`estimator_` is deprecated in version 0.10" with pytest.warns(FutureWarning, match=msg): assert cnn.estimator_ == cnn.n_neighbors_ + + +def test_cnn_custom_knn(): + # FIXME: accept any arbitrary KNN classifier + pass