From 653289dd2901a6dfe80e00754da9fbe63cf67db6 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 28 May 2024 08:09:28 +0200 Subject: [PATCH 01/75] [enhancement, testing] add initial array_api testing capabilities using array-api-strict (#1800) * Update requirements-test.txt * Update _dataframes_support.py * Update _dataframes_support.py * Update _dataframes_support.py * Update _dataframes_support.py * Update _dataframes_support.py * formatting * isort * formatting * Update requirements-test.txt * Update conftest.py * formatting * Update onedal/tests/utils/_dataframes_support.py Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com> * Update requirements-test.txt --------- Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com> --- onedal/tests/utils/_dataframes_support.py | 51 +++++++++++++++++++---- requirements-test.txt | 1 + sklearnex/conftest.py | 12 +++++- 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/onedal/tests/utils/_dataframes_support.py b/onedal/tests/utils/_dataframes_support.py index 23632bb249..cfc40ae021 100644 --- a/onedal/tests/utils/_dataframes_support.py +++ b/onedal/tests/utils/_dataframes_support.py @@ -15,6 +15,7 @@ # =============================================================================== import pytest +from sklearn import get_config try: import dpctl @@ -31,6 +32,24 @@ except ImportError: dpnp_available = False +try: + # This should be lazy imported in the + # future along with other popular + # array_api libraries when testing + # GPU-no-copy. + import array_api_strict + + # Run check if "array_api_dispatch" is configurable + array_api_enabled = lambda: get_config()["array_api_dispatch"] + array_api_enabled() + array_api_modules = {"array_api": array_api_strict} + + +except (ImportError, KeyError): + array_api_enabled = lambda: False + array_api_modules = {} + + import numpy as np import pandas as pd @@ -41,6 +60,7 @@ def get_dataframes_and_queues( dataframe_filter_="numpy,pandas,dpnp,dpctl", device_filter_="cpu,gpu" ): dataframes_and_queues = [] + if "numpy" in dataframe_filter_: dataframes_and_queues.append(pytest.param("numpy", None, id="numpy")) if "pandas" in dataframe_filter_: @@ -57,6 +77,9 @@ def get_df_and_q(dataframe: str): dataframes_and_queues.extend(get_df_and_q("dpctl")) if dpnp_available and "dpnp" in dataframe_filter_: dataframes_and_queues.extend(get_df_and_q("dpnp")) + if "array_api" in dataframe_filter_ or array_api_enabled(): + dataframes_and_queues.append(pytest.param("array_api", None, id="array_api")) + return dataframes_and_queues @@ -73,12 +96,12 @@ def _as_numpy(obj, *args, **kwargs): def _convert_to_dataframe(obj, sycl_queue=None, target_df=None, *args, **kwargs): if target_df is None: return obj - # Numpy ndarray. - # `sycl_queue` arg is ignored. - if target_df == "numpy": + elif target_df == "numpy": + # Numpy ndarray. + # `sycl_queue` arg is ignored. return np.asarray(obj, *args, **kwargs) # Pandas Dataframe - if target_df == "pandas": + elif target_df == "pandas": if ( "dtype" in kwargs and hasattr(obj, "astype") @@ -91,11 +114,25 @@ def _convert_to_dataframe(obj, sycl_queue=None, target_df=None, *args, **kwargs) else: return pd.DataFrame(obj, *args, **kwargs) # DPNP ndarray. - if target_df == "dpnp": + elif target_df == "dpnp": return dpnp.asarray( obj, usm_type="device", sycl_queue=sycl_queue, *args, **kwargs ) - # DPCtl tensor. - if target_df == "dpctl": + elif target_df == "dpctl": + # DPCtl tensor. 
return dpt.asarray(obj, usm_type="device", sycl_queue=sycl_queue, *args, **kwargs) + elif target_df in array_api_modules: + # use dpctl to define gpu devices via queues and + # move data to the device. This is necessary as + # the standard for defining devices is + # purposefully not defined in the array_api + # standard, but maintaining data on a device + # using the method `from_dlpack` is. + xp = array_api_modules[target_df] + return xp.from_dlpack( + _convert_to_dataframe( + obj, sycl_queue=sycl_queue, target_df="dpctl", *args, **kwargs + ) + ) + raise RuntimeError("Unsupported dataframe conversion") diff --git a/requirements-test.txt b/requirements-test.txt index 51f1689494..448e73a53b 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -12,3 +12,4 @@ lightgbm==4.3.0 catboost==1.2.5 shap==0.44.1 ; python_version == '3.8' shap==0.45.1 ; python_version >= '3.9' +array-api-strict==1.1.1 ; python_version >= '3.9' diff --git a/sklearnex/conftest.py b/sklearnex/conftest.py index 20d1ace0ee..b70165dfe9 100644 --- a/sklearnex/conftest.py +++ b/sklearnex/conftest.py @@ -19,7 +19,8 @@ import pytest -from sklearnex import patch_sklearn, unpatch_sklearn +from daal4py.sklearn._utils import sklearn_check_version +from sklearnex import config_context, patch_sklearn, unpatch_sklearn def pytest_configure(config): @@ -61,3 +62,12 @@ def with_sklearnex(): patch_sklearn() yield unpatch_sklearn() + + +@pytest.fixture +def with_array_api(): + if sklearn_check_version("1.2"): + with config_context(array_api_dispatch=True): + yield + else: + yield From 16e7ded84eaec32a4547709335995358545b860a Mon Sep 17 00:00:00 2001 From: Maria Petrova Date: Tue, 28 May 2024 17:29:10 +0200 Subject: [PATCH 02/75] Update requests version (#1847) --- requirements-doc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-doc.txt b/requirements-doc.txt index 1469a2e8c2..07bd79512e 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -47,7 +47,7 @@ python-dateutil==2.8.2 pytz==2024.1 PyYAML==6.0.1 pyzmq==26.0.0 -requests==2.31.0 +requests==2.32.0 six==1.16.0 snowballstemmer==2.2.0 soupsieve==2.5 From e1e13c67d3f1ba006a8340129ee5b1ec70cdce3b Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 28 May 2024 22:30:48 +0200 Subject: [PATCH 03/75] [enhancement] block use of sklearn_check_version in `onedal/` (#1829) * initial version * Update forest.py * Update _forest.py * Update forest.py * Update forest.py * remove is_classification and format * updates * missing saves * missing imports * remove vestigial code * CI fixes * make pretty * remove sklearn_check_version * formatting * fix issues * remove unnecessary testing done in sklearns' test_common.py * formatting * Update svm.py * Update _common.py * Update _common.py * interim fixes * fix self.gamma * fixes * further fixes * attempts at fixing probability checks * fix issues with queue passing * modify to remove queue * formatting * remove validate_targets * Revert "remove validate_targets" This reverts commit 154f289d41c47029ac4aaf447db1137a2548b9b7. 
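For orientation on the array-api testing added in PATCH 01 above: the new "array_api" branch hands data to the strict namespace through the DLPack protocol, since the array API standard deliberately leaves device allocation unspecified but does standardize from_dlpack. A minimal CPU-only sketch of that exchange, assuming numpy >= 1.23 and array-api-strict are installed (the helpers in _dataframes_support.py add the dpctl/sycl_queue handling on top of this):

    import numpy as np
    import array_api_strict as xp

    x_np = np.arange(6.0).reshape(2, 3)
    x_xp = xp.from_dlpack(x_np)  # zero-copy hand-off via the __dlpack__ protocol
    assert x_xp.shape == (2, 3)  # array API arrays expose the standard attributes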
* attempt to not touch X and y * only don't modify y * deal with class_weight_ * fix gpu offload for probabilty=true * unnecessary check * correct float32 issues * readd max features check * simplify * add check array for sample_weights * further fix _check_array * fix test miss * remove unnecessary check * fix imports * fixes * update import * fixes * fix mistake * move class_weights back to onedal * hopefully last fix * fix logic * Update svm.py * Update svm.py * Update _common.py * Update nusvc.py --- onedal/tests/test_common.py | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 onedal/tests/test_common.py diff --git a/onedal/tests/test_common.py b/onedal/tests/test_common.py new file mode 100644 index 0000000000..2746af802a --- /dev/null +++ b/onedal/tests/test_common.py @@ -0,0 +1,41 @@ +# ============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +from glob import glob + +import pytest + + +def test_sklearn_check_version_ban(): + """This test blocks the use of sklearn_check_version + in onedal files. The versioning should occur in the + sklearnex package for clarity and maintainability. + """ + from onedal import __file__ as loc + + path = loc.replace("__init__.py", "") + files = [y for x in os.walk(path) for y in glob(os.path.join(x[0], "*.py"))] + + output = [] + + for f in files: + if open(f, "r").read().find("sklearn_check_version") != -1: + output += [f.replace(path, "onedal" + os.sep)] + + # remove this file from the list + output = "\n".join([i for i in output if "test_common.py" not in i]) + assert output == "", f"sklearn versioning is occuring in: \n{output}" From d5ff95ad7e377b62e38c0aaebf95ce5c442a4028 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 28 May 2024 22:31:24 +0200 Subject: [PATCH 04/75] [enhancement] block use of target_offload in `/sklearnex` (#1839) * target_offload ban * unblock svc --- sklearnex/tests/test_common.py | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 sklearnex/tests/test_common.py diff --git a/sklearnex/tests/test_common.py b/sklearnex/tests/test_common.py new file mode 100644 index 0000000000..b035a3e9a2 --- /dev/null +++ b/sklearnex/tests/test_common.py @@ -0,0 +1,48 @@ +# ============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +from glob import glob + +import pytest + +ALLOWED_LOCATIONS = ["_config.py", "_device_offload.py", "test", "svc.py"] + + +def test_target_offload_ban(): + """This test blocks the use of target_offload in + in sklearnex files. Offloading computation to devices + via target_offload should only occur externally, and not + within the architecture of the sklearnex classes. This + is for clarity, traceability and maintainability. + """ + from sklearnex import __file__ as loc + + path = loc.replace("__init__.py", "") + files = [y for x in os.walk(path) for y in glob(os.path.join(x[0], "*.py"))] + + output = [] + + for f in files: + if open(f, "r").read().find("target_offload") != -1: + output += [f.replace(path, "sklearnex" + os.sep)] + + # remove this file from the list + for allowed in ALLOWED_LOCATIONS: + output = [i for i in output if allowed not in i] + + output = "\n".join(output) + assert output == "", f"sklearn versioning is occuring in: \n{output}" From c67fc705edeb8814f671e63d55e755c63d617360 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 29 May 2024 10:13:37 +0200 Subject: [PATCH 05/75] [bug, testing] fix `sklearnex/` target_offload ban (#1849) * Update test_common.py * Update test_common.py * formatting * fix windows --- sklearnex/tests/test_common.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearnex/tests/test_common.py b/sklearnex/tests/test_common.py index b035a3e9a2..311c1cd285 100644 --- a/sklearnex/tests/test_common.py +++ b/sklearnex/tests/test_common.py @@ -19,7 +19,13 @@ import pytest -ALLOWED_LOCATIONS = ["_config.py", "_device_offload.py", "test", "svc.py"] +ALLOWED_LOCATIONS = [ + "_config.py", + "_device_offload.py", + "test", + "svc.py", + "svm" + os.sep + "_common.py", +] def test_target_offload_ban(): From 1257ce5300a85de9865109a3c57864566af6fcbb Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Thu, 30 May 2024 09:08:46 -0700 Subject: [PATCH 06/75] DEPS: numpy 2.0 support (#1817) * update numpy for investigation * 0b1 version * update to rc2 * numpy 2.0 migration * same adjustment for daal4py.cpp * test requirements compatibility updates * shap/numba workaround * update ComplexWarning import path * conditional import * lint * update VisibleDeprecationWarning import path * lint * conditional import * catboost condition * deselect public CI fail * deselect public CI tests in public CI... 
* typo * trying mismatching build/test versions * return to previous * build with latest supported * test versions exact * revert previous * expanding numpy validation for py3.8 * revert previous * update import path conditional logic * more flexible numpy 2.0 --- .ci/scripts/run_sklearn_tests.sh | 1 + daal4py/sklearn/utils/validation.py | 9 ++++++++- dependencies-dev | 6 ++---- deselected_tests.yaml | 2 ++ onedal/utils/validation.py | 16 ++++++++++++---- requirements-test.txt | 10 ++++++---- src/daal4py.cpp | 6 +++++- src/npy4daal.h | 16 +++++++++++----- 8 files changed, 47 insertions(+), 19 deletions(-) diff --git a/.ci/scripts/run_sklearn_tests.sh b/.ci/scripts/run_sklearn_tests.sh index 1ff00d24ca..2c9234627b 100755 --- a/.ci/scripts/run_sklearn_tests.sh +++ b/.ci/scripts/run_sklearn_tests.sh @@ -25,6 +25,7 @@ cd $ci_dir # ('all' - special value to run all tests) export SELECTED_TESTS=${SELECTED_TESTS:-$(python scripts/select_sklearn_tests.py)} +export DESELECT_FLAGS="--public ${DESELECT_FLAGS}" if [ -n "${SKLEARNEX_PREVIEW}" ]; then export DESELECT_FLAGS="--preview ${DESELECT_FLAGS}" fi diff --git a/daal4py/sklearn/utils/validation.py b/daal4py/sklearn/utils/validation.py index 44bdf0adfd..260b5e2c61 100644 --- a/daal4py/sklearn/utils/validation.py +++ b/daal4py/sklearn/utils/validation.py @@ -19,7 +19,14 @@ import numpy as np import scipy.sparse as sp -from numpy.core.numeric import ComplexWarning + +if np.lib.NumpyVersion(np.__version__) >= np.lib.NumpyVersion("2.0.0a0"): + # numpy_version >= 2.0 + from numpy.exceptions import ComplexWarning +else: + # numpy_version < 2.0 + from numpy.core.numeric import ComplexWarning + from sklearn import get_config as _get_config from sklearn.utils.extmath import _safe_accumulator_op from sklearn.utils.fixes import _object_dtype_isnan diff --git a/dependencies-dev b/dependencies-dev index 162eaa86a8..d30c78447a 100644 --- a/dependencies-dev +++ b/dependencies-dev @@ -1,8 +1,6 @@ Cython==3.0.10 Jinja2==3.1.4 -numpy==1.19.5 ; python_version <= '3.9' -numpy==1.21.6 ; python_version == '3.10' -numpy==1.23.5 ; python_version == '3.11' -numpy==1.26.4 ; python_version >= '3.12' +numpy==1.19.5 ; python_version < '3.9' +numpy==2.0.0rc2 ; python_version >= '3.9' pybind11==2.12.0 cmake==3.29.3 diff --git a/deselected_tests.yaml b/deselected_tests.yaml index eeee9c9b4f..4a29bfa487 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -452,6 +452,8 @@ public: # Failed in stock scikit-learn - metrics/tests/test_common.py::test_not_symmetric_metric[precision_recall_curve] - metrics/tests/test_common.py::test_binary_sample_weight_invariance[precision_recall_curve] + # Fails from numpy 2.0 and sklearn 1.4+ + - neighbors/tests/test_neighbors.py::test_KNeighborsClassifier_raise_on_all_zero_weights # -------------------------------------------------------- # The following tests currently fail with GPU offload diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index f33a754db9..bc9f07db4e 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -20,6 +20,14 @@ import numpy as np from scipy import sparse as sp + +if np.lib.NumpyVersion(np.__version__) >= np.lib.NumpyVersion("2.0.0a0"): + # numpy_version >= 2.0 + from numpy.exceptions import VisibleDeprecationWarning +else: + # numpy_version < 2.0 + from numpy import VisibleDeprecationWarning + from sklearn.preprocessing import LabelEncoder from sklearn.utils.validation import check_array @@ -233,10 +241,10 @@ def _type_of_target(y): # DeprecationWarning will be replaced 
by ValueError, see NEP 34 # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html with warnings.catch_warnings(): - warnings.simplefilter("error", np.VisibleDeprecationWarning) + warnings.simplefilter("error", VisibleDeprecationWarning) try: y = np.asarray(y) - except np.VisibleDeprecationWarning: + except VisibleDeprecationWarning: # dtype=object should be provided explicitly for ragged arrays, # see NEP 34 y = np.asarray(y, dtype=object) @@ -290,10 +298,10 @@ def _is_multilabel(y): # DeprecationWarning will be replaced by ValueError, see NEP 34 # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html with warnings.catch_warnings(): - warnings.simplefilter("error", np.VisibleDeprecationWarning) + warnings.simplefilter("error", VisibleDeprecationWarning) try: y = np.asarray(y) - except np.VisibleDeprecationWarning: + except VisibleDeprecationWarning: # dtype=object should be provided explicitly for ragged arrays, # see NEP 34 y = np.array(y, dtype=object) diff --git a/requirements-test.txt b/requirements-test.txt index 448e73a53b..3a1070b36a 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -2,14 +2,16 @@ pytest==7.4.4 ; python_version == '3.8' pytest==8.2.1 ; python_version >= '3.9' numpy>=1.19.5 ; python_version <= '3.9' numpy>=1.21.6 ; python_version == '3.10' -numpy>=1.23.5 ; python_version >= '3.11' +numpy>=1.23.5 ; python_version == '3.11' +numpy>=2.0.0rc2 ; python_version >= '3.12' scikit-learn==1.2.2 ; python_version == '3.8' scikit-learn==1.4.2 ; python_version >= '3.9' pandas==2.0.3 ; python_version == '3.8' -pandas==2.1.3 ; python_version >= '3.9' +pandas==2.1.3 ; python_version >= '3.9' and python_version < '3.11' +pandas==2.2.2 ; python_version >= '3.11' xgboost==2.0.3 lightgbm==4.3.0 -catboost==1.2.5 +catboost==1.2.5 ; python_version < '3.12' # TODO: Remove 3.12 condition when catboost supports numpy 2.0 shap==0.44.1 ; python_version == '3.8' -shap==0.45.1 ; python_version >= '3.9' +shap==0.45.1 ; python_version >= '3.9' and python_version < '3.12' # TODO: Remove 3.12 condition when shap/numba support numpy 2.0 array-api-strict==1.1.1 ; python_version >= '3.9' diff --git a/src/daal4py.cpp b/src/daal4py.cpp index 60e15f1ffb..61f0f61742 100755 --- a/src/daal4py.cpp +++ b/src/daal4py.cpp @@ -23,6 +23,10 @@ #include "npy4daal.h" #include "daal4py_defines.h" +#if NPY_ABI_VERSION < 0x02000000 + #define PyDataType_NAMES(descr) ((descr)->names) +#endif + // ************************************************************************************ // ************************************************************************************ // Numpy type conversion code, taken from numpy.i (SWIG typemap-code) @@ -352,7 +356,7 @@ static daal::data_management::NumericTablePtr _make_npynt(PyObject * nda) else if (array_numdims(nda) == 1) { PyArray_Descr * descr = PyArray_DESCR(array); - if (descr->names) + if (PyDataType_NAMES(descr)) { // the given array is a structured numpy array. 
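            // PyDataType_NAMES(descr) is the accessor NumPy 2.x exposes for the field
            // names of a structured dtype; the NPY_ABI_VERSION shim near the top of this
            // file maps it back to the old descr->names member when building against a
            // NumPy 1.x ABI, so this branch behaves the same on both ABI generations.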
ptr = new NpyNumericTable(array); diff --git a/src/npy4daal.h b/src/npy4daal.h index 71c2c58a13..181dcc5689 100644 --- a/src/npy4daal.h +++ b/src/npy4daal.h @@ -29,6 +29,12 @@ #define PyInt_AsSsize_t PyLong_AsSsize_t #endif +#if NPY_ABI_VERSION < 0x02000000 + #define PyDataType_ELSIZE(descr) ((descr)->elsize) + #define PyDataType_NAMES(descr) ((descr)->names) + #define PyDataType_FIELDS(descr) ((descr)->fields) +#endif + #define SET_NPY_FEATURE( _T, _M, _E ) \ switch(_T) { \ @@ -200,7 +206,7 @@ class NpyNonContigHandler // The location of the inner loop size which the iterator may update npy_intp * innersizeptr = NpyIter_GetInnerLoopSizePtr(iter); - if(NpyIter_GetDescrArray(iter)[0]->elsize != sizeof(T)) { + if(PyDataType_ELSIZE(NpyIter_GetDescrArray(iter)[0]) != sizeof(T)) { NpyIter_Deallocate(iter); PyGILState_Release(__state); throw std::invalid_argument("Encountered unexpected element size or type when copying block."); @@ -254,7 +260,7 @@ class NpyStructHandler // e.g. each element is a tuple. PyArray_Descr * descr = PyArray_DESCR(ary); // type descriptor - if(!descr->names) { + if(!PyDataType_NAMES(descr)) { throw std::invalid_argument("No dtype argument provided. Unable to create AOSNumericTable."); } if(PyArray_NDIM(ary) != 1) { @@ -263,7 +269,7 @@ class NpyStructHandler + std::string(" dimensions, extected 1 for a strctured array. Don't know how to create NumericTable.")); } - PyObject * fnames = PySequence_Fast(descr->names, NULL); // list of names of tuple-elements + PyObject * fnames = PySequence_Fast(PyDataType_NAMES(descr), NULL); // list of names of tuple-elements Py_ssize_t N = PySequence_Fast_GET_SIZE(fnames); // number of elements in tuple auto _ddict = daal::data_management::NumericTableDictionaryPtr(new daal::data_management::NumericTableDictionary(N)); @@ -272,7 +278,7 @@ class NpyStructHandler // get their type and init ddict feature accordingly for (Py_ssize_t i=0; ifields, name); // desr->fields is a dict + PyObject * ftr = PyObject_GetItem(PyDataType_FIELDS(descr), name); // PyDataType_FIELDS(descr) is a dict if(!PyTuple_Check(ftr)) { throw std::invalid_argument(std::string("Found invalid dtype in structured numpy array, expected tuple, got ") + std::string(PyString_AsString(PyObject_Str(PyObject_Type(ftr))))); @@ -298,7 +304,7 @@ class NpyStructHandler { auto __state = PyGILState_Ensure(); // tuple elements are identified by name, need the list of names - PyObject * fnames = PySequence_Fast(PyArray_DESCR(ary)->names, NULL); + PyObject * fnames = PySequence_Fast(PyDataType_NAMES(PyArray_DESCR(ary)), NULL); for( size_t j = 0; j < ncols ; ++j ) { PyObject * name = PySequence_Fast_GET_ITEM(fnames, j); // get column by name From a1d36e454bacb3c6a60957161c68372eded630e2 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 4 Jun 2024 15:29:45 +0100 Subject: [PATCH 07/75] Remove main conda channel from Nightly and Release (#1841) * Remove main conda channel from Nightly and Release * Change the way of main conda channel removal --- .ci/pipeline/nightly.yml | 4 +++- .ci/pipeline/release.yml | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.ci/pipeline/nightly.yml b/.ci/pipeline/nightly.yml index 0f37f2acb9..f425e14787 100644 --- a/.ci/pipeline/nightly.yml +++ b/.ci/pipeline/nightly.yml @@ -61,8 +61,10 @@ jobs: vmImage: 'ubuntu-latest' steps: - script: | + conda config --append channels conda-forge + conda config --remove channels defaults conda update -y -q conda - conda create -y -q -n CB -c intel python=$(python.version) dal-devel 
impi-devel + conda create -y -q -n CB -c intel -c conda-forge python=$(python.version) dal-devel impi-devel displayName: 'Conda create' - script: | bash .ci/scripts/describe_system.sh diff --git a/.ci/pipeline/release.yml b/.ci/pipeline/release.yml index 0dedeabdec..c4f611ddb8 100644 --- a/.ci/pipeline/release.yml +++ b/.ci/pipeline/release.yml @@ -45,7 +45,7 @@ jobs: displayName: 'Sklearn testing' - job: GeneratorConda steps: - - bash: python .ci/scripts/gen_release_jobs.py --channels main intel conda-forge + - bash: python .ci/scripts/gen_release_jobs.py --channels intel conda-forge name: MatrixGen - job: ReleaseConda dependsOn: GeneratorConda @@ -68,7 +68,9 @@ jobs: condition: eq( variables['Agent.OS'], 'Darwin') displayName: Add sudo access - script: | - conda update -y -q -c defaults conda + conda config --append channels conda-forge + conda config --remove channels defaults + conda update -y -q conda conda create -y -q -n CB -c $(conda.channel) python=$(python.version) scikit-learn-intelex pandas pytest pyyaml displayName: 'Install scikit-learn-intelex' - script: | From 56b10f2f30b2339042099382f1a8400c00acb2b4 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Tue, 4 Jun 2024 14:17:55 -0700 Subject: [PATCH 08/75] Update dependency pytest to v8.2.2 (#1852) * Update dependency pytest to v8.2.2 * Update requirements-test.txt --------- Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com> --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 3a1070b36a..cc70aa05b0 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,5 @@ pytest==7.4.4 ; python_version == '3.8' -pytest==8.2.1 ; python_version >= '3.9' +pytest==8.2.2 ; python_version >= '3.9' numpy>=1.19.5 ; python_version <= '3.9' numpy>=1.21.6 ; python_version == '3.10' numpy>=1.23.5 ; python_version == '3.11' From d4e650d466da379ac12a646dd1e86d204cdbc6b0 Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Tue, 4 Jun 2024 21:38:39 -0700 Subject: [PATCH 09/75] CI: deselect failing extratrees tests (#1853) * CI: deselect failing extratrees tests * lint * oops * one more --- deselected_tests.yaml | 6 ++++++ sklearnex/ensemble/tests/test_forest.py | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 4a29bfa487..0dd53be456 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -727,6 +727,12 @@ gpu: - tests/test_common.py::test_estimators[ExtraTreesClassifier()-check_class_weight_classifiers] - tests/test_common.py::test_estimators[ExtraTreesRegressor()-check_sample_weights_invariance(kind=zeros)] - tests/test_common.py::test_estimators[RandomForestRegressor()-check_regressor_data_not_an_array] + - ensemble/tests/test_forest.py::test_max_samples_boundary_classifiers[ExtraTreesClassifier] + - tests/test_common.py::test_estimators[ExtraTreesClassifier()-check_classifier_data_not_an_array] + - tests/test_common.py::test_estimators[ExtraTreesClassifier()-check_classifiers_train] + - tests/test_common.py::test_estimators[ExtraTreesClassifier()-check_classifiers_train(readonly_memmap=True)] + - tests/test_common.py::test_estimators[ExtraTreesClassifier()-check_fit_idempotent] + - tests/test_common.py::test_estimators[ExtraTreesRegressor()-check_fit_idempotent] # 
GPU implementation of Extra Trees doesn't support sample_weights # comparisons to GPU with sample weights will use different algorithms diff --git a/sklearnex/ensemble/tests/test_forest.py b/sklearnex/ensemble/tests/test_forest.py index 80dec7dce7..a1d30b4d93 100644 --- a/sklearnex/ensemble/tests/test_forest.py +++ b/sklearnex/ensemble/tests/test_forest.py @@ -86,7 +86,10 @@ def test_sklearnex_import_et_classifier(dataframe, queue): assert_allclose([1], _as_numpy(rf.predict([[0, 0, 0, 0]]))) -@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +# TODO: fix ET regressor predict for the GPU sycl_queue. +@pytest.mark.parametrize( + "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") +) def test_sklearnex_import_et_regression(dataframe, queue): from sklearnex.ensemble import ExtraTreesRegressor From c354eba8d106f95deab09fba12649c0b71f4a677 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 5 Jun 2024 15:20:12 +0100 Subject: [PATCH 10/75] Add sklearn 1.5 to CI matrix (#1842) * Add sklearn 1.5 to CI * Deselect PCA solver selection tests * Update LogisticRegression parameter * Remove py38 from CI matrix * Fix for global patch: add temp. _threadpool_controller in dispatcher * Change pytest versions * Update test reqs * Remove unnecessary check * Set skl 1.0 py310 in CI matrix and revert some previous changes * Change sklearn 1.0 python version * Skip dpctl/dpnp install for sklearn 1.0 * CHange pytest versions --- .ci/pipeline/build-and-test-lnx.yml | 2 +- .ci/pipeline/ci.yml | 14 ++++++++++---- deselected_tests.yaml | 5 +++++ requirements-test.txt | 6 +++--- sklearnex/glob/dispatcher.py | 18 ++++++++++++++++-- sklearnex/linear_model/logistic_regression.py | 2 +- 6 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml index a31c17d4c1..4aa56b4e72 100644 --- a/.ci/pipeline/build-and-test-lnx.yml +++ b/.ci/pipeline/build-and-test-lnx.yml @@ -46,7 +46,7 @@ steps: bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION) pip install --upgrade -r requirements-test.txt pip install $(python .ci/scripts/get_compatible_scipy_version.py) - if [ $(echo $(PYTHON_VERSION) | grep '3.9\|3.10') ]; then conda install -q -y -c intel dpctl=0.16.0 dpnp=0.14.0; fi + if [ $(echo $(PYTHON_VERSION) | grep '3.9\|3.10') ] && [ $(SKLEARN_VERSION) != "1.0" ]; then conda install -q -y -c intel dpctl=0.16.0 dpnp=0.14.0; fi pip list displayName: "Install testing requirements" - script: | diff --git a/.ci/pipeline/ci.yml b/.ci/pipeline/ci.yml index 42613ddc59..3a71b3323e 100644 --- a/.ci/pipeline/ci.yml +++ b/.ci/pipeline/ci.yml @@ -60,8 +60,8 @@ jobs: timeoutInMinutes: 120 strategy: matrix: - Python3.8_Sklearn1.0: - PYTHON_VERSION: '3.8' + Python3.9_Sklearn1.0: + PYTHON_VERSION: '3.9' SKLEARN_VERSION: '1.0' Python3.9_Sklearn1.1: PYTHON_VERSION: '3.9' @@ -75,6 +75,9 @@ jobs: Python3.12_Sklearn1.4: PYTHON_VERSION: '3.12' SKLEARN_VERSION: '1.4' + Python3.12_Sklearn1.5: + PYTHON_VERSION: '3.12' + SKLEARN_VERSION: '1.5' pool: vmImage: 'ubuntu-22.04' steps: @@ -84,8 +87,8 @@ jobs: timeoutInMinutes: 120 strategy: matrix: - Python3.8_Sklearn1.0: - PYTHON_VERSION: '3.8' + Python3.9_Sklearn1.0: + PYTHON_VERSION: '3.9' SKLEARN_VERSION: '1.0' Python3.9_Sklearn1.1: PYTHON_VERSION: '3.9' @@ -99,6 +102,9 @@ jobs: Python3.12_Sklearn1.4: PYTHON_VERSION: '3.12' SKLEARN_VERSION: '1.4' + Python3.12_Sklearn1.5: + PYTHON_VERSION: '3.12' + SKLEARN_VERSION: '1.5' pool: vmImage: 'windows-latest' steps: diff --git 
a/deselected_tests.yaml b/deselected_tests.yaml index 0dd53be456..cd153b1f71 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -29,6 +29,11 @@ deselected_tests: - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-kulsinski] <1.3 - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[kulsinski] <1.3 + # sklearnex PCA always chooses "covariance_eigh" solver instead of "full" when solver="auto" + # resulting in solver assignment check failure for sklearn version >= 1.5 + - decomposition/tests/test_pca.py::test_pca_svd_solver_auto[1000-500-400-full] >=1.5 + - decomposition/tests/test_pca.py::test_pca_svd_solver_auto[1000-500-0.5-full] >=1.5 + # test for KMeans FutureWarning is not removed from sklearn tests suit yet - cluster/tests/test_k_means.py::test_change_n_init_future_warning[KMeans-10] ==1.4.dev0 diff --git a/requirements-test.txt b/requirements-test.txt index cc70aa05b0..247ab2be08 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,11 +1,11 @@ -pytest==7.4.4 ; python_version == '3.8' -pytest==8.2.2 ; python_version >= '3.9' +pytest==7.4.4 ; python_version <= '3.9' +pytest==8.2.2 ; python_version >= '3.10' numpy>=1.19.5 ; python_version <= '3.9' numpy>=1.21.6 ; python_version == '3.10' numpy>=1.23.5 ; python_version == '3.11' numpy>=2.0.0rc2 ; python_version >= '3.12' scikit-learn==1.2.2 ; python_version == '3.8' -scikit-learn==1.4.2 ; python_version >= '3.9' +scikit-learn==1.5.0 ; python_version >= '3.9' pandas==2.0.3 ; python_version == '3.8' pandas==2.1.3 ; python_version >= '3.9' and python_version < '3.11' pandas==2.2.2 ; python_version >= '3.11' diff --git a/sklearnex/glob/dispatcher.py b/sklearnex/glob/dispatcher.py index a78586f39f..47967d63e3 100755 --- a/sklearnex/glob/dispatcher.py +++ b/sklearnex/glob/dispatcher.py @@ -17,18 +17,32 @@ def get_patch_str(name=None, verbose=True): return f"""try: + # TEMP. FIX: sklearnex.patch_sklearn imports sklearn beforehand + # when it didn't initialized _threadpool_controller required for + # pairwise distances dispatching during imports. + # Manually setting and deleting _threadpool_controller during patch fixes it. + import sklearn + from threadpoolctl import ThreadpoolController + sklearn._threadpool_controller = ThreadpoolController() from sklearnex import patch_sklearn patch_sklearn(name={str(name)}, verbose={str(verbose)}) - del patch_sklearn + del patch_sklearn, sklearn._threadpool_controller except ImportError: pass""" def get_patch_str_re(): return r"""\ntry: + \# TEMP. FIX: sklearnex.patch_sklearn imports sklearn beforehand + \# when it didn't initialized _threadpool_controller required for + \# pairwise distances dispatching during imports. + \# Manually setting and deleting _threadpool_controller during patch fixes it. 
+ import sklearn + from threadpoolctl import ThreadpoolController + sklearn._threadpool_controller = ThreadpoolController\(\) from sklearnex import patch_sklearn patch_sklearn\(name=.*, verbose=.*\) - del patch_sklearn + del patch_sklearn, sklearn._threadpool_controller except ImportError: pass\n""" diff --git a/sklearnex/linear_model/logistic_regression.py b/sklearnex/linear_model/logistic_regression.py index 4495877f23..2e0bf609bc 100644 --- a/sklearnex/linear_model/logistic_regression.py +++ b/sklearnex/linear_model/logistic_regression.py @@ -82,7 +82,7 @@ def __init__( random_state=None, solver="lbfgs", max_iter=100, - multi_class="auto", + multi_class="deprecated" if sklearn_check_version("1.5") else "auto", verbose=0, warm_start=False, n_jobs=None, From 4a69c5e0d9db5b5688c61d600beef073dd2ebe8b Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Wed, 5 Jun 2024 16:53:06 -0700 Subject: [PATCH 11/75] Revert "Add sklearn 1.5 to CI matrix (#1842)" (#1854) This reverts commit c354eba8d106f95deab09fba12649c0b71f4a677. --- .ci/pipeline/build-and-test-lnx.yml | 2 +- .ci/pipeline/ci.yml | 14 ++++---------- deselected_tests.yaml | 5 ----- requirements-test.txt | 6 +++--- sklearnex/glob/dispatcher.py | 18 ++---------------- sklearnex/linear_model/logistic_regression.py | 2 +- 6 files changed, 11 insertions(+), 36 deletions(-) diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml index 4aa56b4e72..a31c17d4c1 100644 --- a/.ci/pipeline/build-and-test-lnx.yml +++ b/.ci/pipeline/build-and-test-lnx.yml @@ -46,7 +46,7 @@ steps: bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION) pip install --upgrade -r requirements-test.txt pip install $(python .ci/scripts/get_compatible_scipy_version.py) - if [ $(echo $(PYTHON_VERSION) | grep '3.9\|3.10') ] && [ $(SKLEARN_VERSION) != "1.0" ]; then conda install -q -y -c intel dpctl=0.16.0 dpnp=0.14.0; fi + if [ $(echo $(PYTHON_VERSION) | grep '3.9\|3.10') ]; then conda install -q -y -c intel dpctl=0.16.0 dpnp=0.14.0; fi pip list displayName: "Install testing requirements" - script: | diff --git a/.ci/pipeline/ci.yml b/.ci/pipeline/ci.yml index 3a71b3323e..42613ddc59 100644 --- a/.ci/pipeline/ci.yml +++ b/.ci/pipeline/ci.yml @@ -60,8 +60,8 @@ jobs: timeoutInMinutes: 120 strategy: matrix: - Python3.9_Sklearn1.0: - PYTHON_VERSION: '3.9' + Python3.8_Sklearn1.0: + PYTHON_VERSION: '3.8' SKLEARN_VERSION: '1.0' Python3.9_Sklearn1.1: PYTHON_VERSION: '3.9' @@ -75,9 +75,6 @@ jobs: Python3.12_Sklearn1.4: PYTHON_VERSION: '3.12' SKLEARN_VERSION: '1.4' - Python3.12_Sklearn1.5: - PYTHON_VERSION: '3.12' - SKLEARN_VERSION: '1.5' pool: vmImage: 'ubuntu-22.04' steps: @@ -87,8 +84,8 @@ jobs: timeoutInMinutes: 120 strategy: matrix: - Python3.9_Sklearn1.0: - PYTHON_VERSION: '3.9' + Python3.8_Sklearn1.0: + PYTHON_VERSION: '3.8' SKLEARN_VERSION: '1.0' Python3.9_Sklearn1.1: PYTHON_VERSION: '3.9' @@ -102,9 +99,6 @@ jobs: Python3.12_Sklearn1.4: PYTHON_VERSION: '3.12' SKLEARN_VERSION: '1.4' - Python3.12_Sklearn1.5: - PYTHON_VERSION: '3.12' - SKLEARN_VERSION: '1.5' pool: vmImage: 'windows-latest' steps: diff --git a/deselected_tests.yaml b/deselected_tests.yaml index cd153b1f71..0dd53be456 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -29,11 +29,6 @@ deselected_tests: - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-kulsinski] <1.3 - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[kulsinski] <1.3 - # sklearnex PCA always chooses "covariance_eigh" 
solver instead of "full" when solver="auto" - # resulting in solver assignment check failure for sklearn version >= 1.5 - - decomposition/tests/test_pca.py::test_pca_svd_solver_auto[1000-500-400-full] >=1.5 - - decomposition/tests/test_pca.py::test_pca_svd_solver_auto[1000-500-0.5-full] >=1.5 - # test for KMeans FutureWarning is not removed from sklearn tests suit yet - cluster/tests/test_k_means.py::test_change_n_init_future_warning[KMeans-10] ==1.4.dev0 diff --git a/requirements-test.txt b/requirements-test.txt index 247ab2be08..cc70aa05b0 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,11 +1,11 @@ -pytest==7.4.4 ; python_version <= '3.9' -pytest==8.2.2 ; python_version >= '3.10' +pytest==7.4.4 ; python_version == '3.8' +pytest==8.2.2 ; python_version >= '3.9' numpy>=1.19.5 ; python_version <= '3.9' numpy>=1.21.6 ; python_version == '3.10' numpy>=1.23.5 ; python_version == '3.11' numpy>=2.0.0rc2 ; python_version >= '3.12' scikit-learn==1.2.2 ; python_version == '3.8' -scikit-learn==1.5.0 ; python_version >= '3.9' +scikit-learn==1.4.2 ; python_version >= '3.9' pandas==2.0.3 ; python_version == '3.8' pandas==2.1.3 ; python_version >= '3.9' and python_version < '3.11' pandas==2.2.2 ; python_version >= '3.11' diff --git a/sklearnex/glob/dispatcher.py b/sklearnex/glob/dispatcher.py index 47967d63e3..a78586f39f 100755 --- a/sklearnex/glob/dispatcher.py +++ b/sklearnex/glob/dispatcher.py @@ -17,32 +17,18 @@ def get_patch_str(name=None, verbose=True): return f"""try: - # TEMP. FIX: sklearnex.patch_sklearn imports sklearn beforehand - # when it didn't initialized _threadpool_controller required for - # pairwise distances dispatching during imports. - # Manually setting and deleting _threadpool_controller during patch fixes it. - import sklearn - from threadpoolctl import ThreadpoolController - sklearn._threadpool_controller = ThreadpoolController() from sklearnex import patch_sklearn patch_sklearn(name={str(name)}, verbose={str(verbose)}) - del patch_sklearn, sklearn._threadpool_controller + del patch_sklearn except ImportError: pass""" def get_patch_str_re(): return r"""\ntry: - \# TEMP. FIX: sklearnex.patch_sklearn imports sklearn beforehand - \# when it didn't initialized _threadpool_controller required for - \# pairwise distances dispatching during imports. - \# Manually setting and deleting _threadpool_controller during patch fixes it. 
- import sklearn - from threadpoolctl import ThreadpoolController - sklearn._threadpool_controller = ThreadpoolController\(\) from sklearnex import patch_sklearn patch_sklearn\(name=.*, verbose=.*\) - del patch_sklearn, sklearn._threadpool_controller + del patch_sklearn except ImportError: pass\n""" diff --git a/sklearnex/linear_model/logistic_regression.py b/sklearnex/linear_model/logistic_regression.py index 2e0bf609bc..4495877f23 100644 --- a/sklearnex/linear_model/logistic_regression.py +++ b/sklearnex/linear_model/logistic_regression.py @@ -82,7 +82,7 @@ def __init__( random_state=None, solver="lbfgs", max_iter=100, - multi_class="deprecated" if sklearn_check_version("1.5") else "auto", + multi_class="auto", verbose=0, warm_start=False, n_jobs=None, From 7b5eeee27e007ed58c9b7ebea1c3f1175f66bd3b Mon Sep 17 00:00:00 2001 From: Anatoly Volkov <117643568+avolkov-intel@users.noreply.github.com> Date: Thu, 6 Jun 2024 03:47:08 -0700 Subject: [PATCH 12/75] Fix incorrect numpy to table conversion on windows (#1851) * Update SET_NPY_FEATURE * Remove debug outputs, add tests * Update numpy_helpers * Minor test update * Remove changes for dpctl --- onedal/datatypes/data_conversion.cpp | 4 +++- onedal/datatypes/numpy_helpers.hpp | 17 ++++++++++++++--- onedal/datatypes/tests/test_data.py | 18 ++++++++++++++++++ 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/onedal/datatypes/data_conversion.cpp b/onedal/datatypes/data_conversion.cpp index 0d7ceea6a2..7723c64232 100644 --- a/onedal/datatypes/data_conversion.cpp +++ b/onedal/datatypes/data_conversion.cpp @@ -154,7 +154,8 @@ dal::table convert_to_table(PyObject *obj) { PyArrayObject *ary = reinterpret_cast(obj); if (array_is_behaved(ary) || array_is_behaved_F(ary)) { #define MAKE_HOMOGEN_TABLE(CType) res = convert_to_homogen_impl(ary); - SET_NPY_FEATURE(PyArray_DESCR(ary)->type, + SET_NPY_FEATURE(array_type(ary), + array_type_sizeof(ary), MAKE_HOMOGEN_TABLE, throw std::invalid_argument("Found unsupported array type")); #undef MAKE_HOMOGEN_TABLE @@ -207,6 +208,7 @@ dal::table convert_to_table(PyObject *obj) { row_count, \ column_count); SET_NPY_FEATURE(array_type(np_data), + array_type_sizeof(np_data), MAKE_CSR_TABLE, throw std::invalid_argument("Found unsupported data type in csr_matrix")); #undef MAKE_CSR_TABLE diff --git a/onedal/datatypes/numpy_helpers.hpp b/onedal/datatypes/numpy_helpers.hpp index 6f9e5d8f6a..b43801259d 100644 --- a/onedal/datatypes/numpy_helpers.hpp +++ b/onedal/datatypes/numpy_helpers.hpp @@ -65,7 +65,7 @@ default: _EXCEPTION; \ }; -#define SET_NPY_FEATURE(_T, _FUNCT, _EXCEPTION) \ +#define SET_NPY_FEATURE(_T, _S, _FUNCT, _EXCEPTION) \ switch (_T) { \ case NPY_FLOAT: \ case NPY_CFLOAT: \ @@ -91,23 +91,34 @@ _FUNCT(std::uint32_t); \ break; \ } \ - case NPY_LONGLTR: \ case NPY_LONGLONGLTR: \ case NPY_INT64: { \ _FUNCT(std::int64_t); \ break; \ } \ - case NPY_ULONGLTR: \ case NPY_ULONGLONGLTR: \ case NPY_UINT64: { \ _FUNCT(std::uint64_t); \ break; \ } \ + case NPY_LONGLTR: {\ + if (_S == 4) {_FUNCT(std::int32_t);} \ + else if (_S == 8) {_FUNCT(std::int64_t);} \ + else {_EXCEPTION;} \ + break; \ + } \ + case NPY_ULONGLTR: {\ + if (_S == 4) {_FUNCT(std::uint32_t);} \ + else if (_S == 8) {_FUNCT(std::uint64_t);} \ + else {_EXCEPTION;} \ + break; \ + }\ default: _EXCEPTION; \ }; #define is_array(a) ((a) && PyArray_Check(a)) #define array_type(a) PyArray_TYPE((PyArrayObject *)a) +#define array_type_sizeof(a) PyArray_ITEMSIZE((PyArrayObject *)a) #define array_is_behaved(a) (PyArray_ISCARRAY_RO((PyArrayObject *)a) && 
array_type(a) < NPY_OBJECT) #define array_is_behaved_F(a) \ (PyArray_ISFARRAY_RO((PyArrayObject *)a) && array_type(a) < NPY_OBJECT) diff --git a/onedal/datatypes/tests/test_data.py b/onedal/datatypes/tests/test_data.py index dfea7d3c39..97c4b9ccf2 100644 --- a/onedal/datatypes/tests/test_data.py +++ b/onedal/datatypes/tests/test_data.py @@ -19,6 +19,7 @@ from numpy.testing import assert_allclose from onedal import _backend +from onedal.datatypes import from_table, to_table from onedal.primitives import linear_kernel from onedal.tests.utils._device_selection import get_queues @@ -144,6 +145,23 @@ def test_input_format_f_contiguous_pandas(queue, dtype): _test_input_format_f_contiguous_pandas(queue, dtype) +def _test_conversion_to_table(dtype): + np.random.seed() + if dtype in [np.int32, np.int64]: + x = np.random.randint(0, 10, (15, 3), dtype=dtype) + else: + x = np.random.uniform(-2, 2, (18, 6)).astype(dtype) + x_table = to_table(x) + x2 = from_table(x_table) + assert x.dtype == x2.dtype + assert np.array_equal(x, x2) + + +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +def test_conversion_to_table(dtype): + _test_conversion_to_table(dtype) + + # TODO: # Currently `dpctl_to_table` is not used in onedal estimators. # The test will be enabled after future data management update, that brings From c87586df9c439f40dad4d0fe52357fda757907d7 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Thu, 6 Jun 2024 16:02:41 +0200 Subject: [PATCH 13/75] [testing] refactor test_memory_usage.py (#1776) * Update _dataframes_support.py * Update _dataframes_support.py * Update _dataframes_support.py * Update _dataframes_support.py * Update _dataframes_support.py * Update test_linear.py * Update test_neighbors.py * Update test_patching.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * formatting * avoid float to int * small fixes * Update test_linear.py * Update test_patching.py * investigate assert failure * Update _dataframes_support.py * Update _utils.py * Update test_patching.py * Update test_patching.py * Update policy_common.cpp * Update policy_common.cpp * Update policy.cpp * Update policy_common.hpp * make similar to get_traced_memory * introduce gpu memory tracing * isort fixes * interim results * interim results * this is broken but I don't want to fix it now * this is broken but I don't want to fix it now * add functions to estimators * simplify * better naming scheme * _backend necessary * a better setting of environment variables * legibility * fixing mistake * black format * add inspect isclass * fix issues * change function interface * remove dpnp * simplify if statement * wrap_output_data on KFolds.split * black formatting * generalize take * add NuSVC probability=True to ban list * formatting * Update install_dpcpp.sh * Update build-and-test-lnx.yml * separate gpu testing * add reason text * separate cpu and gpu * reformatting * fix speed issues associated with _fit_validation * forgot NearestNeighbors * add _device_offload fixes * change roc_auc_score * return to 755 modified in recent pulled PR * avoid extratrees on GPU temporarily * remove relative import * to config context * formatting * Update test_memory_usage.py * Update test_memory_usage.py * Update test_memory_usage.py * Update test_memory_usage.py * swap orders for stability testing * Update test_memory_usage.py * Update test_memory_usage.py * Update test_memory_usage.py * formatting * Update test_memory_usage.py * explicit check: * Update test_patching.py * 
Update incremental_basic_statistics.py * Update test_incremental_basic_statistics.py * Update incremental_basic_statistics.py * Update test_memory_usage.py * Update _namespace.py * Update _utils.py * Update test_linear.py * Update test_memory_usage.py * Check to see fallback estimators * Update test_memory_usage.py * Update test_memory_usage.py * Update test_memory_usage.py * linting * newtown to newton * Update test_memory_usage.py * Update test_memory_usage.py * formatting * isort * updates based off of review --- daal4py/sklearn/svm/svm.py | 0 onedal/common/policy.cpp | 1 + onedal/common/policy_common.cpp | 7 + onedal/common/policy_common.hpp | 1 + sklearnex/tests/test_memory_usage.py | 310 ++++++++++++++++----------- sklearnex/utils/_namespace.py | 2 +- 6 files changed, 194 insertions(+), 127 deletions(-) mode change 100755 => 100644 daal4py/sklearn/svm/svm.py diff --git a/daal4py/sklearn/svm/svm.py b/daal4py/sklearn/svm/svm.py old mode 100755 new mode 100644 diff --git a/onedal/common/policy.cpp b/onedal/common/policy.cpp index 8be2e7f17a..9bf46e0909 100644 --- a/onedal/common/policy.cpp +++ b/onedal/common/policy.cpp @@ -62,6 +62,7 @@ void instantiate_data_parallel_policy(py::module& m) { policy.def("get_device_name", [](const data_parallel_policy_t& policy) { return get_device_name(policy); }); + m.def("get_used_memory", &get_used_memory, py::return_value_policy::take_ownership); } #endif // ONEDAL_DATA_PARALLEL diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index a6db9376de..36cb87c400 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -121,6 +121,13 @@ std::uint32_t get_device_id(const sycl::queue& queue) { } } +std::size_t get_used_memory(const py::object& syclobj){ + const auto& device = get_queue_from_python(syclobj).get_device(); + std::size_t total_memory = device.get_info(); + std::size_t free_memory = device.get_info(); + return total_memory - free_memory; +} + dp_policy_t make_dp_policy(std::uint32_t id) { sycl::queue queue = get_queue_by_device_id(id); return dp_policy_t{ std::move(queue) }; diff --git a/onedal/common/policy_common.hpp b/onedal/common/policy_common.hpp index df5576900f..90a832fa40 100644 --- a/onedal/common/policy_common.hpp +++ b/onedal/common/policy_common.hpp @@ -51,6 +51,7 @@ inline dp_policy_t make_dp_policy(const dp_policy_t& policy) { } std::uint32_t get_device_id(const dp_policy_t& policy); +std::size_t get_used_memory(const py::object& syclobj); std::string get_device_name(const dp_policy_t& policy); #endif // ONEDAL_DATA_PARALLEL diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 503a46abe3..b072fd7814 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -14,127 +14,113 @@ # limitations under the License. 
# ============================================================================== - import gc import logging +import os import tracemalloc import types +import warnings +from inspect import isclass import numpy as np import pandas as pd import pytest from scipy.stats import pearsonr -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, clone from sklearn.datasets import make_classification from sklearn.model_selection import KFold -from sklearnex import get_patch_map -from sklearnex.metrics import pairwise_distances, roc_auc_score -from sklearnex.model_selection import train_test_split -from sklearnex.utils import _assert_all_finite - - -class TrainTestSplitEstimator: - def __init__(self): - pass - - def fit(self, x, y): - train_test_split(x, y) - - -class FiniteCheckEstimator: - def __init__(self): - pass - - def fit(self, x, y): - _assert_all_finite(x) - _assert_all_finite(y) - - -class PairwiseDistancesEstimator: - def fit(self, x, y): - pairwise_distances(x, metric=self.metric) - - -class CosineDistancesEstimator(PairwiseDistancesEstimator): - def __init__(self): - self.metric = "cosine" - - -class CorrelationDistancesEstimator(PairwiseDistancesEstimator): - def __init__(self): - self.metric = "correlation" - - -class RocAucEstimator: - def __init__(self): - pass - - def fit(self, x, y): - print(roc_auc_score(y, np.zeros(shape=y.shape, dtype=np.int32))) - - -# add all daal4py estimators enabled in patching (except banned) - - -def get_patched_estimators(ban_list, output_list): - patched_estimators = get_patch_map().values() - for listing in patched_estimators: - estimator, name = listing[0][0][2], listing[0][0][1] - if not isinstance(estimator, types.FunctionType): - if name not in ban_list: - if issubclass(estimator, BaseEstimator): - if hasattr(estimator, "fit"): - output_list.append(estimator) - +from onedal import _is_dpc_backend +from onedal.tests.utils._dataframes_support import ( + _convert_to_dataframe, + get_dataframes_and_queues, +) +from onedal.tests.utils._device_selection import get_queues, is_dpctl_available +from sklearnex import config_context +from sklearnex.tests._utils import PATCHED_FUNCTIONS, PATCHED_MODELS, SPECIAL_INSTANCES +from sklearnex.utils import get_namespace -def remove_duplicated_estimators(estimators_list): - estimators_map = {} - for estimator in estimators_list: - full_name = f"{estimator.__module__}.{estimator.__name__}" - estimators_map[full_name] = estimator - return estimators_map.values() +if _is_dpc_backend: + from onedal import _backend -BANNED_ESTIMATORS = ( +CPU_SKIP_LIST = ( + "TSNE", # too slow for using in testing on common data size + "config_context", # does not malloc + "get_config", # does not malloc + "set_config", # does not malloc + "SVC(probability=True)", # memory leak fortran numpy (investigate _fit_proba) + "NuSVC(probability=True)", # memory leak fortran numpy (investigate _fit_proba) "IncrementalEmpiricalCovariance", # dataframe_f issues "IncrementalLinearRegression", # TODO fix memory leak issue in private CI for data_shape = (1000, 100), data_transform_function = dataframe_f "IncrementalPCA", # TODO fix memory leak issue in private CI for data_shape = (1000, 100), data_transform_function = dataframe_f - "TSNE", # too slow for using in testing on common data size + "LogisticRegression(solver='newton-cg')", # memory leak fortran (1000, 100) ) -estimators = [ - TrainTestSplitEstimator, - FiniteCheckEstimator, - CosineDistancesEstimator, - CorrelationDistancesEstimator, - RocAucEstimator, -] 
-get_patched_estimators(BANNED_ESTIMATORS, estimators) -estimators = remove_duplicated_estimators(estimators) +GPU_SKIP_LIST = ( + "TSNE", # too slow for using in testing on common data size + "RandomForestRegressor", # too slow for using in testing on common data size + "KMeans", # does not support GPU offloading + "config_context", # does not malloc + "get_config", # does not malloc + "set_config", # does not malloc + "Ridge", # does not support GPU offloading (fails silently) + "ElasticNet", # does not support GPU offloading (fails silently) + "Lasso", # does not support GPU offloading (fails silently) + "SVR", # does not support GPU offloading (fails silently) + "NuSVR", # does not support GPU offloading (fails silently) + "NuSVC", # does not support GPU offloading (fails silently) + "LogisticRegression", # default parameters not supported, see solver=newton-cg + "NuSVC(probability=True)", # does not support GPU offloading (fails silently) + "IncrementalLinearRegression", # issue with potrf with the specific dataset + "LinearRegression", # issue with potrf with the specific dataset +) -def ndarray_c(x, y): - return np.ascontiguousarray(x), y +def gen_functions(functions): + func_dict = functions.copy() -def ndarray_f(x, y): - return np.asfortranarray(x), y + roc_auc_score = func_dict.pop("roc_auc_score") + func_dict["roc_auc_score"] = lambda x, y: roc_auc_score(y, y) + pairwise_distances = func_dict.pop("pairwise_distances") + func_dict["pairwise_distances(metric='cosine')"] = lambda x, y: pairwise_distances( + x, metric="cosine" + ) + func_dict["pairwise_distances(metric='correlation')"] = ( + lambda x, y: pairwise_distances(x, metric="correlation") + ) -def dataframe_c(x, y): - return pd.DataFrame(np.ascontiguousarray(x)), pd.Series(y) + _assert_all_finite = func_dict.pop("_assert_all_finite") + func_dict["_assert_all_finite"] = lambda x, y: [ + _assert_all_finite(x), + _assert_all_finite(y), + ] + return func_dict -def dataframe_f(x, y): - return pd.DataFrame(np.asfortranarray(x)), pd.Series(y) +FUNCTIONS = gen_functions(PATCHED_FUNCTIONS) +CPU_ESTIMATORS = { + k: v + for k, v in {**PATCHED_MODELS, **SPECIAL_INSTANCES, **FUNCTIONS}.items() + if not k in CPU_SKIP_LIST +} -data_transforms = [ndarray_c, ndarray_f, dataframe_c, dataframe_f] +GPU_ESTIMATORS = { + k: v + for k, v in {**PATCHED_MODELS, **SPECIAL_INSTANCES}.items() + if not k in GPU_SKIP_LIST +} -data_shapes = [(1000, 100), (2000, 50)] +data_shapes = [ + pytest.param((1000, 100), id="(1000, 100)"), + pytest.param((2000, 50), id="(2000, 50)"), +] EXTRA_MEMORY_THRESHOLD = 0.15 N_SPLITS = 10 +ORDER_DICT = {"F": np.asfortranarray, "C": np.ascontiguousarray} def gen_clsf_data(n_samples, n_features): @@ -148,45 +134,82 @@ def gen_clsf_data(n_samples, n_features): ) -def split_train_inference(kf, x, y, estimator): +def get_traced_memory(queue=None): + if _is_dpc_backend and queue and queue.sycl_device.is_gpu: + return _backend.get_used_memory(queue) + else: + return tracemalloc.get_traced_memory()[0] + + +def take(x, index, axis=0, queue=None): + xp, array_api = get_namespace(x) + if array_api: + return xp.take(x, xp.asarray(index, device=queue), axis=axis) + else: + return x.take(index, axis=axis) + + +def split_train_inference(kf, x, y, estimator, queue=None): mem_tracks = [] for train_index, test_index in kf.split(x): - if isinstance(x, np.ndarray): - x_train, x_test = x[train_index], x[test_index] - y_train, y_test = y[train_index], y[test_index] - elif isinstance(x, pd.core.frame.DataFrame): - x_train, x_test = 
x.iloc[train_index], x.iloc[test_index] - y_train, y_test = y.iloc[train_index], y.iloc[test_index] - # TODO: add parameters for all estimators to prevent - # fallback to stock scikit-learn with default parameters - - alg = estimator() - alg.fit(x_train, y_train) - if hasattr(alg, "predict"): - alg.predict(x_test) - elif hasattr(alg, "transform"): - alg.transform(x_test) - elif hasattr(alg, "kneighbors"): - alg.kneighbors(x_test) - del alg, x_train, x_test, y_train, y_test - mem_tracks.append(tracemalloc.get_traced_memory()[0]) + x_train = take(x, train_index, queue=queue) + y_train = take(y, train_index, queue=queue) + x_test = take(x, test_index, queue=queue) + y_test = take(y, test_index, queue=queue) + + if isclass(estimator) and issubclass(estimator, BaseEstimator): + alg = estimator() + flag = True + elif isinstance(estimator, BaseEstimator): + alg = clone(estimator) + flag = True + else: + flag = False + + if flag: + alg.fit(x_train, y_train) + if hasattr(alg, "predict"): + alg.predict(x_test) + elif hasattr(alg, "transform"): + alg.transform(x_test) + elif hasattr(alg, "kneighbors"): + alg.kneighbors(x_test) + del alg + else: + estimator(x_train, y_train) + + del x_train, x_test, y_train, y_test, flag + mem_tracks.append(get_traced_memory(queue)) return mem_tracks -def _kfold_function_template(estimator, data_transform_function, data_shape): +def _kfold_function_template(estimator, dataframe, data_shape, queue=None, func=None): tracemalloc.start() n_samples, n_features = data_shape - x, y, data_memory_size = gen_clsf_data(n_samples, n_features) + X, y, data_memory_size = gen_clsf_data(n_samples, n_features) kf = KFold(n_splits=N_SPLITS) - x, y = data_transform_function(x, y) + if func: + X = func(X) + + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) - mem_before, _ = tracemalloc.get_traced_memory() - mem_tracks = split_train_inference(kf, x, y, estimator) + mem_before = get_traced_memory(queue) + mem_tracks = split_train_inference(kf, X, y, estimator, queue=queue) mem_iter_diffs = np.array(mem_tracks[1:]) - np.array(mem_tracks[:-1]) mem_incr_mean, mem_incr_std = mem_iter_diffs.mean(), mem_iter_diffs.std() mem_incr_mean, mem_incr_std = round(mem_incr_mean), round(mem_incr_std) - mem_iter_corr, _ = pearsonr(mem_tracks, list(range(len(mem_tracks)))) + with warnings.catch_warnings(): + # In the case that the memory usage is constant, this will raise + # a ConstantInputWarning error in pearsonr from scipy, this can + # be ignored. 
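        # The Pearson correlation computed below is the leak heuristic itself: a
        # coefficient close to 1.0 means the per-fold memory snapshots trend steadily
        # upward with the iteration index, which triggers the warning further down
        # rather than a hard failure.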
+ warnings.filterwarnings( + "ignore", + message="An input array is constant; the correlation coefficient is not defined", + ) + mem_iter_corr, _ = pearsonr(mem_tracks, list(range(len(mem_tracks)))) + if mem_iter_corr > 0.95: logging.warning( "Memory usage is steadily increasing with iterations " @@ -195,12 +218,17 @@ def _kfold_function_template(estimator, data_transform_function, data_shape): "Memory usage increase per iteration: " f"{mem_incr_mean}±{mem_incr_std} bytes" ) - mem_before_gc, _ = tracemalloc.get_traced_memory() + mem_before_gc = get_traced_memory(queue) mem_diff = mem_before_gc - mem_before + if isinstance(estimator, BaseEstimator): + name = str(estimator) + else: + name = estimator.__name__ + message = ( "Size of extra allocated memory {} using garbage collector " f"is greater than {EXTRA_MEMORY_THRESHOLD * 100}% of input data" - f"\n\tAlgorithm: {estimator.__name__}" + f"\n\tAlgorithm: {name}" f"\n\tInput data size: {data_memory_size} bytes" "\n\tExtra allocated memory size: {} bytes" " / {} %" @@ -212,21 +240,51 @@ def _kfold_function_template(estimator, data_transform_function, data_shape): ) ) gc.collect() - mem_after, _ = tracemalloc.get_traced_memory() + mem_after = get_traced_memory(queue) tracemalloc.stop() mem_diff = mem_after - mem_before - assert mem_diff < EXTRA_MEMORY_THRESHOLD * data_memory_size, message.format( - "after", mem_diff, round((mem_diff) / data_memory_size * 100, 2) - ) + # GPU offloading with SYCL contains a program/kernel cache which should + # be controllable via a KernelProgramCache object in the SYCL context. + # The programs and kernels are stored on the GPU, but cannot be cleared + # as this class is not available for access in all oneDAL DPC++ runtimes. + # Therefore, until this is implemented this test must be skipped for gpu + # as it looks like a memory leak (at least there is no way to discern a + # leak on the first run). 
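As an aside, the leak heuristic in _kfold_function_template above reduces to asking whether the per-iteration memory readings grow steadily with the iteration index. A minimal self-contained sketch of that check (illustrative only; the readings and variable names here are made up, not taken from the patch):

    from scipy.stats import pearsonr

    mem_tracks = [10_000, 10_240, 10_480, 10_720, 10_960]  # bytes observed after each CV split
    corr, _ = pearsonr(mem_tracks, list(range(len(mem_tracks))))
    if corr > 0.95:
        print("memory usage grows steadily with iterations - possible leak")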
+ if queue is None or queue.sycl_device.is_cpu: + assert mem_diff < EXTRA_MEMORY_THRESHOLD * data_memory_size, message.format( + "after", mem_diff, round((mem_diff) / data_memory_size * 100, 2) + ) -# disable fallback check as logging impacts memory use +@pytest.mark.parametrize("order", ["F", "C"]) +@pytest.mark.parametrize( + "dataframe,queue", get_dataframes_and_queues("numpy,pandas,dpctl", "cpu") +) +@pytest.mark.parametrize("estimator", CPU_ESTIMATORS.keys()) +@pytest.mark.parametrize("data_shape", data_shapes) +def test_memory_leaks(estimator, dataframe, queue, order, data_shape): + func = ORDER_DICT[order] + if estimator == "_assert_all_finite" and queue is not None: + pytest.skip(f"{estimator} is not designed for device offloading") + + _kfold_function_template( + CPU_ESTIMATORS[estimator], dataframe, data_shape, queue, func + ) -@pytest.mark.allow_sklearn_fallback -@pytest.mark.parametrize("data_transform_function", data_transforms) -@pytest.mark.parametrize("estimator", estimators) +@pytest.mark.skipif( + os.getenv("ZES_ENABLE_SYSMAN") is None or not is_dpctl_available("gpu"), + reason="SYCL device memory leak check requires the level zero sysman", +) +@pytest.mark.parametrize("queue", get_queues("gpu")) +@pytest.mark.parametrize("estimator", GPU_ESTIMATORS.keys()) +@pytest.mark.parametrize("order", ["F", "C"]) @pytest.mark.parametrize("data_shape", data_shapes) -def test_memory_leaks(estimator, data_transform_function, data_shape): - _kfold_function_template(estimator, data_transform_function, data_shape) +def test_gpu_memory_leaks(estimator, queue, order, data_shape): + func = ORDER_DICT[order] + if "ExtraTrees" in estimator and data_shape == (2000, 50): + pytest.skip("Avoid a segmentation fault in Extra Trees algorithms") + + with config_context(target_offload=queue): + _kfold_function_template(GPU_ESTIMATORS[estimator], None, data_shape, queue, func) diff --git a/sklearnex/utils/_namespace.py b/sklearnex/utils/_namespace.py index 9eeaa2c5cb..2f67737023 100644 --- a/sklearnex/utils/_namespace.py +++ b/sklearnex/utils/_namespace.py @@ -94,4 +94,4 @@ def get_namespace(*arrays): elif sklearn_check_version("1.2"): return sklearn_get_namespace(*arrays) else: - return np, True + return np, False From bd8018e829bc39e04a8ff422ef3ad34865eec637 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Fri, 7 Jun 2024 08:54:47 -0700 Subject: [PATCH 14/75] Update dependency tornado to v6.4.1 [SECURITY] (#1857) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- requirements-doc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-doc.txt b/requirements-doc.txt index 07bd79512e..d818c1f1ff 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -65,7 +65,7 @@ sphinxcontrib-jsmath==1.0.1 sphinxcontrib-qthelp==1.0.3 sphinxcontrib-serializinghtml==1.1.5 testpath==0.6.0 -tornado==6.4 +tornado==6.4.1 traitlets==5.14.1 typing-extensions==4.9.0 urllib3==2.2.0 From 4822bd74f9624e189e836e3c12d84fd13d228a5a Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 10 Jun 2024 11:19:35 +0200 Subject: [PATCH 15/75] [bug] fix issues with dpnp/dpctl regressor score method (#1855) * Update _ridge.py * Update _coordinate_descent.py * Update _forest.py * Update incremental_linear.py * Update linear.py * Update _forest.py * Update _common.py * Update _common.py * Update svr.py * Update nusvr.py * Update knn_regression.py * formatting * isort * fix score naming issue * Update linear.py * Update 
linear.py * Update linear.py * Update run_test.sh * Update run_test.bat * Update run_test.bat * Update run_test.sh --- .../linear_model/_coordinate_descent.py | 6 ++ daal4py/sklearn/linear_model/_ridge.py | 3 + sklearnex/ensemble/_forest.py | 26 +++++++- sklearnex/linear_model/incremental_linear.py | 66 +++++++++++++++++++ sklearnex/linear_model/linear.py | 32 +++++++-- sklearnex/neighbors/knn_regression.py | 24 +++++++ sklearnex/svm/_common.py | 8 ++- sklearnex/svm/nusvr.py | 17 +++++ sklearnex/svm/svr.py | 17 +++++ 9 files changed, 188 insertions(+), 11 deletions(-) diff --git a/daal4py/sklearn/linear_model/_coordinate_descent.py b/daal4py/sklearn/linear_model/_coordinate_descent.py index 5874b15733..a35baade57 100755 --- a/daal4py/sklearn/linear_model/_coordinate_descent.py +++ b/daal4py/sklearn/linear_model/_coordinate_descent.py @@ -734,8 +734,11 @@ def dual_gap_(self, value): def dual_gap_(self): self._gap = None + score = support_usm_ndarray()(ElasticNet_original.score) + fit.__doc__ = ElasticNet_original.fit.__doc__ predict.__doc__ = ElasticNet_original.predict.__doc__ + score.__doc__ = ElasticNet_original.score.__doc__ @control_n_jobs(decorated_methods=["fit", "predict"]) @@ -848,5 +851,8 @@ def dual_gap_(self, value): def dual_gap_(self): self._gap = None + score = support_usm_ndarray()(Lasso_original.score) + fit.__doc__ = Lasso_original.fit.__doc__ predict.__doc__ = Lasso_original.predict.__doc__ + score.__doc__ = Lasso_original.score.__doc__ diff --git a/daal4py/sklearn/linear_model/_ridge.py b/daal4py/sklearn/linear_model/_ridge.py index e3a8a82287..7a49938013 100644 --- a/daal4py/sklearn/linear_model/_ridge.py +++ b/daal4py/sklearn/linear_model/_ridge.py @@ -306,5 +306,8 @@ def fit(self, X, y, sample_weight=None): def predict(self, X): return _predict_ridge(self, X) + score = support_usm_ndarray()(Ridge_original.score) + fit.__doc__ = Ridge_original.fit.__doc__ predict.__doc__ = Ridge_original.predict.__doc__ + score.__doc__ = Ridge_original.score.__doc__ diff --git a/sklearnex/ensemble/_forest.py b/sklearnex/ensemble/_forest.py index 36265d8a86..447a233dcd 100644 --- a/sklearnex/ensemble/_forest.py +++ b/sklearnex/ensemble/_forest.py @@ -29,7 +29,7 @@ from sklearn.ensemble._forest import ForestRegressor as sklearn_ForestRegressor from sklearn.ensemble._forest import _get_n_samples_bootstrap from sklearn.exceptions import DataConversionWarning -from sklearn.metrics import accuracy_score +from sklearn.metrics import accuracy_score, r2_score from sklearn.tree import ( DecisionTreeClassifier, DecisionTreeRegressor, @@ -1037,7 +1037,7 @@ def _onedal_cpu_supported(self, method_name, *data): ] ) - elif method_name == "predict": + elif method_name in ["predict", "score"]: X = data[0] patching_status.and_conditions( @@ -1091,7 +1091,7 @@ def _onedal_gpu_supported(self, method_name, *data): ] ) - elif method_name == "predict": + elif method_name in ["predict", "score"]: X = data[0] patching_status.and_conditions( @@ -1134,6 +1134,11 @@ def _onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue=queue) + def _onedal_score(self, X, y, sample_weight=None, queue=None): + return r2_score( + y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight + ) + def fit(self, X, y, sample_weight=None): dispatch( self, @@ -1160,8 +1165,23 @@ def predict(self, X): X, ) + @wrap_output_data + def score(self, X, y, sample_weight=None): + return dispatch( + self, + "score", + { + "onedal": self.__class__._onedal_score, + "sklearn": sklearn_ForestRegressor.score, + 
}, + X, + y, + sample_weight=sample_weight, + ) + fit.__doc__ = sklearn_ForestRegressor.fit.__doc__ predict.__doc__ = sklearn_ForestRegressor.predict.__doc__ + score.__doc__ = sklearn_ForestRegressor.score.__doc__ @control_n_jobs(decorated_methods=["fit", "predict", "predict_proba", "score"]) diff --git a/sklearnex/linear_model/incremental_linear.py b/sklearnex/linear_model/incremental_linear.py index 2f9468f8a5..2a56c71072 100644 --- a/sklearnex/linear_model/incremental_linear.py +++ b/sklearnex/linear_model/incremental_linear.py @@ -20,6 +20,7 @@ import numpy as np from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin from sklearn.exceptions import NotFittedError +from sklearn.metrics import r2_score from sklearn.utils import check_array, gen_batches from daal4py.sklearn._n_jobs_support import control_n_jobs @@ -147,6 +148,11 @@ def _onedal_predict(self, X, queue=None): self._onedal_finalize_fit() return self._onedal_estimator.predict(X, queue) + def _onedal_score(self, X, y, sample_weight=None, queue=None): + return r2_score( + y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight + ) + def _onedal_partial_fit(self, X, y, queue=None): first_pass = not hasattr(self, "n_samples_seen_") or self.n_samples_seen_ == 0 @@ -385,3 +391,63 @@ def predict(self, X, y=None): }, X, ) + + @wrap_output_data + def score(self, X, y, sample_weight=None): + """Return the coefficient of determination of the prediction. + + The coefficient of determination :math:`R^2` is defined as + :math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual + sum of squares ``((y_true - y_pred)** 2).sum()`` and :math:`v` + is the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``. + The best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). A constant model that always predicts + the expected value of `y`, disregarding the input features, would get + a :math:`R^2` score of 0.0. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. For some estimators this may be a precomputed + kernel matrix or a list of generic objects instead with shape + ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted`` + is the number of samples used in the fitting for the estimator. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True values for `X`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + :math:`R^2` of ``self.predict(X)`` w.r.t. `y`. + + Notes + ----- + The :math:`R^2` score used when calling ``score`` on a regressor uses + ``multioutput='uniform_average'`` from version 0.23 to keep consistent + with default value of :func:`~sklearn.metrics.r2_score`. + This influences the ``score`` method of all the multioutput + regressors (except for + :class:`~sklearn.multioutput.MultiOutputRegressor`). + """ + if not hasattr(self, "coef_"): + msg = ( + "This %(name)s instance is not fitted yet. Call 'fit' or 'partial_fit' " + "with appropriate arguments before using this estimator." 
+ ) + raise NotFittedError(msg % {"name": self.__class__.__name__}) + + return dispatch( + self, + "score", + { + "onedal": self.__class__._onedal_score, + "sklearn": None, + }, + X, + y, + sample_weight=sample_weight, + ) diff --git a/sklearnex/linear_model/linear.py b/sklearnex/linear_model/linear.py index 76dc02d2f1..83f93b40c9 100644 --- a/sklearnex/linear_model/linear.py +++ b/sklearnex/linear_model/linear.py @@ -20,6 +20,7 @@ import numpy as np from sklearn.exceptions import NotFittedError from sklearn.linear_model import LinearRegression as sklearn_LinearRegression +from sklearn.metrics import r2_score from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version @@ -123,6 +124,20 @@ def predict(self, X): X, ) + @wrap_output_data + def score(self, X, y, sample_weight=None): + return dispatch( + self, + "score", + { + "onedal": self.__class__._onedal_score, + "sklearn": sklearn_LinearRegression.score, + }, + X, + y, + sample_weight=sample_weight, + ) + def _test_type_and_finiteness(self, X_in): X = X_in if isinstance(X_in, np.ndarray) else np.asarray(X_in) @@ -193,22 +208,19 @@ def _onedal_fit_supported(self, method_name, *data): return patching_status def _onedal_predict_supported(self, method_name, *data): - assert method_name == "predict" - assert len(data) == 1 - class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( f"sklearn.linear_model.{class_name}.predict" ) - n_samples = _num_samples(*data) + n_samples = _num_samples(data[0]) model_is_sparse = issparse(self.coef_) or ( self.fit_intercept and issparse(self.intercept_) ) dal_ready = patching_status.and_conditions( [ (n_samples > 0, "Number of samples is less than 1."), - (not issparse(*data), "Sparse input is not supported."), + (not issparse(data[0]), "Sparse input is not supported."), (not model_is_sparse, "Sparse coefficients are not supported."), ] ) @@ -216,7 +228,7 @@ def _onedal_predict_supported(self, method_name, *data): return patching_status patching_status.and_condition( - self._test_type_and_finiteness(*data), "Input X is not supported." + self._test_type_and_finiteness(data[0]), "Input X is not supported." ) return patching_status @@ -224,7 +236,7 @@ def _onedal_predict_supported(self, method_name, *data): def _onedal_supported(self, method_name, *data): if method_name == "fit": return self._onedal_fit_supported(method_name, *data) - if method_name == "predict": + if method_name in ["predict", "score"]: return self._onedal_predict_supported(method_name, *data) raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") @@ -286,6 +298,11 @@ def _onedal_predict(self, X, queue=None): res = self._onedal_estimator.predict(X, queue=queue) return res + def _onedal_score(self, X, y, sample_weight=None, queue=None): + return r2_score( + y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight + ) + def get_coef_(self): return self.coef_ @@ -314,3 +331,4 @@ def _save_attributes(self): fit.__doc__ = sklearn_LinearRegression.fit.__doc__ predict.__doc__ = sklearn_LinearRegression.predict.__doc__ + score.__doc__ = sklearn_LinearRegression.score.__doc__ diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index bd271f2b59..798883b4e0 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -14,6 +14,7 @@ # limitations under the License. 
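The same score pattern recurs across the regressor diffs in this patch (_forest.py, incremental_linear.py, linear.py above, and the neighbors/SVM regressors below): the offloaded score path simply computes R^2 from the offloaded predictions, matching what sklearn's RegressorMixin.score does on stock estimators. A minimal sketch of the idea, with a hypothetical helper name:

    from sklearn.metrics import r2_score

    def onedal_style_score(estimator, X, y, sample_weight=None):
        # same metric as RegressorMixin.score, but predict() is free to be
        # routed through an accelerated backend before scoring
        return r2_score(y, estimator.predict(X), sample_weight=sample_weight)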
# ============================================================================== +from sklearn.metrics import r2_score from sklearn.neighbors._regression import ( KNeighborsRegressor as sklearn_KNeighborsRegressor, ) @@ -117,6 +118,23 @@ def predict(self, X): X, ) + @wrap_output_data + def score(self, X, y, sample_weight=None): + check_is_fitted(self) + if sklearn_check_version("1.0"): + self._check_feature_names(X, reset=False) + return dispatch( + self, + "score", + { + "onedal": self.__class__._onedal_score, + "sklearn": sklearn_KNeighborsRegressor.score, + }, + X, + y, + sample_weight=sample_weight, + ) + @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) @@ -184,6 +202,11 @@ def _onedal_kneighbors( X, n_neighbors, return_distance, queue=queue ) + def _onedal_score(self, X, y, sample_weight=None, queue=None): + return r2_score( + y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight + ) + def _save_attributes(self): self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ @@ -196,3 +219,4 @@ def _save_attributes(self): predict.__doc__ = sklearn_KNeighborsRegressor.predict.__doc__ kneighbors.__doc__ = sklearn_KNeighborsRegressor.kneighbors.__doc__ radius_neighbors.__doc__ = sklearn_NearestNeighbors.radius_neighbors.__doc__ + score.__doc__ = sklearn_KNeighborsRegressor.score.__doc__ diff --git a/sklearnex/svm/_common.py b/sklearnex/svm/_common.py index 719464e02f..e0e7d8c939 100644 --- a/sklearnex/svm/_common.py +++ b/sklearnex/svm/_common.py @@ -21,6 +21,7 @@ from scipy import sparse as sp from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.calibration import CalibratedClassifierCV +from sklearn.metrics import r2_score from sklearn.model_selection import StratifiedKFold from sklearn.preprocessing import LabelEncoder @@ -79,7 +80,7 @@ def _onedal_cpu_supported(self, method_name, *data): ) return patching_status inference_methods = ( - ["predict"] + ["predict", "score"] if class_name.endswith("R") else ["predict", "predict_proba", "decision_function", "score"] ) @@ -322,3 +323,8 @@ def _save_attributes(self): if sklearn_check_version("1.1"): self.n_iter_ = self._onedal_estimator.n_iter_ + + def _onedal_score(self, X, y, sample_weight=None, queue=None): + return r2_score( + y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight + ) diff --git a/sklearnex/svm/nusvr.py b/sklearnex/svm/nusvr.py index 36bf1fb206..5c2c1a1dee 100644 --- a/sklearnex/svm/nusvr.py +++ b/sklearnex/svm/nusvr.py @@ -94,6 +94,22 @@ def predict(self, X): X, ) + @wrap_output_data + def score(self, X, y, sample_weight=None): + if sklearn_check_version("1.0"): + self._check_feature_names(X, reset=False) + return dispatch( + self, + "score", + { + "onedal": self.__class__._onedal_score, + "sklearn": sklearn_NuSVR.score, + }, + X, + y, + sample_weight=sample_weight, + ) + def _onedal_fit(self, X, y, sample_weight=None, queue=None): X, _, sample_weight = self._onedal_fit_checks(X, y, sample_weight) onedal_params = { @@ -118,3 +134,4 @@ def _onedal_predict(self, X, queue=None): fit.__doc__ = sklearn_NuSVR.fit.__doc__ predict.__doc__ = sklearn_NuSVR.predict.__doc__ + score.__doc__ = sklearn_NuSVR.score.__doc__ diff --git a/sklearnex/svm/svr.py b/sklearnex/svm/svr.py index 830d7304ca..ed6c5baa23 100644 --- a/sklearnex/svm/svr.py +++ b/sklearnex/svm/svr.py @@ -95,6 +95,22 @@ def predict(self, X): X, ) + @wrap_output_data + def score(self, X, y, sample_weight=None): 
+ if sklearn_check_version("1.0"): + self._check_feature_names(X, reset=False) + return dispatch( + self, + "score", + { + "onedal": self.__class__._onedal_score, + "sklearn": sklearn_SVR.score, + }, + X, + y, + sample_weight=sample_weight, + ) + def _onedal_fit(self, X, y, sample_weight=None, queue=None): X, _, sample_weight = self._onedal_fit_checks(X, y, sample_weight) onedal_params = { @@ -119,3 +135,4 @@ def _onedal_predict(self, X, queue=None): fit.__doc__ = sklearn_SVR.fit.__doc__ predict.__doc__ = sklearn_SVR.predict.__doc__ + score.__doc__ = sklearn_SVR.score.__doc__ From 40b73c1c5617d8386f418d16348b0793721f8247 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 10 Jun 2024 13:00:07 +0200 Subject: [PATCH 16/75] [enhancement] refactor test_run_to_run_stability_tests (#1827) * Update test_run_to_run_stability_tests.py * partially compelte changeover * formatting * change in test patching * forgotten import * add simplification * refactor undersay * interim * sparse * functional * formatting * Update test_run_to_run_stability.py * formatting * remove sparse * reformatting * fix issues when method=None * fixes * set flag * fix issues with logisticRegression debugging * remove TO_SKIP and rerun * Update _utils.py * Update test_n_jobs_support.py * Update test_run_to_run_stability.py * Update test_run_to_run_stability.py * Update logistic_path.py * Update logistic_regression.py * Update test_n_jobs_support.py * Update logistic_regression.py * Update logistic_regression.py * Update test_run_to_run_stability.py * Update logistic_path.py * Update logistic_regression.py * formatting * Update logistic_regression.py * Update logistic_path.py * Update logistic_path.py * Update logistic_path.py * fix for deprecated * missing comma --- daal4py/sklearn/linear_model/logistic_path.py | 31 +- sklearnex/linear_model/logistic_regression.py | 4 +- sklearnex/tests/_utils.py | 66 ++- sklearnex/tests/test_patching.py | 17 +- sklearnex/tests/test_run_to_run_stability.py | 283 ++++++++++++ .../tests/test_run_to_run_stability_tests.py | 428 ------------------ 6 files changed, 355 insertions(+), 474 deletions(-) create mode 100755 sklearnex/tests/test_run_to_run_stability.py delete mode 100755 sklearnex/tests/test_run_to_run_stability_tests.py diff --git a/daal4py/sklearn/linear_model/logistic_path.py b/daal4py/sklearn/linear_model/logistic_path.py index 20578428c9..8ab97ea9d2 100755 --- a/daal4py/sklearn/linear_model/logistic_path.py +++ b/daal4py/sklearn/linear_model/logistic_path.py @@ -808,21 +808,22 @@ def daal4py_predict(self, X, resultsToEvaluate): _patching_status = PatchingConditionsChain( f"sklearn.linear_model.LogisticRegression.{_function_name}" ) - _patching_status.and_conditions( - [ - ( - self.multi_class in ["multinomial", "warn"], - f"{self.multi_class} multiclass option is not supported. " - "Only 'multinomial' or 'warn' options are supported.", - ), - (self.classes_.size == 2, "Number of classes != 2."), - ( - resultsToEvaluate == "computeClassLabels", - "resultsToEvaluate != 'computeClassLabels'.", - ), - ], - conditions_merging=any, - ) + if _function_name != "predict": + _patching_status.and_conditions( + [ + ( + self.classes_.size == 2 + or logistic_module._check_multi_class( + self.multi_class if self.multi_class != "deprecated" else "auto", + self.solver, + self.classes_.size, + ) + != "ovr", + f"selected multiclass option is not supported for n_classes > 2.", + ), + ], + ) + _dal_ready = _patching_status.and_conditions( [ (not sparse.issparse(X), "X is sparse. 
Sparse input is not supported."), diff --git a/sklearnex/linear_model/logistic_regression.py b/sklearnex/linear_model/logistic_regression.py index 4495877f23..9dbc95feb0 100644 --- a/sklearnex/linear_model/logistic_regression.py +++ b/sklearnex/linear_model/logistic_regression.py @@ -146,7 +146,7 @@ def predict_proba(self, X): self._check_feature_names(X, reset=False) return dispatch( self, - "predict", + "predict_proba", { "onedal": self.__class__._onedal_predict_proba, "sklearn": sklearn_LogisticRegression.predict_proba, @@ -160,7 +160,7 @@ def predict_log_proba(self, X): self._check_feature_names(X, reset=False) return dispatch( self, - "predict", + "predict_log_proba", { "onedal": self.__class__._onedal_predict_log_proba, "sklearn": sklearn_LogisticRegression.predict_log_proba, diff --git a/sklearnex/tests/_utils.py b/sklearnex/tests/_utils.py index 1f96080acf..3301f37674 100755 --- a/sklearnex/tests/_utils.py +++ b/sklearnex/tests/_utils.py @@ -14,9 +14,11 @@ # limitations under the License. # ============================================================================== +from functools import partial from inspect import isclass import numpy as np +from scipy import sparse as sp from sklearn import clone from sklearn.base import ( BaseEstimator, @@ -116,8 +118,8 @@ def gen_models_info(algorithms): if i in PATCHED_MODELS: est = PATCHED_MODELS[i] - elif i in SPECIAL_INSTANCES: - est = SPECIAL_INSTANCES[i].__class__ + elif isinstance(algorithms[i], BaseEstimator): + est = algorithms[i].__class__ else: raise KeyError(f"Unrecognized sklearnex estimator: {i}") @@ -138,24 +140,54 @@ def gen_models_info(algorithms): return output -def gen_dataset(estimator, queue=None, target_df=None, dtype=np.float64): - dataset = None - name = estimator.__class__.__name__ - est = PATCHED_MODELS[name] +def gen_dataset_type(est): + # est should be an estimator or estimator class + # dataset initialized to classification, but will be swapped + # for other types as necessary + dataset = "classification" + estimator = est.__class__ if isinstance(est, BaseEstimator) else est + for mixin, _, data in mixin_map: - if issubclass(est, mixin) and data is not None: + if issubclass(estimator, mixin) and data is not None: dataset = data + return dataset + + +_dataset_dict = { + "classification": [partial(load_iris, return_X_y=True)], + "regression": [partial(load_diabetes, return_X_y=True)], +} + + +def gen_dataset( + est, + datasets=_dataset_dict, + sparse=False, + queue=None, + target_df=None, + dtype=None, +): + dataset_type = gen_dataset_type(est) + output = [] # load data - if dataset == "classification" or dataset is None: - X, y = load_iris(return_X_y=True) - elif dataset == "regression": - X, y = load_diabetes(return_X_y=True) - else: - raise ValueError("Unknown dataset type") - - X = _convert_to_dataframe(X, sycl_queue=queue, target_df=target_df, dtype=dtype) - y = _convert_to_dataframe(y, sycl_queue=queue, target_df=target_df, dtype=dtype) - return X, y + flag = dtype is None + + for func in datasets[dataset_type]: + X, y = func() + if flag: + dtype = X.dtype if hasattr(X, "dtype") else np.float64 + + if sparse: + X = sp.csr_matrix(X) + else: + X = _convert_to_dataframe( + X, sycl_queue=queue, target_df=target_df, dtype=dtype + ) + y = _convert_to_dataframe( + y, sycl_queue=queue, target_df=target_df, dtype=dtype + ) + output += [[X, y]] + return output DTYPES = [ diff --git a/sklearnex/tests/test_patching.py b/sklearnex/tests/test_patching.py index 2271c32459..9c2f6cf5c9 100755 --- 
a/sklearnex/tests/test_patching.py +++ b/sklearnex/tests/test_patching.py @@ -148,23 +148,16 @@ def test_standard_estimator_patching(caplog, dataframe, queue, dtype, estimator, and dtype in [np.uint32, np.uint64] ): pytest.skip("Windows segmentation fault for Ridge.predict for unsigned ints") - elif estimator == "IncrementalLinearRegression" and dtype in [ - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - ]: + elif estimator == "IncrementalLinearRegression" and np.issubdtype( + dtype, np.integer + ): pytest.skip( "IncrementalLinearRegression fails on oneDAL side with int types because dataset is filled by zeroes" ) elif method and not hasattr(est, method): pytest.skip(f"sklearn available_if prevents testing {estimator}.{method}") - X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype) + X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] est.fit(X, y) if method: @@ -196,7 +189,7 @@ def test_special_estimator_patching(caplog, dataframe, queue, dtype, estimator, elif dtype == np.float64 and queue and not queue.sycl_device.has_aspect_fp64: pytest.skip("Hardware does not support fp64 SYCL testing") - X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype) + X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] est.fit(X, y) if method and not hasattr(est, method): diff --git a/sklearnex/tests/test_run_to_run_stability.py b/sklearnex/tests/test_run_to_run_stability.py new file mode 100755 index 0000000000..be8e28da92 --- /dev/null +++ b/sklearnex/tests/test_run_to_run_stability.py @@ -0,0 +1,283 @@ +# =============================================================================== +# Copyright 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import random +from collections.abc import Iterable +from functools import partial +from numbers import Number + +import numpy as np +import pytest +from _utils import ( + PATCHED_MODELS, + SPECIAL_INSTANCES, + _sklearn_clone_dict, + gen_dataset, + gen_models_info, +) +from numpy.testing import assert_allclose +from scipy import sparse +from sklearn.datasets import ( + load_breast_cancer, + load_diabetes, + load_iris, + make_classification, + make_regression, +) + +import daal4py as d4p +from onedal.tests.utils._dataframes_support import _as_numpy, get_dataframes_and_queues +from sklearnex.cluster import DBSCAN, KMeans +from sklearnex.decomposition import PCA +from sklearnex.metrics import pairwise_distances, roc_auc_score +from sklearnex.model_selection import train_test_split +from sklearnex.neighbors import ( + KNeighborsClassifier, + KNeighborsRegressor, + NearestNeighbors, +) +from sklearnex.svm import SVC + +# to reproduce errors even in CI +d4p.daalinit(nthreads=100) + +_dataset_dict = { + "classification": [ + partial(load_iris, return_X_y=True), + partial(load_breast_cancer, return_X_y=True), + ], + "regression": [ + partial(load_diabetes, return_X_y=True), + partial( + make_regression, n_samples=500, n_features=10, noise=64.0, random_state=42 + ), + ], +} + + +def eval_method(X, y, est, method): + res = [] + est.fit(X, y) + + if method: + if method != "score": + res = getattr(est, method)(X) + else: + res = est.score(X, y) + + if not isinstance(res, Iterable): + res = [res] + + # if estimator follows sklearn design rules, then set attributes should have a + # trailing underscore + attributes = [ + i + for i in dir(est) + if hasattr(est, i) and not i.startswith("_") and i.endswith("_") + ] + results = [getattr(est, i) for i in attributes] + [_as_numpy(i) for i in res] + attributes += [method for i in res] + return results, attributes + + +def _run_test(estimator, method, datasets): + + for X, y in datasets: + baseline, attributes = eval_method(X, y, estimator, method) + + for i in range(10): + res, _ = eval_method(X, y, estimator, method) + + for r, b, n in zip(res, baseline, attributes): + if ( + isinstance(b, Number) + or hasattr(b, "__array__") + or hasattr(b, "__array_namespace__") + or hasattr(b, "__sycl_usm_ndarray__") + ): + assert_allclose( + r, b, rtol=0.0, atol=0.0, err_msg=str(n + " is incorrect") + ) + + +SPARSE_INSTANCES = _sklearn_clone_dict( + { + str(i): i + for i in [ + SVC(), + KMeans(), + KMeans(init="random"), + ] + } +) + +STABILITY_INSTANCES = _sklearn_clone_dict( + { + str(i): i + for i in [ + KNeighborsClassifier(algorithm="brute", weights="distance"), + KNeighborsClassifier(algorithm="kd_tree", weights="distance"), + KNeighborsClassifier(algorithm="kd_tree"), + KNeighborsRegressor(algorithm="brute", weights="distance"), + KNeighborsRegressor(algorithm="kd_tree", weights="distance"), + KNeighborsRegressor(algorithm="kd_tree"), + NearestNeighbors(algorithm="kd_tree"), + DBSCAN(algorithm="brute"), + PCA(n_components=0.5, svd_solver="covariance_eigh"), + KMeans(init="random"), + ] + } +) + + +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues("numpy")) +@pytest.mark.parametrize("estimator, method", gen_models_info(PATCHED_MODELS)) +def test_standard_estimator_stability(estimator, method, dataframe, queue): + if estimator in ["LogisticRegression", "TSNE"]: + pytest.skip(f"stability not guaranteed for {estimator}") + if "KMeans" in estimator and method == 
"score" and queue == None: + pytest.skip(f"variation observed in KMeans.score") + + est = PATCHED_MODELS[estimator]() + + if method and not hasattr(est, method): + pytest.skip(f"sklearn available_if prevents testing {estimator}.{method}") + + params = est.get_params().copy() + if "random_state" in params: + params["random_state"] = 0 + est.set_params(**params) + + datasets = gen_dataset(est, datasets=_dataset_dict, queue=queue, target_df=dataframe) + _run_test(est, method, datasets) + + +@pytest.mark.allow_sklearn_fallback +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues("numpy")) +@pytest.mark.parametrize("estimator, method", gen_models_info(SPECIAL_INSTANCES)) +def test_special_estimator_stability(estimator, method, dataframe, queue): + if queue is None and estimator in ["LogisticRegression(solver='newton-cg')"]: + pytest.skip(f"stability not guaranteed for {estimator}") + if "KMeans" in estimator and method == "score" and queue == None: + pytest.skip(f"variation observed in KMeans.score") + + est = SPECIAL_INSTANCES[estimator] + + if method and not hasattr(est, method): + pytest.skip(f"sklearn available_if prevents testing {estimator}.{method}") + + params = est.get_params().copy() + if "random_state" in params: + params["random_state"] = 0 + est.set_params(**params) + + datasets = gen_dataset(est, datasets=_dataset_dict, queue=queue, target_df=dataframe) + _run_test(est, method, datasets) + + +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues("numpy")) +@pytest.mark.parametrize("estimator, method", gen_models_info(SPARSE_INSTANCES)) +def test_sparse_estimator_stability(estimator, method, dataframe, queue): + if "KMeans" in estimator and method == "score" and queue == None: + pytest.skip(f"variation observed in KMeans.score") + + est = SPARSE_INSTANCES[estimator] + + if method and not hasattr(est, method): + pytest.skip(f"sklearn available_if prevents testing {estimator}.{method}") + + params = est.get_params().copy() + if "random_state" in params: + params["random_state"] = 0 + est.set_params(**params) + + datasets = gen_dataset( + est, sparse=True, datasets=_dataset_dict, queue=queue, target_df=dataframe + ) + _run_test(est, method, datasets) + + +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues("numpy")) +@pytest.mark.parametrize("estimator, method", gen_models_info(STABILITY_INSTANCES)) +def test_other_estimator_stability(estimator, method, dataframe, queue): + if "KMeans" in estimator and method == "score" and queue == None: + pytest.skip(f"variation observed in KMeans.score") + + est = STABILITY_INSTANCES[estimator] + + if method and not hasattr(est, method): + pytest.skip(f"sklearn available_if prevents testing {estimator}.{method}") + + params = est.get_params().copy() + if "random_state" in params: + params["random_state"] = 0 + est.set_params(**params) + + datasets = gen_dataset(est, datasets=_dataset_dict, queue=queue, target_df=dataframe) + _run_test(est, method, datasets) + + +@pytest.mark.parametrize("features", range(5, 10)) +def test_train_test_split(features): + X, y = make_classification( + n_samples=4000, + n_features=features, + n_informative=features, + n_redundant=0, + n_clusters_per_class=8, + random_state=0, + ) + ( + baseline_X_train, + baseline_X_test, + baseline_y_train, + baseline_y_test, + ) = train_test_split(X, y, test_size=0.33, random_state=0) + baseline = [baseline_X_train, baseline_X_test, baseline_y_train, baseline_y_test] + for _ in range(10): + X_train, X_test, y_train, y_test = 
train_test_split( + X, y, test_size=0.33, random_state=0 + ) + res = [X_train, X_test, y_train, y_test] + for a, b in zip(res, baseline): + np.testing.assert_allclose( + a, b, rtol=0.0, atol=0.0, err_msg=str("train_test_split is incorrect") + ) + + +@pytest.mark.parametrize("metric", ["cosine", "correlation"]) +def test_pairwise_distances(metric): + X = np.random.rand(1000) + X = np.array(X, dtype=np.float64) + baseline = pairwise_distances(X.reshape(1, -1), metric=metric) + for _ in range(5): + res = pairwise_distances(X.reshape(1, -1), metric=metric) + for a, b in zip(res, baseline): + np.testing.assert_allclose( + a, b, rtol=0.0, atol=0.0, err_msg=str("pairwise_distances is incorrect") + ) + + +@pytest.mark.parametrize("array_size", [100, 1000, 10000]) +def test_roc_auc(array_size): + a = [random.randint(0, 1) for i in range(array_size)] + b = [random.randint(0, 1) for i in range(array_size)] + baseline = roc_auc_score(a, b) + for _ in range(5): + res = roc_auc_score(a, b) + np.testing.assert_allclose( + baseline, res, rtol=0.0, atol=0.0, err_msg=str("roc_auc is incorrect") + ) diff --git a/sklearnex/tests/test_run_to_run_stability_tests.py b/sklearnex/tests/test_run_to_run_stability_tests.py deleted file mode 100755 index be67bae1d1..0000000000 --- a/sklearnex/tests/test_run_to_run_stability_tests.py +++ /dev/null @@ -1,428 +0,0 @@ -# =============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
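For context on what the new test_run_to_run_stability.py asserts: with fixed input data and a fixed random_state, repeated fits are expected to produce bit-identical attributes and predictions (assert_allclose with rtol=0.0, atol=0.0). A minimal sketch of that style of check, using a stock scikit-learn estimator purely for illustration:

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_iris

    X, _ = load_iris(return_X_y=True)
    baseline = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X).cluster_centers_
    for _ in range(3):
        rerun = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X).cluster_centers_
        np.testing.assert_allclose(rerun, baseline, rtol=0.0, atol=0.0)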
-# =============================================================================== - -import random - -import numpy as np -import pytest - -import daal4py as d4p -from sklearnex import patch_sklearn - -patch_sklearn() - -from scipy import sparse -from sklearn.cluster import DBSCAN, KMeans -from sklearn.datasets import ( - load_breast_cancer, - load_diabetes, - load_iris, - make_classification, - make_regression, -) -from sklearn.decomposition import PCA -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import ( - ElasticNet, - Lasso, - LinearRegression, - LogisticRegression, - LogisticRegressionCV, - Ridge, -) -from sklearn.manifold import TSNE -from sklearn.metrics import pairwise_distances, roc_auc_score -from sklearn.model_selection import train_test_split -from sklearn.neighbors import ( - KNeighborsClassifier, - KNeighborsRegressor, - LocalOutlierFactor, - NearestNeighbors, -) -from sklearn.svm import SVC, SVR, NuSVC, NuSVR - -from daal4py.sklearn._utils import daal_check_version - -# to reproduce errors even in CI -d4p.daalinit(nthreads=100) - - -def get_class_name(x): - return x.__class__.__name__ - - -def method_processing(X, clf, methods): - res = [] - name = [] - for i in methods: - if i == "predict": - res.append(clf.predict(X)) - name.append(get_class_name(clf) + ".predict(X)") - elif i == "predict_proba": - res.append(clf.predict_proba(X)) - name.append(get_class_name(clf) + ".predict_proba(X)") - elif i == "decision_function": - res.append(clf.decision_function(X)) - name.append(get_class_name(clf) + ".decision_function(X)") - elif i == "kneighbors": - dist, idx = clf.kneighbors(X) - res.append(dist) - name.append("dist") - res.append(idx) - name.append("idx") - elif i == "fit_predict": - predict = clf.fit_predict(X) - res.append(predict) - name.append(get_class_name(clf) + ".fit_predict") - elif i == "fit_transform": - res.append(clf.fit_transform(X)) - name.append(get_class_name(clf) + ".fit_transform") - elif i == "transform": - res.append(clf.transform(X)) - name.append(get_class_name(clf) + ".transform(X)") - elif i == "get_covariance": - res.append(clf.get_covariance()) - name.append(get_class_name(clf) + ".get_covariance()") - elif i == "get_precision": - res.append(clf.get_precision()) - name.append(get_class_name(clf) + ".get_precision()") - elif i == "score_samples": - res.append(clf.score_samples(X)) - name.append(get_class_name(clf) + ".score_samples(X)") - return res, name - - -def func(X, Y, clf, methods): - clf.fit(X, Y) - res, name = method_processing(X, clf, methods) - - for i in clf.__dict__.keys(): - ans = getattr(clf, i) - if isinstance(ans, (bool, float, int, np.ndarray, np.float64)): - if isinstance(ans, np.ndarray) and None in ans: - continue - res.append(ans) - name.append(get_class_name(clf) + "." 
+ i) - return res, name - - -def _run_test(model, methods, dataset): - datasets = [] - if dataset in ["blobs", "classifier", "sparse"]: - X1, y1 = load_iris(return_X_y=True) - if dataset == "sparse": - X1 = sparse.csr_matrix(X1) - datasets.append((X1, y1)) - X2, y2 = load_breast_cancer(return_X_y=True) - if dataset == "sparse": - X2 = sparse.csr_matrix(X2) - datasets.append((X2, y2)) - elif dataset == "regression": - X1, y1 = make_regression( - n_samples=500, n_features=10, noise=64.0, random_state=42 - ) - datasets.append((X1, y1)) - X2, y2 = load_diabetes(return_X_y=True) - datasets.append((X2, y2)) - else: - raise ValueError("Unknown dataset type") - - for X, y in datasets: - baseline, name = func(X, y, model, methods) - for i in range(10): - res, _ = func(X, y, model, methods) - - for a, b, n in zip(res, baseline, name): - np.testing.assert_allclose( - a, b, rtol=0.0, atol=0.0, err_msg=str(n + " is incorrect") - ) - - -MODELS_INFO = [ - { - "model": KNeighborsClassifier( - n_neighbors=10, algorithm="brute", weights="uniform" - ), - "methods": ["predict", "predict_proba", "kneighbors"], - "dataset": "classifier", - }, - { - "model": KNeighborsClassifier( - n_neighbors=10, algorithm="brute", weights="distance" - ), - "methods": ["predict", "predict_proba", "kneighbors"], - "dataset": "classifier", - }, - { - "model": KNeighborsClassifier( - n_neighbors=10, algorithm="kd_tree", weights="uniform" - ), - "methods": ["predict", "predict_proba", "kneighbors"], - "dataset": "classifier", - }, - { - "model": KNeighborsClassifier( - n_neighbors=10, algorithm="kd_tree", weights="distance" - ), - "methods": ["predict", "predict_proba", "kneighbors"], - "dataset": "classifier", - }, - { - "model": KNeighborsRegressor( - n_neighbors=10, algorithm="kd_tree", weights="distance" - ), - "methods": ["predict", "kneighbors"], - "dataset": "regression", - }, - { - "model": KNeighborsRegressor( - n_neighbors=10, algorithm="kd_tree", weights="uniform" - ), - "methods": ["predict", "kneighbors"], - "dataset": "regression", - }, - { - "model": KNeighborsRegressor( - n_neighbors=10, algorithm="brute", weights="distance" - ), - "methods": ["predict", "kneighbors"], - "dataset": "regression", - }, - { - "model": KNeighborsRegressor( - n_neighbors=10, algorithm="brute", weights="uniform" - ), - "methods": ["predict", "kneighbors"], - "dataset": "regression", - }, - { - "model": NearestNeighbors(n_neighbors=10, algorithm="brute"), - "methods": ["kneighbors"], - "dataset": "blobs", - }, - { - "model": NearestNeighbors(n_neighbors=10, algorithm="kd_tree"), - "methods": ["kneighbors"], - "dataset": "blobs", - }, - { - "model": LocalOutlierFactor(n_neighbors=10, novelty=False), - "methods": ["fit_predict"], - "dataset": "blobs", - }, - { - "model": LocalOutlierFactor(n_neighbors=10, novelty=True), - "methods": ["predict"], - "dataset": "blobs", - }, - { - "model": DBSCAN(algorithm="brute", n_jobs=-1), - "methods": [], - "dataset": "blobs", - }, - { - "model": SVC(kernel="rbf"), - "methods": ["predict", "decision_function"], - "dataset": "classifier", - }, - { - "model": SVC(kernel="rbf"), - "methods": ["predict", "decision_function"], - "dataset": "sparse", - }, - { - "model": NuSVC(kernel="rbf"), - "methods": ["predict", "decision_function"], - "dataset": "classifier", - }, - { - "model": SVR(kernel="rbf"), - "methods": ["predict"], - "dataset": "regression", - }, - { - "model": NuSVR(kernel="rbf"), - "methods": ["predict"], - "dataset": "regression", - }, - { - "model": TSNE(random_state=0), - "methods": 
["fit_transform"], - "dataset": "classifier", - }, - { - "model": KMeans(random_state=0, init="k-means++"), - "methods": ["predict"], - "dataset": "blobs", - }, - { - "model": KMeans(random_state=0, init="random"), - "methods": ["predict"], - "dataset": "blobs", - }, - { - "model": KMeans(random_state=0, init="k-means++"), - "methods": ["predict"], - "dataset": "sparse", - }, - { - "model": KMeans(random_state=0, init="random"), - "methods": ["predict"], - "dataset": "sparse", - }, - { - "model": ElasticNet(random_state=0), - "methods": ["predict"], - "dataset": "regression", - }, - { - "model": Lasso(random_state=0), - "methods": ["predict"], - "dataset": "regression", - }, - { - "model": PCA(n_components=0.5, svd_solver="covariance_eigh", random_state=0), - "methods": ["transform", "get_covariance", "get_precision", "score_samples"], - "dataset": "classifier", - }, - { - "model": RandomForestClassifier( - random_state=0, oob_score=True, max_samples=0.5, max_features="sqrt" - ), - "methods": ["predict", "predict_proba"], - "dataset": "classifier", - }, - { - "model": LogisticRegression(random_state=0, solver="newton-cg", max_iter=1000), - "methods": ["predict", "predict_proba"], - "dataset": "classifier", - }, - { - "model": LogisticRegression(random_state=0, solver="lbfgs", max_iter=1000), - "methods": ["predict", "predict_proba"], - "dataset": "classifier", - }, - { - "model": LogisticRegressionCV( - random_state=0, solver="newton-cg", n_jobs=-1, max_iter=1000 - ), - "methods": ["predict", "predict_proba"], - "dataset": "classifier", - }, - { - "model": LogisticRegressionCV( - random_state=0, solver="lbfgs", n_jobs=-1, max_iter=1000 - ), - "methods": ["predict", "predict_proba"], - "dataset": "classifier", - }, - { - "model": RandomForestRegressor( - random_state=0, oob_score=True, max_samples=0.5, max_features="sqrt" - ), - "methods": ["predict"], - "dataset": "regression", - }, - { - "model": LinearRegression(), - "methods": ["predict"], - "dataset": "regression", - }, - { - "model": Ridge(random_state=0), - "methods": ["predict"], - "dataset": "regression", - }, -] - -TO_SKIP = [ - "TSNE", # Absolute diff is 1e-10, potential problem in KNN, - # will be fixed for next release. (UPD. KNN is fixed but there is a problem - # with stability of stock sklearn. 
It is already stable in master, so, we - # need to wait for the next sklearn release) - "LogisticRegression", # Absolute diff is 1e-8, will be fixed for next release - "LogisticRegressionCV", # Absolute diff is 1e-10, will be fixed for next release - "RandomForestRegressor", # Absolute diff is 1e-14 in OOB score, - # will be fixed for next release -] - - -@pytest.mark.parametrize("model_head", MODELS_INFO) -def test_models(model_head): - stable_algos = [] - if get_class_name(model_head["model"]) in stable_algos and daal_check_version( - (2021, "P", 300) - ): - try: - TO_SKIP.remove(get_class_name(model_head["model"])) - except ValueError: - pass - if get_class_name(model_head["model"]) in TO_SKIP: - pytest.skip("Unstable", allow_module_level=False) - _run_test(model_head["model"], model_head["methods"], model_head["dataset"]) - - -@pytest.mark.parametrize("features", range(5, 10)) -def test_train_test_split(features): - X, y = make_classification( - n_samples=4000, - n_features=features, - n_informative=features, - n_redundant=0, - n_clusters_per_class=8, - random_state=0, - ) - ( - baseline_X_train, - baseline_X_test, - baseline_y_train, - baseline_y_test, - ) = train_test_split(X, y, test_size=0.33, random_state=0) - baseline = [baseline_X_train, baseline_X_test, baseline_y_train, baseline_y_test] - for _ in range(10): - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.33, random_state=0 - ) - res = [X_train, X_test, y_train, y_test] - for a, b in zip(res, baseline): - np.testing.assert_allclose( - a, b, rtol=0.0, atol=0.0, err_msg=str("train_test_split is incorrect") - ) - - -@pytest.mark.parametrize("metric", ["cosine", "correlation"]) -def test_pairwise_distances(metric): - X = np.random.rand(1000) - X = np.array(X, dtype=np.float64) - baseline = pairwise_distances(X.reshape(1, -1), metric=metric) - for _ in range(5): - res = pairwise_distances(X.reshape(1, -1), metric=metric) - for a, b in zip(res, baseline): - np.testing.assert_allclose( - a, b, rtol=0.0, atol=0.0, err_msg=str("pairwise_distances is incorrect") - ) - - -@pytest.mark.parametrize("array_size", [100, 1000, 10000]) -def test_roc_auc(array_size): - a = [random.randint(0, 1) for i in range(array_size)] - b = [random.randint(0, 1) for i in range(array_size)] - baseline = roc_auc_score(a, b) - for _ in range(5): - res = roc_auc_score(a, b) - np.testing.assert_allclose( - baseline, res, rtol=0.0, atol=0.0, err_msg=str("roc_auc is incorrect") - ) From 33845eaa6199e972553f1a5bbe07c1cb1cfe4d3c Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 10 Jun 2024 15:23:22 +0100 Subject: [PATCH 17/75] Add sklearn 1.5 to CI matrix (#1859) * Add sklearn 1.5 to CI * Deselect PCA solver selection tests * Update LogisticRegression parameter * Remove py38 from CI matrix * Fix for global patch: add temp. 
_threadpool_controller in dispatcher * Change pytest versions * Update test reqs * Remove unnecessary check * Set skl 1.0 py310 in CI matrix and revert some previous changes * Change sklearn 1.0 python version * Skip dpctl/dpnp install for sklearn 1.0 * CHange pytest versions --- .ci/pipeline/build-and-test-lnx.yml | 2 +- .ci/pipeline/ci.yml | 14 ++++++++++---- deselected_tests.yaml | 5 +++++ requirements-test.txt | 6 +++--- sklearnex/glob/dispatcher.py | 18 ++++++++++++++++-- sklearnex/linear_model/logistic_regression.py | 2 +- 6 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml index a31c17d4c1..4aa56b4e72 100644 --- a/.ci/pipeline/build-and-test-lnx.yml +++ b/.ci/pipeline/build-and-test-lnx.yml @@ -46,7 +46,7 @@ steps: bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION) pip install --upgrade -r requirements-test.txt pip install $(python .ci/scripts/get_compatible_scipy_version.py) - if [ $(echo $(PYTHON_VERSION) | grep '3.9\|3.10') ]; then conda install -q -y -c intel dpctl=0.16.0 dpnp=0.14.0; fi + if [ $(echo $(PYTHON_VERSION) | grep '3.9\|3.10') ] && [ $(SKLEARN_VERSION) != "1.0" ]; then conda install -q -y -c intel dpctl=0.16.0 dpnp=0.14.0; fi pip list displayName: "Install testing requirements" - script: | diff --git a/.ci/pipeline/ci.yml b/.ci/pipeline/ci.yml index 42613ddc59..3a71b3323e 100644 --- a/.ci/pipeline/ci.yml +++ b/.ci/pipeline/ci.yml @@ -60,8 +60,8 @@ jobs: timeoutInMinutes: 120 strategy: matrix: - Python3.8_Sklearn1.0: - PYTHON_VERSION: '3.8' + Python3.9_Sklearn1.0: + PYTHON_VERSION: '3.9' SKLEARN_VERSION: '1.0' Python3.9_Sklearn1.1: PYTHON_VERSION: '3.9' @@ -75,6 +75,9 @@ jobs: Python3.12_Sklearn1.4: PYTHON_VERSION: '3.12' SKLEARN_VERSION: '1.4' + Python3.12_Sklearn1.5: + PYTHON_VERSION: '3.12' + SKLEARN_VERSION: '1.5' pool: vmImage: 'ubuntu-22.04' steps: @@ -84,8 +87,8 @@ jobs: timeoutInMinutes: 120 strategy: matrix: - Python3.8_Sklearn1.0: - PYTHON_VERSION: '3.8' + Python3.9_Sklearn1.0: + PYTHON_VERSION: '3.9' SKLEARN_VERSION: '1.0' Python3.9_Sklearn1.1: PYTHON_VERSION: '3.9' @@ -99,6 +102,9 @@ jobs: Python3.12_Sklearn1.4: PYTHON_VERSION: '3.12' SKLEARN_VERSION: '1.4' + Python3.12_Sklearn1.5: + PYTHON_VERSION: '3.12' + SKLEARN_VERSION: '1.5' pool: vmImage: 'windows-latest' steps: diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 0dd53be456..cd153b1f71 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -29,6 +29,11 @@ deselected_tests: - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-kulsinski] <1.3 - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[kulsinski] <1.3 + # sklearnex PCA always chooses "covariance_eigh" solver instead of "full" when solver="auto" + # resulting in solver assignment check failure for sklearn version >= 1.5 + - decomposition/tests/test_pca.py::test_pca_svd_solver_auto[1000-500-400-full] >=1.5 + - decomposition/tests/test_pca.py::test_pca_svd_solver_auto[1000-500-0.5-full] >=1.5 + # test for KMeans FutureWarning is not removed from sklearn tests suit yet - cluster/tests/test_k_means.py::test_change_n_init_future_warning[KMeans-10] ==1.4.dev0 diff --git a/requirements-test.txt b/requirements-test.txt index cc70aa05b0..247ab2be08 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,11 +1,11 @@ -pytest==7.4.4 ; python_version == '3.8' -pytest==8.2.2 ; python_version >= '3.9' +pytest==7.4.4 ; python_version <= '3.9' +pytest==8.2.2 ; python_version >= '3.10' 
numpy>=1.19.5 ; python_version <= '3.9' numpy>=1.21.6 ; python_version == '3.10' numpy>=1.23.5 ; python_version == '3.11' numpy>=2.0.0rc2 ; python_version >= '3.12' scikit-learn==1.2.2 ; python_version == '3.8' -scikit-learn==1.4.2 ; python_version >= '3.9' +scikit-learn==1.5.0 ; python_version >= '3.9' pandas==2.0.3 ; python_version == '3.8' pandas==2.1.3 ; python_version >= '3.9' and python_version < '3.11' pandas==2.2.2 ; python_version >= '3.11' diff --git a/sklearnex/glob/dispatcher.py b/sklearnex/glob/dispatcher.py index a78586f39f..47967d63e3 100755 --- a/sklearnex/glob/dispatcher.py +++ b/sklearnex/glob/dispatcher.py @@ -17,18 +17,32 @@ def get_patch_str(name=None, verbose=True): return f"""try: + # TEMP. FIX: sklearnex.patch_sklearn imports sklearn beforehand + # when it didn't initialized _threadpool_controller required for + # pairwise distances dispatching during imports. + # Manually setting and deleting _threadpool_controller during patch fixes it. + import sklearn + from threadpoolctl import ThreadpoolController + sklearn._threadpool_controller = ThreadpoolController() from sklearnex import patch_sklearn patch_sklearn(name={str(name)}, verbose={str(verbose)}) - del patch_sklearn + del patch_sklearn, sklearn._threadpool_controller except ImportError: pass""" def get_patch_str_re(): return r"""\ntry: + \# TEMP. FIX: sklearnex.patch_sklearn imports sklearn beforehand + \# when it didn't initialized _threadpool_controller required for + \# pairwise distances dispatching during imports. + \# Manually setting and deleting _threadpool_controller during patch fixes it. + import sklearn + from threadpoolctl import ThreadpoolController + sklearn._threadpool_controller = ThreadpoolController\(\) from sklearnex import patch_sklearn patch_sklearn\(name=.*, verbose=.*\) - del patch_sklearn + del patch_sklearn, sklearn._threadpool_controller except ImportError: pass\n""" diff --git a/sklearnex/linear_model/logistic_regression.py b/sklearnex/linear_model/logistic_regression.py index 9dbc95feb0..6e1883f87f 100644 --- a/sklearnex/linear_model/logistic_regression.py +++ b/sklearnex/linear_model/logistic_regression.py @@ -82,7 +82,7 @@ def __init__( random_state=None, solver="lbfgs", max_iter=100, - multi_class="auto", + multi_class="deprecated" if sklearn_check_version("1.5") else "auto", verbose=0, warm_start=False, n_jobs=None, From 9644eb487e0ebc392f070b93dcc55daf4a85e8ec Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 10 Jun 2024 15:23:34 +0100 Subject: [PATCH 18/75] Update dependency cmake to v3.29.5 (#1858) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- dependencies-dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies-dev b/dependencies-dev index d30c78447a..9156827cc9 100644 --- a/dependencies-dev +++ b/dependencies-dev @@ -3,4 +3,4 @@ Jinja2==3.1.4 numpy==1.19.5 ; python_version < '3.9' numpy==2.0.0rc2 ; python_version >= '3.9' pybind11==2.12.0 -cmake==3.29.3 +cmake==3.29.5 From 563c65a41f16013b8aebad9fb9190dbb08440806 Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:48:43 -0700 Subject: [PATCH 19/75] CI: Initial additions for fp32 and windows GPU test support (#1778) * debug * brute force moments cpp * remove debug * debug * oops * remove debug * require lnx for spmd examples * tolerance updates * minor threshold revisions * trying PCA fix * revert last check * address current tolerance/fp64 
fails * lint * additional small fixes * minor inclinreg y dtype * forest test skips * skip windows gpu logreg * logreg and forest adjustments * et regressor gpu skip * lint * Update onedal/cluster/tests/test_kmeans_init.py Co-authored-by: Samir Nasibli * remove multiple assert_all_finite calls * removing logreg skips due to resolution * add convert_to_supported for svm * pca dtype derived from results * add forgotten queue --------- Co-authored-by: Samir Nasibli --- .../incremental_basic_statistics.py | 4 +++- onedal/cluster/tests/test_kmeans_init.py | 2 ++ .../tests/test_incremental_pca.py | 6 +++--- .../linear_model/incremental_linear_model.py | 12 ++++------- .../primitives/tests/test_kernel_functions.py | 3 ++- onedal/svm/svm.py | 5 ++++- sklearnex/decomposition/tests/test_pca.py | 6 ++++-- sklearnex/ensemble/tests/test_forest.py | 11 ++++++++-- sklearnex/linear_model/incremental_linear.py | 3 +++ .../tests/test_incremental_linear.py | 20 +++++++++---------- sklearnex/linear_model/tests/test_linear.py | 4 ++-- .../tests/test_incremental_pca.py | 4 ++-- tests/run_examples.py | 17 +++++++++++++++- 13 files changed, 64 insertions(+), 33 deletions(-) diff --git a/onedal/basic_statistics/incremental_basic_statistics.py b/onedal/basic_statistics/incremental_basic_statistics.py index df073b55ab..eb77625628 100644 --- a/onedal/basic_statistics/incremental_basic_statistics.py +++ b/onedal/basic_statistics/incremental_basic_statistics.py @@ -138,11 +138,13 @@ def partial_fit(self, X, weights=None, queue=None): """ if not hasattr(self, "_policy"): self._policy = self._get_policy(queue, X) + + X, weights = _convert_to_supported(self._policy, X, weights) + if not hasattr(self, "_onedal_params"): dtype = get_dtype(X) self._onedal_params = self._get_onedal_params(dtype) - X, weights = _convert_to_supported(self._policy, X, weights) X_table, weights_table = to_table(X, weights) self._partial_result = _backend.basic_statistics.compute.partial_compute( self._policy, diff --git a/onedal/cluster/tests/test_kmeans_init.py b/onedal/cluster/tests/test_kmeans_init.py index 932918aa53..97c5483b23 100755 --- a/onedal/cluster/tests/test_kmeans_init.py +++ b/onedal/cluster/tests/test_kmeans_init.py @@ -85,6 +85,8 @@ def test_generated_dataset(queue, dtype, n_dim, n_cluster): d, i = nn.fit(rs_centroids).kneighbors(cs) # We have applied 2 sigma rule once desired_accuracy = int(0.9973 * n_cluster) + if d.dtype == np.float64: + desired_accuracy = desired_accuracy - 1 correctness = d.reshape(-1) <= (vs * 3) exp_accuracy = np.count_nonzero(correctness) diff --git a/onedal/decomposition/tests/test_incremental_pca.py b/onedal/decomposition/tests/test_incremental_pca.py index f22991b055..f2054c210b 100644 --- a/onedal/decomposition/tests/test_incremental_pca.py +++ b/onedal/decomposition/tests/test_incremental_pca.py @@ -73,7 +73,7 @@ def test_on_gold_data(queue, is_deterministic, whiten, num_blocks, dtype): ) tol = 1e-7 - if dtype == np.float32: + if transformed_data.dtype == np.float32: tol = 7e-6 if whiten else 1e-6 assert result.n_components_ == expected_n_components_ @@ -127,8 +127,8 @@ def test_on_random_data( incpca.finalize_fit() - transformed_data = incpca.predict(X) - tol = 3e-3 if dtype == np.float32 else 2e-6 + transformed_data = incpca.predict(X, queue=queue) + tol = 3e-3 if transformed_data.dtype == np.float32 else 2e-6 n_components = incpca.n_components_ expected_n_samples_seen = X.shape[0] diff --git a/onedal/linear_model/incremental_linear_model.py b/onedal/linear_model/incremental_linear_model.py index 
7557e7f66b..b8b754e18f 100644 --- a/onedal/linear_model/incremental_linear_model.py +++ b/onedal/linear_model/incremental_linear_model.py @@ -77,22 +77,18 @@ def partial_fit(self, X, y, queue=None): if not hasattr(self, "_policy"): self._policy = self._get_policy(queue, X) + X, y = _convert_to_supported(self._policy, X, y) + if not hasattr(self, "_dtype"): self._dtype = get_dtype(X) self._params = self._get_onedal_params(self._dtype) - if self._dtype not in [np.float32, np.float64]: - self._dtype = np.float64 - - X = X.astype(self._dtype, copy=self.copy_X) - y = y.astype(dtype=self._dtype) + y = np.asarray(y).astype(dtype=self._dtype) self._y_ndim_1 = y.ndim == 1 - X, y = _check_X_y(X, y, force_all_finite=False, accept_2d_y=True) + X, y = _check_X_y(X, y, dtype=[np.float64, np.float32], accept_2d_y=True) self.n_features_in_ = _num_features(X, fallback_1d=True) - - X, y = _convert_to_supported(self._policy, X, y) X_table, y_table = to_table(X, y) hparams = get_hyperparameters("linear_regression", "train") if hparams is not None and not hparams.is_default: diff --git a/onedal/primitives/tests/test_kernel_functions.py b/onedal/primitives/tests/test_kernel_functions.py index de9f5921dd..661f3b8698 100644 --- a/onedal/primitives/tests/test_kernel_functions.py +++ b/onedal/primitives/tests/test_kernel_functions.py @@ -66,7 +66,8 @@ def test_dense_self_rbf_kernel(queue): result = rbf_kernel(X, queue=queue) expected = sklearn_rbf_kernel(X) - assert_allclose(result, expected, rtol=1e-14) + tol = 1e-5 if result.dtype == np.float32 else 1e-14 + assert_allclose(result, expected, rtol=tol) def _test_dense_small_rbf_kernel(queue, gamma, dtype): diff --git a/onedal/svm/svm.py b/onedal/svm/svm.py index 6f6a46fac8..41f476e683 100644 --- a/onedal/svm/svm.py +++ b/onedal/svm/svm.py @@ -25,7 +25,7 @@ from ..common._estimator_checks import _check_is_fitted from ..common._mixin import ClassifierMixin, RegressorMixin from ..common._policy import _get_policy -from ..datatypes import from_table, to_table +from ..datatypes import _convert_to_supported, from_table, to_table from ..utils import ( _check_array, _check_n_features, @@ -174,6 +174,7 @@ def _fit(self, X, y, sample_weight, module, queue): self._scale_, self._sigma_ = _gamma, np.sqrt(0.5 / _gamma) policy = _get_policy(queue, *data) + X = _convert_to_supported(policy, X) params = self._get_onedal_params(X) result = module.train(policy, params, *to_table(*data)) @@ -252,6 +253,7 @@ def _predict(self, X, module, queue): ) policy = _get_policy(queue, X) + X = _convert_to_supported(policy, X) params = self._get_onedal_params(X) if hasattr(self, "_onedal_model"): @@ -308,6 +310,7 @@ def _decision_function(self, X, module, queue): ) policy = _get_policy(queue, X) + X = _convert_to_supported(policy, X) params = self._get_onedal_params(X) if hasattr(self, "_onedal_model"): diff --git a/sklearnex/decomposition/tests/test_pca.py b/sklearnex/decomposition/tests/test_pca.py index 4e4ff91d11..5f8270d80c 100755 --- a/sklearnex/decomposition/tests/test_pca.py +++ b/sklearnex/decomposition/tests/test_pca.py @@ -51,6 +51,8 @@ def test_sklearnex_import(dataframe, queue): assert hasattr(pca, "_onedal_estimator") else: assert "daal4py" in pca.__module__ + + tol = 1e-5 if _as_numpy(X_transformed).dtype == np.float32 else 1e-7 assert_allclose([6.30061232, 0.54980396], _as_numpy(pca.singular_values_)) - assert_allclose(X_transformed_expected, _as_numpy(X_transformed)) - assert_allclose(X_transformed_expected, _as_numpy(X_fit_transformed)) + assert_allclose(X_transformed_expected, 
_as_numpy(X_transformed), rtol=tol) + assert_allclose(X_transformed_expected, _as_numpy(X_fit_transformed), rtol=tol) diff --git a/sklearnex/ensemble/tests/test_forest.py b/sklearnex/ensemble/tests/test_forest.py index a1d30b4d93..80e0e1f61b 100644 --- a/sklearnex/ensemble/tests/test_forest.py +++ b/sklearnex/ensemble/tests/test_forest.py @@ -14,6 +14,7 @@ # limitations under the License. # =============================================================================== +import numpy as np import pytest from numpy.testing import assert_allclose from sklearn.datasets import make_classification, make_regression @@ -45,7 +46,10 @@ def test_sklearnex_import_rf_classifier(dataframe, queue): assert_allclose([1], _as_numpy(rf.predict([[0, 0, 0, 0]]))) -@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +# TODO: fix RF regressor predict for the GPU sycl_queue. +@pytest.mark.parametrize( + "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") +) def test_sklearnex_import_rf_regression(dataframe, queue): from sklearnex.ensemble import RandomForestRegressor @@ -65,7 +69,10 @@ def test_sklearnex_import_rf_regression(dataframe, queue): assert_allclose([-6.839], pred, atol=1e-2) -@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +# TODO: fix ET classifier predict for the GPU sycl_queue. +@pytest.mark.parametrize( + "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") +) def test_sklearnex_import_et_classifier(dataframe, queue): from sklearnex.ensemble import ExtraTreesClassifier diff --git a/sklearnex/linear_model/incremental_linear.py b/sklearnex/linear_model/incremental_linear.py index 2a56c71072..fdcc418f3f 100644 --- a/sklearnex/linear_model/incremental_linear.py +++ b/sklearnex/linear_model/incremental_linear.py @@ -167,18 +167,21 @@ def _onedal_partial_fit(self, X, y, queue=None): reset=first_pass, copy=self.copy_X, multi_output=True, + force_all_finite=False, ) else: X = check_array( X, dtype=[np.float64, np.float32], copy=self.copy_X, + force_all_finite=False, ) y = check_array( y, dtype=[np.float64, np.float32], copy=False, ensure_2d=False, + force_all_finite=False, ) if first_pass: diff --git a/sklearnex/linear_model/tests/test_incremental_linear.py b/sklearnex/linear_model/tests/test_incremental_linear.py index 2f77fa45d0..54c33239ee 100644 --- a/sklearnex/linear_model/tests/test_incremental_linear.py +++ b/sklearnex/linear_model/tests/test_incremental_linear.py @@ -47,7 +47,7 @@ def test_sklearnex_fit_on_gold_data(dataframe, queue, fit_intercept, macro_block y_pred = inclin.predict(X_df) - tol = 2e-6 if dtype == np.float32 else 1e-7 + tol = 2e-6 if y_pred.dtype == np.float32 else 1e-7 assert_allclose(inclin.coef_, [1], atol=tol) if fit_intercept: assert_allclose(inclin.intercept_, [0], atol=tol) @@ -82,15 +82,15 @@ def test_sklearnex_partial_fit_on_gold_data( ) inclin.partial_fit(X_split_df, y_split_df) + X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y_pred = inclin.predict(X_df) + assert inclin.n_features_in_ == 1 - tol = 2e-6 if dtype == np.float32 else 1e-7 + tol = 2e-6 if y_pred.dtype == np.float32 else 1e-7 assert_allclose(inclin.coef_, [[1]], atol=tol) if fit_intercept: assert_allclose(inclin.intercept_, 3, atol=tol) - X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) - y_pred = inclin.predict(X_df) - assert_allclose(_as_numpy(y_pred), y, atol=tol) @@ -122,15 +122,15 @@ def test_sklearnex_partial_fit_multitarget_on_gold_data( ) inclin.partial_fit(X_split_df, 
y_split_df) + X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y_pred = inclin.predict(X_df) + assert inclin.n_features_in_ == 2 - tol = 7e-6 if dtype == np.float32 else 1e-7 + tol = 7e-6 if y_pred.dtype == np.float32 else 1e-7 assert_allclose(inclin.coef_, [1.0, 2.0], atol=tol) if fit_intercept: assert_allclose(inclin.intercept_, 3.0, atol=tol) - X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) - y_pred = inclin.predict(X_df) - assert_allclose(_as_numpy(y_pred), y, atol=tol) @@ -181,7 +181,7 @@ def test_sklearnex_partial_fit_on_random_data( ) inclin.partial_fit(X_split_df, y_split_df) - tol = 1e-4 if dtype == np.float32 else 1e-7 + tol = 1e-4 if inclin.coef_.dtype == np.float32 else 1e-7 assert_allclose(coef, inclin.coef_.T, atol=tol) if fit_intercept: diff --git a/sklearnex/linear_model/tests/test_linear.py b/sklearnex/linear_model/tests/test_linear.py index c3fc9c0042..b46d2ab315 100644 --- a/sklearnex/linear_model/tests/test_linear.py +++ b/sklearnex/linear_model/tests/test_linear.py @@ -52,7 +52,7 @@ def test_sklearnex_import_linear(dataframe, queue, dtype, macro_block): assert "sklearnex" in linreg.__module__ assert linreg.n_features_in_ == 2 - tol = 1e-5 if dtype == np.float32 else 1e-7 + tol = 1e-5 if _as_numpy(linreg.coef_).dtype == np.float32 else 1e-7 assert_allclose(_as_numpy(linreg.intercept_), 3.0, rtol=tol) assert_allclose(_as_numpy(linreg.coef_), [1.0, 2.0], rtol=tol) @@ -113,5 +113,5 @@ def test_sklearnex_reconstruct_model(dataframe, queue, dtype): y_pred = linreg.predict(X) - tol = 1e-5 if dtype == np.float32 else 1e-7 + tol = 1e-5 if _as_numpy(y_pred).dtype == np.float32 else 1e-7 assert_allclose(gtr, _as_numpy(y_pred), rtol=tol) diff --git a/sklearnex/preview/decomposition/tests/test_incremental_pca.py b/sklearnex/preview/decomposition/tests/test_incremental_pca.py index 67929bfac8..786ae4fef0 100644 --- a/sklearnex/preview/decomposition/tests/test_incremental_pca.py +++ b/sklearnex/preview/decomposition/tests/test_incremental_pca.py @@ -74,7 +74,7 @@ def check_pca_on_gold_data(incpca, dtype, whiten, transformed_data): ) tol = 1e-7 - if dtype == np.float32: + if transformed_data.dtype == np.float32: tol = 7e-6 if whiten else 1e-6 assert incpca.n_samples_seen_ == expected_n_samples_seen_ @@ -112,7 +112,7 @@ def check_pca_on_gold_data(incpca, dtype, whiten, transformed_data): def check_pca(incpca, dtype, whiten, data, transformed_data): - tol = 3e-3 if dtype == np.float32 else 2e-6 + tol = 3e-3 if transformed_data.dtype == np.float32 else 2e-6 n_components = incpca.n_components_ diff --git a/tests/run_examples.py b/tests/run_examples.py index 57fb92cce1..71f3fede0a 100755 --- a/tests/run_examples.py +++ b/tests/run_examples.py @@ -176,7 +176,6 @@ def check_library(rule): req_library["basic_statistics_spmd.py"] = ["dpctl", "mpi4py"] req_library["covariance_spmd.py"] = ["dpctl", "mpi4py"] req_library["dbscan_spmd.py"] = ["dpctl", "mpi4py"] -req_library["basic_statistics_spmd.py"] = ["dpctl", "mpi4py"] req_library["incremental_basic_statistics_dpctl.py"] = ["dpctl"] req_library["incremental_linear_regression_dpctl.py"] = ["dpctl"] req_library["incremental_pca_dpctl.py"] = ["dpctl"] @@ -193,6 +192,20 @@ def check_library(rule): req_library["random_forest_regressor_spmd.py"] = ["dpctl", "dpnp", "mpi4py"] req_os = defaultdict(lambda: []) +req_os["basic_statistics_spmd.py"] = ["lnx"] +req_os["covariance_spmd.py"] = ["lnx"] +req_os["dbscan_spmd.py"] = ["lnx"] +req_os["kmeans_spmd.py"] = ["lnx"] +req_os["knn_bf_classification_dpnp.py"] 
= ["lnx"] +req_os["knn_bf_classification_spmd.py"] = ["lnx"] +req_os["knn_bf_regression_spmd.py"] = ["lnx"] +req_os["linear_regression_spmd.py"] = ["lnx"] +req_os["logistic_regression_spmd.py"] = ["lnx"] +req_os["pca_spmd.py"] = ["lnx"] +req_os["random_forest_classifier_dpctl.py"] = ["lnx"] +req_os["random_forest_classifier_spmd.py"] = ["lnx"] +req_os["random_forest_regressor_dpnp.py"] = ["lnx"] +req_os["random_forest_regressor_spmd.py"] = ["lnx"] skiped_files = [] @@ -229,6 +242,8 @@ def get_exe_cmd(ex, args): return None if not check_library(req_library[os.path.basename(ex)]): return None + if not check_os(req_os[os.path.basename(ex)], system_os): + return None if not args.nodist and ex.endswith("spmd.py"): if IS_WIN: return 'mpiexec -localonly -n 4 "' + sys.executable + '" "' + ex + '"' From dd99fcd863293e361c047bc40da28fe532a6a07c Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 11 Jun 2024 17:25:37 +0200 Subject: [PATCH 20/75] [testing] deselect PCA from test_run_to_run_stability::test_standard_estimator_stability (#1860) * Update test_run_to_run_stability.py * update formatting --- sklearnex/tests/test_run_to_run_stability.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/tests/test_run_to_run_stability.py b/sklearnex/tests/test_run_to_run_stability.py index be8e28da92..9e4a670fdf 100755 --- a/sklearnex/tests/test_run_to_run_stability.py +++ b/sklearnex/tests/test_run_to_run_stability.py @@ -148,8 +148,8 @@ def _run_test(estimator, method, datasets): def test_standard_estimator_stability(estimator, method, dataframe, queue): if estimator in ["LogisticRegression", "TSNE"]: pytest.skip(f"stability not guaranteed for {estimator}") - if "KMeans" in estimator and method == "score" and queue == None: - pytest.skip(f"variation observed in KMeans.score") + if estimator in ["KMeans", "PCA"] and method == "score" and queue == None: + pytest.skip(f"variation observed in {estimator}.score") est = PATCHED_MODELS[estimator]() From 7e9d2603456f0bd098d9c0b8128c63a215c0e883 Mon Sep 17 00:00:00 2001 From: Victoriya Fedotova Date: Wed, 12 Jun 2024 11:28:07 +0200 Subject: [PATCH 21/75] Add CSR data support into BasicStatistics algorithm (#1846) Support of csr_matrix and csr_array inputs is added into onedal.BasicStatistics algorithm. The respective unit tests are implemented. 
--- .ci/scripts/get_compatible_scipy_version.py | 10 ++- onedal/basic_statistics/basic_statistics.cpp | 17 ++++- onedal/basic_statistics/basic_statistics.py | 22 ++++-- .../tests/test_basic_statistics.py | 68 +++++++++++++++++++ onedal/cluster/kmeans.py | 4 +- onedal/datatypes/_data_conversion.py | 6 +- onedal/utils/__init__.py | 2 + onedal/utils/validation.py | 7 ++ 8 files changed, 123 insertions(+), 13 deletions(-) diff --git a/.ci/scripts/get_compatible_scipy_version.py b/.ci/scripts/get_compatible_scipy_version.py index 93a8b5d7d5..82ec0a83fa 100644 --- a/.ci/scripts/get_compatible_scipy_version.py +++ b/.ci/scripts/get_compatible_scipy_version.py @@ -23,9 +23,15 @@ print("Scipy version is not specified for this sklearn/python version.", file=stderr) print("scipy") elif sklearn_check_version("1.3") or python_version[1] > 11: - print("scipy==1.11.*") + if python_version[1] > 8: + print("scipy==1.12.*") + else: + print("scipy==1.11.*") elif sklearn_check_version("1.2") or python_version[1] > 10: - print("scipy==1.9.*") + if python_version[1] > 9: + print("scipy==1.12.*") + else: + print("scipy==1.9.*") elif sklearn_check_version("1.1"): print("scipy==1.8.*") elif sklearn_check_version("1.0"): diff --git a/onedal/basic_statistics/basic_statistics.cpp b/onedal/basic_statistics/basic_statistics.cpp index 6801f84296..35805a78ac 100644 --- a/onedal/basic_statistics/basic_statistics.cpp +++ b/onedal/basic_statistics/basic_statistics.cpp @@ -41,6 +41,7 @@ struct method2t { const auto method = params["method"].cast(); ONEDAL_PARAM_DISPATCH_VALUE(method, "dense", ops, Float, method::dense); + ONEDAL_PARAM_DISPATCH_VALUE(method, "sparse", ops, Float, method::sparse); ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } @@ -107,6 +108,17 @@ auto get_onedal_result_options(const py::dict& params) { } struct params2desc { + template + auto operator()(const py::dict& params) { + auto desc = dal::basic_statistics::descriptor() + .set_result_options(get_onedal_result_options(params)); + return desc; + } +}; + +/// Only dense method is supported by incremental basic statistics +struct params2desc_incremental { template auto operator()(const py::dict& params) { auto desc = dal::basic_statistics::descriptor struct init_compute_ops_dispatcher { void operator()(py::module_& m) { using Task = dal::basic_statistics::task::compute; + m.def("train", [](const Policy& policy, const py::dict& params, @@ -148,7 +161,7 @@ void init_partial_compute_ops(py::module& m) { const table& weights) { using namespace dal::basic_statistics; using input_t = partial_compute_input; - partial_compute_ops ops(policy, input_t{ prev, data, weights }, params2desc{}); + partial_compute_ops ops(policy, input_t{ prev, data, weights }, params2desc_incremental{}); return fptype2t{ method2t{ Task{}, ops } }(params); } ); @@ -159,7 +172,7 @@ void init_finalize_compute_ops(pybind11::module_& m) { using namespace dal::basic_statistics; using input_t = partial_compute_result; m.def("finalize_compute", [](const Policy& policy, const pybind11::dict& params, const input_t& data) { - finalize_compute_ops ops(policy, data, params2desc{}); + finalize_compute_ops ops(policy, data, params2desc_incremental{}); return fptype2t{ method2t{ Task{}, ops } }(params); }); } diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 852c71dd20..c1c121d932 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ 
b/onedal/basic_statistics/basic_statistics.py @@ -23,6 +23,7 @@ from ..common._base import BaseEstimator from ..datatypes import _convert_to_supported, from_table, to_table +from ..utils import _is_csr class BaseBasicStatistics(metaclass=ABCMeta): @@ -54,16 +55,18 @@ def _get_result_options(self, options): assert isinstance(options, str) return options - def _get_onedal_params(self, dtype=np.float32): + def _get_onedal_params(self, is_csr, dtype=np.float32): options = self._get_result_options(self.options) return { "fptype": "float" if dtype == np.float32 else "double", - "method": self.algorithm, + "method": "sparse" if is_csr else self.algorithm, "result_option": options, } - def _compute_raw(self, data_table, weights_table, module, policy, dtype=np.float32): - params = self._get_onedal_params(dtype) + def _compute_raw( + self, data_table, weights_table, module, policy, dtype=np.float32, is_csr=False + ): + params = self._get_onedal_params(is_csr, dtype) result = module.train(policy, params, data_table, weights_table) @@ -75,8 +78,10 @@ def _compute_raw(self, data_table, weights_table, module, policy, dtype=np.float def _compute(self, data, weights, module, queue): policy = self._get_policy(queue, data, weights) - if not (data is None): + is_csr = _is_csr(data) + if not (data is None) and not is_csr: data = np.asarray(data) + if not (weights is None): weights = np.asarray(weights) @@ -85,7 +90,7 @@ def _compute(self, data, weights, module, queue): data_table, weights_table = to_table(data, weights) dtype = data.dtype - res = self._compute_raw(data_table, weights_table, module, policy, dtype) + res = self._compute_raw(data_table, weights_table, module, policy, dtype, is_csr) return {k: from_table(v).ravel() for k, v in res.items()} @@ -103,11 +108,14 @@ def compute(self, data, weights=None, queue=None): data, weights, self._get_backend("basic_statistics", "compute", None), queue ) - def compute_raw(self, data_table, weights_table, policy, dtype=np.float32): + def compute_raw( + self, data_table, weights_table, policy, dtype=np.float32, is_csr=False + ): return super()._compute_raw( data_table, weights_table, self._get_backend("basic_statistics", "compute", None), policy, dtype, + is_csr, ) diff --git a/onedal/basic_statistics/tests/test_basic_statistics.py b/onedal/basic_statistics/tests/test_basic_statistics.py index 95492b4ad3..ff373bb92c 100644 --- a/onedal/basic_statistics/tests/test_basic_statistics.py +++ b/onedal/basic_statistics/tests/test_basic_statistics.py @@ -14,6 +14,8 @@ # limitations under the License. 
# ============================================================================== +from scipy import sparse as sp + from daal4py.sklearn._utils import daal_check_version if daal_check_version((2023, "P", 100)): @@ -32,6 +34,14 @@ ("standard_deviation", np.std, (3e-5, 3e-5)), ] + options_and_tests_csr = [ + ("sum", "sum", (5e-6, 1e-9)), + ("min", "min", (0, 0)), + # There is a bug in oneDAL's max computations on GPU + # ("max", "max", (0, 0)), + ("mean", "mean", (5e-6, 1e-9)), + ] + @pytest.mark.parametrize("queue", get_queues()) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_basic_uniform(queue, dtype): @@ -98,3 +108,61 @@ def test_option_weighted(queue, option, dtype): tol = fp32tol if res.dtype == np.float32 else fp64tol assert_allclose(gtr, res, rtol=tol) + + @pytest.mark.skipif(not hasattr(sp, "random_array"), reason="requires scipy>=1.12.0") + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_basic_csr(queue, dtype): + seed = 42 + s_count, f_count = 5000, 3008 + + gen = np.random.default_rng(seed) + + data = sp.random_array( + shape=(s_count, f_count), + density=0.01, + format="csr", + dtype=dtype, + random_state=gen, + ) + + alg = BasicStatistics(result_options="mean") + res = alg.compute(data, queue=queue) + + res_mean = res["mean"] + gtr_mean = data.mean(axis=0) + tol = 5e-6 if res_mean.dtype == np.float32 else 1e-9 + assert_allclose(gtr_mean, res_mean, rtol=tol) + + @pytest.mark.skipif(not hasattr(sp, "random_array"), reason="requires scipy>=1.12.0") + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("option", options_and_tests_csr) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_options_csr(queue, option, dtype): + seed = 42 + s_count, f_count = 20046, 4007 + + gen = np.random.default_rng(seed) + + data = sp.random_array( + shape=(s_count, f_count), + density=0.002, + format="csr", + dtype=dtype, + random_state=gen, + ) + + result_option, function, tols = option + fp32tol, fp64tol = tols + + alg = BasicStatistics(result_options=result_option) + res = alg.compute(data, queue=queue) + + res = res[result_option] + func = getattr(data, function) + gtr = func(axis=0) + if type(gtr).__name__ != "ndarray": + gtr = gtr.toarray().flatten() + tol = fp32tol if res.dtype == np.float32 else fp64tol + + assert_allclose(gtr, res, rtol=tol) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index c6d51e9b11..812d4eddfa 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -87,9 +87,11 @@ def _tolerance(self, rtol, X_table, policy, dtype=np.float32): if rtol == 0.0: return rtol # TODO: Support CSR in Basic Statistics + is_sparse = False dummy = to_table(None) bs = self._get_basic_statistics_backend("variance") - res = bs.compute_raw(X_table, dummy, policy, dtype) + + res = bs.compute_raw(X_table, dummy, policy, dtype, is_sparse) mean_var = from_table(res["variance"]).mean() return mean_var * rtol diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 0f81cda78f..d1dedba81c 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -21,6 +21,8 @@ from daal4py.sklearn._utils import make2d from onedal import _backend, _is_dpc_backend +from ..utils import _is_csr + try: import dpctl import dpctl.tensor as dpt @@ -44,7 +46,9 @@ def convert_one_to_table(arg): if dpctl_available: if isinstance(arg, dpt.usm_ndarray): return _backend.dpctl_to_table(arg) - 
arg = make2d(arg) + + if not _is_csr(arg): + arg = make2d(arg) return _backend.to_table(arg) diff --git a/onedal/utils/__init__.py b/onedal/utils/__init__.py index dbb1f57997..0a1b05fbc2 100644 --- a/onedal/utils/__init__.py +++ b/onedal/utils/__init__.py @@ -22,6 +22,7 @@ _column_or_1d, _is_arraylike, _is_arraylike_not_scalar, + _is_csr, _is_integral_float, _is_multilabel, _num_features, @@ -44,4 +45,5 @@ "_num_samples", "_is_arraylike", "_is_arraylike_not_scalar", + "_is_csr", ] diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index bc9f07db4e..251d12bce3 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -419,3 +419,10 @@ def _num_samples(x): return len(x) except TypeError as type_error: raise TypeError(message) from type_error + + +def _is_csr(x): + """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" + return isinstance(x, sp.csr_matrix) or ( + hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) + ) From 772f678e53c501059cf3c83c403ff05c4e2983f7 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 12 Jun 2024 16:55:22 +0200 Subject: [PATCH 22/75] [testing] Re-enable sklearn test_common::test_pandas_column_name_consistency test (#1848) * Update deselected_tests.yaml * Update _forest.py * Update dbscan.py * Update dbscan.py * Update incremental_linear.py * formatting * Update _ridge.py * formatting * Update _ridge.py * formatting * validate data fix * Update _ridge.py * Update incremental_linear.py * Update incremental_linear.py * reformatting * Update _forest.py * Update _forest.py * formatting * Update _ridge.py * Update _forest.py * Update _ridge.py * Update _forest.py * Update forest.py * Update deselected_tests.yaml * formatting * oob_score patch * Update deselected_tests.yaml * lasso and elastic net fixes * reactivate tests for observation * fix in _fit * Update deselected_tests.yaml * Update incremental_linear.py * formatting * Update _forest.py * formatting * Update incremental_linear.py --- .../linear_model/_coordinate_descent.py | 30 ++++---- daal4py/sklearn/linear_model/_ridge.py | 58 ++++++++++------ deselected_tests.yaml | 11 +-- onedal/ensemble/forest.py | 4 +- sklearnex/cluster/dbscan.py | 3 + sklearnex/ensemble/_forest.py | 68 +++++++++++++------ sklearnex/linear_model/incremental_linear.py | 64 +++++++++-------- 7 files changed, 146 insertions(+), 92 deletions(-) diff --git a/daal4py/sklearn/linear_model/_coordinate_descent.py b/daal4py/sklearn/linear_model/_coordinate_descent.py index a35baade57..93f5472db5 100755 --- a/daal4py/sklearn/linear_model/_coordinate_descent.py +++ b/daal4py/sklearn/linear_model/_coordinate_descent.py @@ -435,9 +435,9 @@ def _daal4py_predict_lasso(self, X): return res -def _fit(self, X, y, sample_weight=None, check_input=True): +def _fit(self, _X, _y, sample_weight=None, check_input=True): if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=True) + self._check_feature_names(_X, reset=True) if sklearn_check_version("1.2"): self._validate_params() elif sklearn_check_version("1.1"): @@ -474,15 +474,17 @@ def _fit(self, X, y, sample_weight=None, check_input=True): # check X and y if check_input: X, y = check_X_y( - X, - y, + _X, + _y, copy=False, accept_sparse="csc", dtype=[np.float64, np.float32], multi_output=True, y_numeric=True, ) - y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False) + y = check_array(_y, copy=False, dtype=X.dtype.type, ensure_2d=False) + else: + X, y = _X, _y if not sp.issparse(X): self.fit_shape_good_for_daal_ = ( @@ 
-556,7 +558,7 @@ def _fit(self, X, y, sample_weight=None, check_input=True): del self.daal_model_ logging.info(_function_name + ": " + get_patch_message("sklearn_after_daal")) res_new = super(class_inst, self).fit( - X, y, sample_weight=sample_weight, check_input=check_input + _X, _y, sample_weight=sample_weight, check_input=check_input ) self._gap = res_new.dual_gap_ return res_new @@ -695,11 +697,11 @@ def predict(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - X = check_array( + _X = check_array( X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float64, np.float32] ) good_shape_for_daal = ( - True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False + True if _X.ndim <= 1 else True if _X.shape[0] >= _X.shape[1] else False ) _patching_status = PatchingConditionsChain( @@ -708,7 +710,7 @@ def predict(self, X): _dal_ready = _patching_status.and_conditions( [ (hasattr(self, "daal_model_"), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (not sp.issparse(_X), "X is sparse. Sparse input is not supported."), ( good_shape_for_daal, "The shape of X does not satisfy oneDAL requirements: " @@ -720,7 +722,7 @@ def predict(self, X): if not _dal_ready: return self._decision_function(X) - return _daal4py_predict_enet(self, X) + return _daal4py_predict_enet(self, _X) @property def dual_gap_(self): @@ -814,18 +816,18 @@ def fit(self, X, y, sample_weight=None, check_input=True): def predict(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - X = check_array( + _X = check_array( X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float64, np.float32] ) good_shape_for_daal = ( - True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False + True if _X.ndim <= 1 else True if _X.shape[0] >= _X.shape[1] else False ) _patching_status = PatchingConditionsChain("sklearn.linear_model.Lasso.predict") _dal_ready = _patching_status.and_conditions( [ (hasattr(self, "daal_model_"), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (not sp.issparse(_X), "X is sparse. 
Sparse input is not supported."), ( good_shape_for_daal, "The shape of X does not satisfy oneDAL requirements: " @@ -837,7 +839,7 @@ def predict(self, X): if not _dal_ready: return self._decision_function(X) - return _daal4py_predict_lasso(self, X) + return _daal4py_predict_lasso(self, _X) @property def dual_gap_(self): diff --git a/daal4py/sklearn/linear_model/_ridge.py b/daal4py/sklearn/linear_model/_ridge.py index 7a49938013..7718d91605 100644 --- a/daal4py/sklearn/linear_model/_ridge.py +++ b/daal4py/sklearn/linear_model/_ridge.py @@ -101,13 +101,11 @@ def _daal4py_predict(self, X): return res -def _fit_ridge(self, X, y, sample_weight=None): +def _fit_ridge(self, _X, _y, sample_weight=None): if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): self._normalize = _deprecate_normalize( self.normalize, default=False, estimator_name=self.__class__.__name__ ) - if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() elif sklearn_check_version("1.1"): @@ -125,15 +123,27 @@ def _fit_ridge(self, X, y, sample_weight=None): include_boundaries="left", ) - X, y = check_X_y( - X, - y, - ["csr", "csc", "coo"], - dtype=[np.float64, np.float32], - multi_output=True, - y_numeric=True, - ) - self.n_features_in_ = X.shape[1] + if sklearn_check_version("1.0"): + X, y = self._validate_data( + _X, + _y, + accept_sparse=["csr", "csc", "coo"], + dtype=[np.float64, np.float32], + multi_output=True, + y_numeric=True, + ensure_2d=True, + ) + else: + X, y = check_X_y( + _X, + _y, + ["csr", "csc", "coo"], + dtype=[np.float64, np.float32], + multi_output=True, + y_numeric=True, + ) + self.n_features_in_ = X.shape[1] + self.sample_weight_ = sample_weight self.fit_shape_good_for_daal_ = True if X.shape[0] >= X.shape[1] else False @@ -168,7 +178,7 @@ def _fit_ridge(self, X, y, sample_weight=None): if not _dal_ready: if hasattr(self, "daal_model_"): del self.daal_model_ - return super(Ridge, self).fit(X, y, sample_weight=sample_weight) + return super(Ridge, self).fit(_X, _y, sample_weight=sample_weight) self.n_iter_ = None res = _daal4py_fit(self, X, y) if res is None: @@ -177,17 +187,23 @@ def _fit_ridge(self, X, y, sample_weight=None): ) if hasattr(self, "daal_model_"): del self.daal_model_ - return super(Ridge, self).fit(X, y, sample_weight=sample_weight) + return super(Ridge, self).fit(_X, _y, sample_weight=sample_weight) return res -def _predict_ridge(self, X): +def _predict_ridge(self, _X): if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=False) - - X = check_array( - X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float64, np.float32] - ) + X = self._validate_data( + _X, + accept_sparse=["csr", "csc", "coo"], + dtype=[np.float64, np.float32], + reset=False, + ensure_2d=True, + ) + else: + X = check_array( + _X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float64, np.float32] + ) good_shape_for_daal = ( True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False ) @@ -221,7 +237,7 @@ def _predict_ridge(self, X): _patching_status.write_log() if not _dal_ready: - return self._decision_function(X) + return self._decision_function(_X) return _daal4py_predict(self, X) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index cd153b1f71..2efc50f9ef 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -102,11 +102,6 @@ deselected_tests: - utils/tests/test_validation.py::test_check_array_links_to_imputer_doc_only_for_X[asarray-X] - 
utils/tests/test_validation.py::test_check_array_links_to_imputer_doc_only_for_X[csr_matrix-X] - # Extra warning from scikit-learn-intelex fails scikit-learn warning check - # TODO: investigate - is this warning expected? - - linear_model/tests/test_coordinate_descent.py::test_assure_warning_when_normalize[True-1-ElasticNet] >=1.1,<1.2 - - linear_model/tests/test_coordinate_descent.py::test_assure_warning_when_normalize[True-1-Lasso] >=1.1,<1.2 - # TODO: investigate copy failure of read-only buffer - linear_model/tests/test_coordinate_descent.py::test_read_only_buffer @@ -325,9 +320,6 @@ deselected_tests: # test checks only the exact number of options that are used - tests/test_config.py::test_config_context - # HalvingGridSearchCV with Ridge and PCA didn't have feature_name_in. Need to fix. - - tests/test_common.py::test_pandas_column_name_consistency >=1.0.1 - # Some scikit-learn-intelex docstrings differ from scikit-learn. - tests/test_docstrings.py >=1.0.2 @@ -653,6 +645,9 @@ gpu: # KMeans based (unsupported for GPU) - cluster/tests/test_k_means.py + - tests/test_common.py::test_pandas_column_name_consistency[KMeans()] + - tests/test_common.py::test_pandas_column_name_consistency[GaussianMixture()] + - tests/test_common.py::test_pandas_column_name_consistency[BayesianGaussianMixture()] - tests/test_common.py::test_estimators[KMeans() - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_fit_check_is_fitted] - tests/test_common.py::test_estimators[GaussianMixture()-check_fit_check_is_fitted] diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index a80d7fad0c..08c689a203 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -315,7 +315,7 @@ def _fit(self, X, y, sample_weight, module, queue): if self.oob_score: if isinstance(self, ClassifierMixin): - self.oob_score_ = from_table(train_result.oob_err_accuracy)[0, 0] + self.oob_score_ = from_table(train_result.oob_err_accuracy).item() self.oob_decision_function_ = from_table( train_result.oob_err_decision_function ) @@ -327,7 +327,7 @@ def _fit(self, X, y, sample_weight, module, queue): UserWarning, ) else: - self.oob_score_ = from_table(train_result.oob_err_r2)[0, 0] + self.oob_score_ = from_table(train_result.oob_err_r2).item() self.oob_prediction_ = from_table( train_result.oob_err_prediction ).reshape(-1) diff --git a/sklearnex/cluster/dbscan.py b/sklearnex/cluster/dbscan.py index 0936a4647a..db67f7cf6b 100755 --- a/sklearnex/cluster/dbscan.py +++ b/sklearnex/cluster/dbscan.py @@ -85,6 +85,9 @@ def __init__( self.n_jobs = n_jobs def _onedal_fit(self, X, y, sample_weight=None, queue=None): + if sklearn_check_version("1.0"): + X = self._validate_data(X, force_all_finite=False) + onedal_params = { "eps": self.eps, "min_samples": self.min_samples, diff --git a/sklearnex/ensemble/_forest.py b/sklearnex/ensemble/_forest.py index 447a233dcd..c5e4369715 100644 --- a/sklearnex/ensemble/_forest.py +++ b/sklearnex/ensemble/_forest.py @@ -38,7 +38,7 @@ ) from sklearn.tree._tree import Tree from sklearn.utils import check_random_state, deprecated -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_array, check_is_fitted, check_X_y from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import ( @@ -74,6 +74,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): accept_sparse=False, dtype=[np.float64, np.float32], force_all_finite=False, + ensure_2d=True, ) if sample_weight is not None: @@ -97,8 
+98,6 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): y, expanded_class_weight = self._validate_y_class_weight(y) - self.n_features_in_ = X.shape[1] - if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight @@ -559,7 +558,7 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): ) if patching_status.get_status(): - X, y = self._validate_data( + X, y = check_X_y( X, y, multi_output=True, @@ -779,6 +778,10 @@ def _onedal_gpu_supported(self, method_name, *data): or self.estimator.__class__ == DecisionTreeClassifier, "ExtraTrees only supported starting from oneDAL version 2023.1", ), + ( + not self.oob_score, + "oob_scores using r2 or accuracy not implemented.", + ), (sample_weight is None, "sample_weight is not supported."), ] ) @@ -821,24 +824,43 @@ def _onedal_predict(self, X, queue=None): check_is_fitted(self, "_onedal_estimator") if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=False) - - X = check_array( - X, - dtype=[np.float64, np.float32], - force_all_finite=False, - ) # Warning, order of dtype matters + X = self._validate_data( + X, + dtype=[np.float64, np.float32], + force_all_finite=False, + reset=False, + ensure_2d=True, + ) + else: + X = check_array( + X, + dtype=[np.float64, np.float32], + force_all_finite=False, + ) # Warning, order of dtype matters + self._check_n_features(X, reset=False) res = self._onedal_estimator.predict(X, queue=queue) return np.take(self.classes_, res.ravel().astype(np.int64, casting="unsafe")) def _onedal_predict_proba(self, X, queue=None): - X = check_array(X, dtype=[np.float64, np.float32], force_all_finite=False) check_is_fitted(self, "_onedal_estimator") - self._check_n_features(X, reset=False) if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=False) + X = self._validate_data( + X, + dtype=[np.float64, np.float32], + force_all_finite=False, + reset=False, + ensure_2d=True, + ) + else: + X = check_array( + X, + dtype=[np.float64, np.float32], + force_all_finite=False, + ) # Warning, order of dtype matters + self._check_n_features(X, reset=False) + return self._onedal_estimator.predict_proba(X, queue=queue) def _onedal_score(self, X, y, sample_weight=None, queue=None): @@ -955,7 +977,7 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): ) if patching_status.get_status(): - X, y = self._validate_data( + X, y = check_X_y( X, y, multi_output=True, @@ -1087,6 +1109,7 @@ def _onedal_gpu_supported(self, method_name, *data): or self.estimator.__class__ == DecisionTreeClassifier, "ExtraTrees only supported starting from oneDAL version 2023.1", ), + (not self.oob_score, "oob_score value is not sklearn conformant."), (sample_weight is None, "sample_weight is not supported."), ] ) @@ -1124,13 +1147,20 @@ def _onedal_gpu_supported(self, method_name, *data): return patching_status def _onedal_predict(self, X, queue=None): - X = check_array( - X, dtype=[np.float64, np.float32], force_all_finite=False - ) # Warning, order of dtype matters check_is_fitted(self, "_onedal_estimator") if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=False) + X = self._validate_data( + X, + dtype=[np.float64, np.float32], + force_all_finite=False, + reset=False, + ensure_2d=True, + ) # Warning, order of dtype matters + else: + X = check_array( + X, dtype=[np.float64, np.float32], force_all_finite=False + ) # Warning, order of dtype matters return self._onedal_estimator.predict(X, queue=queue) diff --git 
a/sklearnex/linear_model/incremental_linear.py b/sklearnex/linear_model/incremental_linear.py index fdcc418f3f..85f790821d 100644 --- a/sklearnex/linear_model/incremental_linear.py +++ b/sklearnex/linear_model/incremental_linear.py @@ -135,6 +135,7 @@ def _onedal_predict(self, X, queue=None): X, dtype=[np.float64, np.float32], copy=self.copy_X, + reset=False, ) else: X = check_array( @@ -153,36 +154,37 @@ def _onedal_score(self, X, y, sample_weight=None, queue=None): y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) - def _onedal_partial_fit(self, X, y, queue=None): + def _onedal_partial_fit(self, X, y, check_input=True, queue=None): first_pass = not hasattr(self, "n_samples_seen_") or self.n_samples_seen_ == 0 if sklearn_check_version("1.2"): self._validate_params() - if sklearn_check_version("1.0"): - X, y = self._validate_data( - X, - y, - dtype=[np.float64, np.float32], - reset=first_pass, - copy=self.copy_X, - multi_output=True, - force_all_finite=False, - ) - else: - X = check_array( - X, - dtype=[np.float64, np.float32], - copy=self.copy_X, - force_all_finite=False, - ) - y = check_array( - y, - dtype=[np.float64, np.float32], - copy=False, - ensure_2d=False, - force_all_finite=False, - ) + if check_input: + if sklearn_check_version("1.0"): + X, y = self._validate_data( + X, + y, + dtype=[np.float64, np.float32], + reset=first_pass, + copy=self.copy_X, + multi_output=True, + force_all_finite=False, + ) + else: + X = check_array( + X, + dtype=[np.float64, np.float32], + copy=self.copy_X, + force_all_finite=False, + ) + y = check_array( + y, + dtype=[np.float64, np.float32], + copy=False, + ensure_2d=False, + force_all_finite=False, + ) if first_pass: self.n_samples_seen_ = X.shape[0] @@ -211,7 +213,12 @@ def _onedal_fit(self, X, y, queue=None): if sklearn_check_version("1.0"): X, y = self._validate_data( - X, y, dtype=[np.float64, np.float32], copy=self.copy_X, multi_output=True + X, + y, + dtype=[np.float64, np.float32], + copy=self.copy_X, + multi_output=True, + ensure_2d=True, ) else: X = check_array( @@ -243,7 +250,7 @@ def _onedal_fit(self, X, y, queue=None): for batch in gen_batches(n_samples, self.batch_size_): X_batch, y_batch = X[batch], y[batch] - self._onedal_partial_fit(X_batch, y_batch, queue=queue) + self._onedal_partial_fit(X_batch, y_batch, check_input=False, queue=queue) if sklearn_check_version("1.2"): self._validate_params() @@ -297,7 +304,7 @@ def set_coef_(self, value): coef_ = property(get_coef_, set_coef_) intercept_ = property(get_intercept_, set_intercept_) - def partial_fit(self, X, y): + def partial_fit(self, X, y, check_input=True): """ Incremental fit linear model with X and y. All of X and y is processed as a single batch. @@ -327,6 +334,7 @@ def partial_fit(self, X, y): }, X, y, + check_input=check_input, ) return self From 7c77d8715c7317b214bb75710192e8d3d681c228 Mon Sep 17 00:00:00 2001 From: Victoriya Fedotova Date: Thu, 13 Jun 2024 14:13:58 +0200 Subject: [PATCH 23/75] CI: deselect failing sparse pandas test (#1862) Following test is deselected: utils/tests/test_validation.py::test_check_sparse_pandas_sp_format Because it fails in both sklearn (1.2 and below) and sklearnex. 
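Note: the snippet below is an illustrative, hypothetical repro of the scenario that test exercises (validating a pandas DataFrame with SparseDtype columns); it is not the test's actual code and is not part of this patch.

import numpy as np
import pandas as pd
from sklearn.utils import check_array

# DataFrame whose columns use the pandas sparse extension dtype.
df = pd.DataFrame(np.eye(3)).astype(pd.SparseDtype("float64", 0))

# check_array should hand back a scipy sparse matrix for sparse-dtype input;
# the deselected test asserts behavior around this conversion path.
out = check_array(df, accept_sparse=("csr", "csc"))
print(type(out))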
--- deselected_tests.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 2efc50f9ef..7b419cee38 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -259,7 +259,7 @@ deselected_tests: - preprocessing/tests/test_discretization.py::test_nonuniform_strategies[kmeans-expected_2bins1-expected_3bins1-expected_5bins1] >=0.24 # OOB scores in scikit-learn and oneDAL are different because of different random number generators - - ensemble/tests/test_forest.py::test_forest_classifier_oob[X1-y1-0.65-array-ExtraTreesClassifier] + - ensemble/tests/test_forest.py::test_forest_classifier_oob[X1-y1-0.65-array-ExtraTreesClassifier] - ensemble/tests/test_forest.py::test_forest_classifier_oob[True-X1-y1-0.65-array-ExtraTreesClassifier] >=1.3 - ensemble/tests/test_forest.py::test_forest_regressor_oob[True-X0-y0-0.7-array-ExtraTreesRegressor] >=1.3 - ensemble/tests/test_forest.py::test_forest_regressor_oob[X0-y0-0.7-array-RandomForestRegressor] >=1.2 darwin @@ -337,6 +337,7 @@ deselected_tests: - ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py::test_same_predictions_multiclass_classification >=0.24,<1.0 - ensemble/tests/test_gradient_boosting.py::test_gradient_boosting_with_init_pipeline >=0.24,<1.0 - utils/tests/test_validation.py::test_check_array_pandas_dtype_casting >=1.0,<1.2 + - utils/tests/test_validation.py::test_check_sparse_pandas_sp_format <1.2 # Failure due to non-uniformity in the MT2203 engine causing # bad Random Forest fits for small datasets with large n_estimators @@ -752,7 +753,7 @@ gpu: - manifold/tests/test_t_sne.py::test_gradient_bh_multithread_match_sequential - neighbors/tests/test_kde.py::test_kernel_density_sampling - tests/test_common.py::test_check_n_features_in_after_fitting[NearestNeighbors()] - - tests/test_common.py::test_estimators[NearestNeighbors()] + - tests/test_common.py::test_estimators[NearestNeighbors()] - model_selection/tests/test_search.py::test_search_cv_score_samples_method[search_cv0] - model_selection/tests/test_search.py::test_search_cv_score_samples_method[search_cv1] - manifold/tests/test_t_sne.py::test_barnes_hut_angle @@ -1104,7 +1105,7 @@ gpu: - neighbors/tests/test_neighbors.py::test_auto_algorithm - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend - svm/tests/test_sparse.py::test_consistent_proba - - svm/tests/test_svm.py::test_consistent_proba + - svm/tests/test_svm.py::test_consistent_proba - svm/tests/test_svm.py::test_libsvm_parameters - svm/tests/test_svm.py::test_negative_weight_equal_coeffs - svm/tests/test_svm.py::test_unicode_kernel From 10fa7639d5ea65e7be54fbafd4cebf8525458d06 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Thu, 13 Jun 2024 08:25:40 -0700 Subject: [PATCH 24/75] Update dependency cmake to v3.29.5.1 (#1866) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- dependencies-dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies-dev b/dependencies-dev index 9156827cc9..6dcd97bb32 100644 --- a/dependencies-dev +++ b/dependencies-dev @@ -3,4 +3,4 @@ Jinja2==3.1.4 numpy==1.19.5 ; python_version < '3.9' numpy==2.0.0rc2 ; python_version >= '3.9' pybind11==2.12.0 -cmake==3.29.5 +cmake==3.29.5.1 From 315f5cda21b7920e5174e4f144ae48969044b22d Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 14:41:04 +0100 Subject: [PATCH 25/75] Update 
dependency lightgbm to v4.4.0 (#1868) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 247ab2be08..4e28f6c887 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -10,7 +10,7 @@ pandas==2.0.3 ; python_version == '3.8' pandas==2.1.3 ; python_version >= '3.9' and python_version < '3.11' pandas==2.2.2 ; python_version >= '3.11' xgboost==2.0.3 -lightgbm==4.3.0 +lightgbm==4.4.0 catboost==1.2.5 ; python_version < '3.12' # TODO: Remove 3.12 condition when catboost supports numpy 2.0 shap==0.44.1 ; python_version == '3.8' shap==0.45.1 ; python_version >= '3.9' and python_version < '3.12' # TODO: Remove 3.12 condition when shap/numba support numpy 2.0 From e1478c3208333a6c4d0fd89d43122ce5d27c9f4c Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 17 Jun 2024 19:38:48 +0200 Subject: [PATCH 26/75] [bug] fix windows build failure for Azure Pipelines (#1871) * Update build-and-test-win.yml * Update dispatch_utils.hpp * Update dispatch_utils.hpp --- .ci/pipeline/build-and-test-win.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.ci/pipeline/build-and-test-win.yml b/.ci/pipeline/build-and-test-win.yml index 07dd3a5d6f..0d1ae6fd89 100644 --- a/.ci/pipeline/build-and-test-win.yml +++ b/.ci/pipeline/build-and-test-win.yml @@ -36,6 +36,7 @@ steps: set PREFIX=%CONDA_PREFIX% set PYTHON=python call conda-recipe\bld.bat + IF %ERRORLEVEL% neq 0 EXIT /b %ERRORLEVEL% set DALROOT=%CONDA_PREFIX% python setup_sklearnex.py install --single-version-externally-managed --record=record_sklearnex.txt displayName: 'Build daal4py/sklearnex' From e931d5011ab7487e767055ca28a961f78333176a Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 15:38:29 -0700 Subject: [PATCH 27/75] Update dependency numpy to v2.0.0 (#1870) * Update dependency numpy to v2.0.0 * Update dependencies-dev * update reqs.txt * Change catboost installation condition * Update requirements-test.txt --------- Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Co-authored-by: ethanglaser Co-authored-by: Alexander Andreev --- dependencies-dev | 2 +- requirements-test.txt | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dependencies-dev b/dependencies-dev index 6dcd97bb32..fdf51e12b3 100644 --- a/dependencies-dev +++ b/dependencies-dev @@ -1,6 +1,6 @@ Cython==3.0.10 Jinja2==3.1.4 numpy==1.19.5 ; python_version < '3.9' -numpy==2.0.0rc2 ; python_version >= '3.9' +numpy==2.0.0 ; python_version >= '3.9' pybind11==2.12.0 cmake==3.29.5.1 diff --git a/requirements-test.txt b/requirements-test.txt index 4e28f6c887..1d8b98ce9d 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -3,7 +3,7 @@ pytest==8.2.2 ; python_version >= '3.10' numpy>=1.19.5 ; python_version <= '3.9' numpy>=1.21.6 ; python_version == '3.10' numpy>=1.23.5 ; python_version == '3.11' -numpy>=2.0.0rc2 ; python_version >= '3.12' +numpy>=2.0.0 ; python_version >= '3.12' scikit-learn==1.2.2 ; python_version == '3.8' scikit-learn==1.5.0 ; python_version >= '3.9' pandas==2.0.3 ; python_version == '3.8' @@ -11,7 +11,7 @@ pandas==2.1.3 ; python_version >= '3.9' and python_version < '3.11' pandas==2.2.2 ; python_version >= '3.11' xgboost==2.0.3 lightgbm==4.4.0 -catboost==1.2.5 ; python_version < '3.12' # TODO: Remove 
3.12 condition when catboost supports numpy 2.0 +catboost==1.2.5 ; python_version < '3.11' # TODO: Remove 3.11 condition when catboost supports numpy 2.0 shap==0.44.1 ; python_version == '3.8' -shap==0.45.1 ; python_version >= '3.9' and python_version < '3.12' # TODO: Remove 3.12 condition when shap/numba support numpy 2.0 +shap==0.45.1 ; python_version >= '3.9' and python_version < '3.11' # TODO: Remove 3.12 condition when shap/numba support numpy 2.0 array-api-strict==1.1.1 ; python_version >= '3.9' From 60f75502a4801dd7e26da4e713974acaf71f4515 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:25:12 -0700 Subject: [PATCH 28/75] Update dependency urllib3 to v2.2.2 [SECURITY] (#1872) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- requirements-doc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-doc.txt b/requirements-doc.txt index d818c1f1ff..c0af1862e7 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -68,7 +68,7 @@ testpath==0.6.0 tornado==6.4.1 traitlets==5.14.1 typing-extensions==4.9.0 -urllib3==2.2.0 +urllib3==2.2.2 wcwidth==0.2.13 webencodings==0.5.1 zipp==3.17.0 From c29af8ececa58c2dcc856d66f16c701cd8b91af6 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 19 Jun 2024 13:36:46 +0200 Subject: [PATCH 29/75] [enhancement] add ClusterMixin and TransformerMixin to `onedal/` (#1837) * Update _mixin.py * Update dbscan.py * Update kmeans.py * Update kmeans.py * formatting --- onedal/cluster/dbscan.py | 6 +++--- onedal/cluster/kmeans.py | 6 +++--- onedal/common/_mixin.py | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/onedal/cluster/dbscan.py b/onedal/cluster/dbscan.py index 7925b932c9..f91325b65c 100644 --- a/onedal/cluster/dbscan.py +++ b/onedal/cluster/dbscan.py @@ -15,13 +15,13 @@ # =============================================================================== import numpy as np -from sklearn.base import ClusterMixin -from sklearn.utils import check_array from daal4py.sklearn._utils import get_dtype, make2d from ..common._base import BaseEstimator +from ..common._mixin import ClusterMixin from ..datatypes import _convert_to_supported, from_table, to_table +from ..utils import _check_array class BaseDBSCAN(BaseEstimator, ClusterMixin): @@ -58,7 +58,7 @@ def _get_onedal_params(self, dtype=np.float32): def _fit(self, X, y, sample_weight, module, queue): policy = self._get_policy(queue, X) - X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) sample_weight = make2d(sample_weight) if sample_weight is not None else None X = make2d(X) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 812d4eddfa..8def0d2234 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -29,15 +29,15 @@ else: from sklearn.cluster import _kmeans_plusplus -from sklearn.base import ClusterMixin, TransformerMixin from sklearn.exceptions import ConvergenceWarning from sklearn.metrics.pairwise import euclidean_distances -from sklearn.utils import check_array, check_random_state +from sklearn.utils import check_random_state from sklearn.utils.validation import check_is_fitted from onedal.basic_statistics import BasicStatistics from ..common._base import BaseEstimator as onedal_BaseEstimator +from ..common._mixin import ClusterMixin, TransformerMixin from ..utils import _check_array, 
_is_arraylike_not_scalar @@ -264,7 +264,7 @@ def is_better_iteration(inertia, labels): init = self.init init_is_array_like = _is_arraylike_not_scalar(init) if init_is_array_like: - init = check_array(init, dtype=dtype, copy=True, order="C") + init = _check_array(init, dtype=dtype, copy=True, order="C") self._validate_center_shape(X, init) use_custom_init = daal_check_version((2023, "P", 200)) and not callable(self.init) diff --git a/onedal/common/_mixin.py b/onedal/common/_mixin.py index 26b7430768..4e4e751571 100644 --- a/onedal/common/_mixin.py +++ b/onedal/common/_mixin.py @@ -15,6 +15,17 @@ # ============================================================================== +class ClusterMixin: + _estimator_type = "clusterer" + + def fit_predict(self, X, y=None, queue=None, **kwargs): + self.fit(X, queue=queue, **kwargs) + return self.labels_ + + def _more_tags(self): + return {"preserves_dtype": []} + + class ClassifierMixin: _estimator_type = "classifier" @@ -39,3 +50,13 @@ def score(self, X, y, sample_weight=None, queue=None): def _more_tags(self): return {"requires_y": True} + + +class TransformerMixin: + _estimator_type = "transformer" + + def fit_transform(self, X, y=None, queue=None, **fit_params): + if y is None: + return self.fit(X, queue=queue, **fit_params).transform(X, queue=queue) + else: + return self.fit(X, y, queue=queue, **fit_params).transform(X, queue=queue) From 97532f4c1486e44f2da1a407df54188bd11c62d4 Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Wed, 19 Jun 2024 23:53:34 +0200 Subject: [PATCH 30/75] MAINT: minor refactoring for LogRegression (#1877) --- sklearnex/linear_model/logistic_regression.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/sklearnex/linear_model/logistic_regression.py b/sklearnex/linear_model/logistic_regression.py index 6e1883f87f..f981282826 100644 --- a/sklearnex/linear_model/logistic_regression.py +++ b/sklearnex/linear_model/logistic_regression.py @@ -21,18 +21,6 @@ from daal4py.sklearn.linear_model.logistic_path import ( LogisticRegression as LogisticRegression_daal4py, ) -from daal4py.sklearn.linear_model.logistic_path import daal4py_fit, daal4py_predict - - -class BaseLogisticRegression(ABC): - def _save_attributes(self): - assert hasattr(self, "_onedal_estimator") - self.classes_ = self._onedal_estimator.classes_ - self.coef_ = self._onedal_estimator.coef_ - self.intercept_ = self._onedal_estimator.intercept_ - self.n_features_in_ = self._onedal_estimator.n_features_in_ - self.n_iter_ = self._onedal_estimator.n_iter_ - if daal_check_version((2024, "P", 1)): import numpy as np @@ -44,6 +32,7 @@ def _save_attributes(self): from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version + from daal4py.sklearn.linear_model.logistic_path import daal4py_fit, daal4py_predict from onedal.linear_model import LogisticRegression as onedal_LogisticRegression from onedal.utils import _num_samples @@ -51,6 +40,15 @@ def _save_attributes(self): from .._utils import PatchingConditionsChain, get_patch_message from ..utils.validation import _assert_all_finite + class BaseLogisticRegression(ABC): + def _save_attributes(self): + assert hasattr(self, "_onedal_estimator") + self.classes_ = self._onedal_estimator.classes_ + self.coef_ = self._onedal_estimator.coef_ + self.intercept_ = self._onedal_estimator.intercept_ + self.n_features_in_ = self._onedal_estimator.n_features_in_ + self.n_iter_ = self._onedal_estimator.n_iter_ + @control_n_jobs( decorated_methods=[ 
"fit", From 15eb80ee3d6fb688c701e18c250fd5bba84a11e6 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Thu, 20 Jun 2024 08:26:18 -0700 Subject: [PATCH 31/75] Update dependency xgboost to v2.1.0 (#1882) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 1d8b98ce9d..64af54f1bf 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -9,7 +9,7 @@ scikit-learn==1.5.0 ; python_version >= '3.9' pandas==2.0.3 ; python_version == '3.8' pandas==2.1.3 ; python_version >= '3.9' and python_version < '3.11' pandas==2.2.2 ; python_version >= '3.11' -xgboost==2.0.3 +xgboost==2.1.0 lightgbm==4.4.0 catboost==1.2.5 ; python_version < '3.11' # TODO: Remove 3.11 condition when catboost supports numpy 2.0 shap==0.44.1 ; python_version == '3.8' From 1b56189f238e5ee3d7976460ca3b65e36bf3be81 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Fri, 21 Jun 2024 14:10:10 -0700 Subject: [PATCH 32/75] Update dependency cmake to v3.29.6 (#1884) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- dependencies-dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies-dev b/dependencies-dev index fdf51e12b3..6bc4aedf04 100644 --- a/dependencies-dev +++ b/dependencies-dev @@ -3,4 +3,4 @@ Jinja2==3.1.4 numpy==1.19.5 ; python_version < '3.9' numpy==2.0.0 ; python_version >= '3.9' pybind11==2.12.0 -cmake==3.29.5.1 +cmake==3.29.6 From d0bb9186abe37bce51fe977b2d722bb51bb63368 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 24 Jun 2024 12:38:02 +0200 Subject: [PATCH 33/75] Update dispatcher.py (#1875) --- sklearnex/dispatcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/dispatcher.py b/sklearnex/dispatcher.py index 52b60076fc..2a4a085ac0 100644 --- a/sklearnex/dispatcher.py +++ b/sklearnex/dispatcher.py @@ -64,7 +64,7 @@ def get_patch_map_core(preview=False): sklearn_obj = mapping["kmeans"][0][1] mapping.pop("kmeans") mapping["kmeans"] = [ - [(cluster_module, "kmeans", KMeans_sklearnex), sklearn_obj] + [(cluster_module, "KMeans", KMeans_sklearnex), sklearn_obj] ] # Covariance From 01df2bb15360481441736882d9847d9adf24c811 Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Mon, 24 Jun 2024 15:24:16 +0200 Subject: [PATCH 34/75] CI: updated DPCPP compiler version to 2024.2 (#1890) --- .ci/pipeline/build-and-test-lnx.yml | 2 +- .ci/scripts/install_dpcpp.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml index 4aa56b4e72..f67a7607a4 100644 --- a/.ci/pipeline/build-and-test-lnx.yml +++ b/.ci/pipeline/build-and-test-lnx.yml @@ -24,7 +24,7 @@ steps: displayName: "System info" - script: | conda update -y -q conda - conda create -q -y -n CB -c conda-forge -c intel python=$(PYTHON_VERSION) intel::dal-devel mpich pyyaml "dpcpp-cpp-rt=2024.1.0" + conda create -q -y -n CB -c conda-forge -c intel python=$(PYTHON_VERSION) intel::dal-devel mpich pyyaml "dpcpp-cpp-rt=2024.2.0" displayName: "Conda create" - script: | . 
/usr/share/miniconda/etc/profile.d/conda.sh diff --git a/.ci/scripts/install_dpcpp.sh b/.ci/scripts/install_dpcpp.sh index 1f45d9770d..0d8f8b690b 100755 --- a/.ci/scripts/install_dpcpp.sh +++ b/.ci/scripts/install_dpcpp.sh @@ -21,5 +21,5 @@ rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo add-apt-repository -y "deb https://apt.repos.intel.com/oneapi all main" sudo apt-get update -sudo apt-get install -y intel-dpcpp-cpp-compiler-2024.1 +sudo apt-get install -y intel-dpcpp-cpp-compiler-2024.2 sudo bash -c 'echo libintelocl.so > /etc/OpenCL/vendors/intel-cpu.icd' From 2ff748adff9a621270bbdad98141e4332c9e1a69 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 25 Jun 2024 06:49:45 +0200 Subject: [PATCH 35/75] [testing] enable get_queue for `/onedal` testing without dpctl support (#1856) * Update _device_selection.py * Update _device_selection.py * Update _dataframes_support.py * Update test_incremental_linear_regression.py * Update test_linear_regression.py * Update test_random_forest.py * Update test_policy.py * Update test_policy.py * Update test_policy.py * formatting --- onedal/common/tests/test_policy.py | 5 +++++ onedal/ensemble/tests/test_random_forest.py | 2 +- .../linear_model/tests/test_incremental_linear_regression.py | 4 ++-- onedal/linear_model/tests/test_linear_regression.py | 4 ++-- onedal/tests/utils/_dataframes_support.py | 5 +++-- onedal/tests/utils/_device_selection.py | 2 +- 6 files changed, 14 insertions(+), 8 deletions(-) diff --git a/onedal/common/tests/test_policy.py b/onedal/common/tests/test_policy.py index 673bd4cb56..80c5d5321d 100644 --- a/onedal/common/tests/test_policy.py +++ b/onedal/common/tests/test_policy.py @@ -47,6 +47,11 @@ def test_with_numpy_data(queue): @pytest.mark.parametrize("queue", get_queues("cpu,gpu")) @pytest.mark.parametrize("memtype", get_memory_usm()) def test_with_usm_ndarray_data(queue, memtype): + if queue is None: + pytest.skip( + "dpctl Memory object with queue=None uses cached default (gpu if available)" + ) + from dpctl.tensor import usm_ndarray device_name = device_type_to_str(queue) diff --git a/onedal/ensemble/tests/test_random_forest.py b/onedal/ensemble/tests/test_random_forest.py index b4371d839a..2659481662 100644 --- a/onedal/ensemble/tests/test_random_forest.py +++ b/onedal/ensemble/tests/test_random_forest.py @@ -48,7 +48,7 @@ def test_rf_regression(queue): # GPU and CPU implementations of Random Forest use RNGs differently. They build # different ensembles of trees, thereby requiring separate check values. 
- if queue.sycl_device.is_gpu: + if queue and queue.sycl_device.is_gpu: if daal_check_version((2024, "P", 0)): assert_allclose([1.82], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2) else: diff --git a/onedal/linear_model/tests/test_incremental_linear_regression.py b/onedal/linear_model/tests/test_incremental_linear_regression.py index 20e3a4b09f..0658a51c6b 100644 --- a/onedal/linear_model/tests/test_incremental_linear_regression.py +++ b/onedal/linear_model/tests/test_incremental_linear_regression.py @@ -88,7 +88,7 @@ def test_full_results(queue, num_blocks, dtype): model.partial_fit(X_split[i], y_split[i], queue=queue) model.finalize_fit() - if queue.sycl_device.is_gpu: + if queue and queue.sycl_device.is_gpu: tol = 5e-3 if model.coef_.dtype == np.float32 else 1e-5 else: tol = 2e-3 if model.coef_.dtype == np.float32 else 1e-5 @@ -129,7 +129,7 @@ def test_no_intercept_results(queue, num_blocks, dtype): model.finalize_fit() # TODO Find out is it necessary to have accuracy so different for float32 and float64 - if queue.sycl_device.is_gpu: + if queue and queue.sycl_device.is_gpu: tol = 3e-3 if model.coef_.dtype == np.float32 else 1e-7 else: tol = 2e-3 if model.coef_.dtype == np.float32 else 1e-7 diff --git a/onedal/linear_model/tests/test_linear_regression.py b/onedal/linear_model/tests/test_linear_regression.py index 0e9dd25845..e2dd7ce6d0 100755 --- a/onedal/linear_model/tests/test_linear_regression.py +++ b/onedal/linear_model/tests/test_linear_regression.py @@ -76,7 +76,7 @@ def test_full_results(queue, dtype): model = LinearRegression(fit_intercept=True) model.fit(X, y, queue=queue) - if queue.sycl_device.is_gpu: + if queue and queue.sycl_device.is_gpu: tol = 5e-3 if model.coef_.dtype == np.float32 else 1e-5 else: tol = 2e-3 if model.coef_.dtype == np.float32 else 1e-5 @@ -110,7 +110,7 @@ def test_no_intercept_results(queue, dtype): model = LinearRegression(fit_intercept=False) model.fit(X, y, queue=queue) - if queue.sycl_device.is_gpu: + if queue and queue.sycl_device.is_gpu: tol = 3e-3 if model.coef_.dtype == np.float32 else 1e-7 else: tol = 2e-3 if model.coef_.dtype == np.float32 else 1e-7 diff --git a/onedal/tests/utils/_dataframes_support.py b/onedal/tests/utils/_dataframes_support.py index cfc40ae021..f6ffca4341 100644 --- a/onedal/tests/utils/_dataframes_support.py +++ b/onedal/tests/utils/_dataframes_support.py @@ -69,8 +69,9 @@ def get_dataframes_and_queues( def get_df_and_q(dataframe: str): df_and_q = [] for queue in get_queues(device_filter_): - id = "{}-{}".format(dataframe, queue.id) - df_and_q.append(pytest.param(dataframe, queue.values[0], id=id)) + if queue: + id = "{}-{}".format(dataframe, queue.id) + df_and_q.append(pytest.param(dataframe, queue.values[0], id=id)) return df_and_q if dpctl_available and "dpctl" in dataframe_filter_: diff --git a/onedal/tests/utils/_device_selection.py b/onedal/tests/utils/_device_selection.py index fbb8cd5214..dcc3236e88 100644 --- a/onedal/tests/utils/_device_selection.py +++ b/onedal/tests/utils/_device_selection.py @@ -20,7 +20,7 @@ def get_queues(filter_="cpu,gpu"): - queues = [] + queues = [None] if "cpu" in filter_ else [] try: import dpctl From 776b9243a44d70cdbe56777cb6ae3f67c61e9ef7 Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Tue, 25 Jun 2024 14:06:29 -0700 Subject: [PATCH 36/75] CI: additional extratrees deselections (#1885) --- deselected_tests.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 
7b419cee38..9976dc9b9a 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -734,6 +734,7 @@ gpu: - tests/test_common.py::test_estimators[ExtraTreesClassifier()-check_classifiers_train(readonly_memmap=True)] - tests/test_common.py::test_estimators[ExtraTreesClassifier()-check_fit_idempotent] - tests/test_common.py::test_estimators[ExtraTreesRegressor()-check_fit_idempotent] + - tests/test_common.py::test_estimators[ExtraTreesRegressor()-check_regressor_data_not_an_array] # GPU implementation of Extra Trees doesn't support sample_weights # comparisons to GPU with sample weights will use different algorithms From 772fb9b80cb0232d9336af5f51f1b4e4c66be1fd Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Wed, 26 Jun 2024 12:17:50 +0200 Subject: [PATCH 37/75] TEST: enable import tests for dataframes testing in `sklearnex.cluster.DBSCAN` (#1886) * TEST: enabled import tests for dataframes testing in `sklearnex.cluster.DBSCAN` --- sklearnex/cluster/tests/test_dbscan.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearnex/cluster/tests/test_dbscan.py b/sklearnex/cluster/tests/test_dbscan.py index 0313082a1b..a83b5b7cec 100755 --- a/sklearnex/cluster/tests/test_dbscan.py +++ b/sklearnex/cluster/tests/test_dbscan.py @@ -18,16 +18,18 @@ import pytest from numpy.testing import assert_allclose +from onedal.tests.utils._dataframes_support import ( + _convert_to_dataframe, + get_dataframes_and_queues, +) -# TODO: -# adding this parameterized testing -# somehow breaks other test with preview module patch: -# sklearnex/tests/test_monkeypatch.py::test_preview_namespace. -# @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) -def test_sklearnex_import_dbscan(): + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import_dbscan(dataframe, queue): from sklearnex.cluster import DBSCAN X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) dbscan = DBSCAN(eps=3, min_samples=2).fit(X) assert "sklearnex" in dbscan.__module__ From e3694002c7f2919254a4559432405a3fab8147ed Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 06:44:11 -0700 Subject: [PATCH 38/75] Update dependency importlib-metadata to v8 (#1895) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- requirements-doc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-doc.txt b/requirements-doc.txt index c0af1862e7..17abb1a46c 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -14,7 +14,7 @@ docutils~=0.18.1 entrypoints==0.4 idna==3.7 imagesize==1.4.1 -importlib-metadata==7.0.1 +importlib-metadata==8.0.0 importlib-resources==6.1.1 ipython==8.13.0 ipython-genutils==0.2.0 From a7558ecacfa3a0011e894e7d1965b704a6e0e9aa Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 06:45:34 -0700 Subject: [PATCH 39/75] Update dependency pybind11 to v2.13.0 (#1897) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- dependencies-dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies-dev b/dependencies-dev index 6bc4aedf04..650c1f5eb1 100644 --- a/dependencies-dev +++ b/dependencies-dev @@ -2,5 +2,5 @@ Cython==3.0.10 Jinja2==3.1.4 numpy==1.19.5 ; python_version < '3.9' numpy==2.0.0 ; python_version >= '3.9' 
-pybind11==2.12.0 +pybind11==2.13.0 cmake==3.29.6 From 0128b2b18e24d505d07caf4ae80059b918ea4202 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Thu, 27 Jun 2024 13:01:41 +0200 Subject: [PATCH 40/75] [testing] sklearn 1.5 conformance update (#1896) * Update deselected_tests.yaml * attempt at fixing GPU input error * Update logistic_regression.py * Update deselected_tests.yaml * Update logistic_regression.py * Update logistic_regression.py * Update logistic_regression.py * Update logistic_regression.py * formatting * Update logistic_regression.py * Update logistic_regression.py --- deselected_tests.yaml | 8 +++ sklearnex/linear_model/logistic_regression.py | 57 ++++++++++--------- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 9976dc9b9a..b1005cfa37 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -380,6 +380,12 @@ deselected_tests: # There are not enough data to run onedal backend - tests/test_common.py::test_estimators[IncrementalLinearRegression()-check_fit2d_1sample] + # Deselection of LogisticRegression tests over accuracy comparisons with sample_weights + # and without. Because scikit-learn-intelex does not support sample_weights, it's doing + # a fallback to scikit-learn in one case and not in the other, and needs to be investigated. + - model_selection/tests/test_classification_threshold.py::test_fit_and_score_over_thresholds_sample_weight >=1.5 + - model_selection/tests/test_classification_threshold.py::test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence >=1.5 + # -------------------------------------------------------- # No need to test daal4py patching reduced_tests: @@ -568,6 +574,7 @@ gpu: - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_estimators_nan_inf] - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_estimators_overwrite_params] - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_estimators_pickle] + - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_estimators_pickle(readonly_memmap=True)] - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_methods_sample_order_invariance] - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_methods_subset_invariance] - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_fit2d_1feature] @@ -587,6 +594,7 @@ gpu: - tests/test_common.py::test_estimators[GaussianMixture()-check_estimators_nan_inf] - tests/test_common.py::test_estimators[GaussianMixture()-check_estimators_overwrite_params] - tests/test_common.py::test_estimators[GaussianMixture()-check_estimators_pickle] + - tests/test_common.py::test_estimators[GaussianMixture()-check_estimators_pickle(readonly_memmap=True)] - tests/test_common.py::test_estimators[GaussianMixture()-check_methods_sample_order_invariance] - tests/test_common.py::test_estimators[GaussianMixture()-check_methods_subset_invariance] - tests/test_common.py::test_estimators[GaussianMixture()-check_fit2d_1feature] diff --git a/sklearnex/linear_model/logistic_regression.py b/sklearnex/linear_model/logistic_regression.py index f981282826..3f7a23bd5e 100644 --- a/sklearnex/linear_model/logistic_regression.py +++ b/sklearnex/linear_model/logistic_regression.py @@ -28,7 +28,7 @@ from sklearn.linear_model import LogisticRegression as sklearn_LogisticRegression from sklearn.metrics import accuracy_score from sklearn.utils.multiclass import type_of_target - from sklearn.utils.validation 
import check_X_y + from sklearn.utils.validation import check_array, check_is_fitted, check_X_y from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version @@ -107,8 +107,6 @@ def __init__( _onedal_cpu_fit = daal4py_fit def fit(self, X, y, sample_weight=None): - if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() dispatch( @@ -126,8 +124,6 @@ def fit(self, X, y, sample_weight=None): @wrap_output_data def predict(self, X): - if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=False) return dispatch( self, "predict", @@ -140,8 +136,6 @@ def predict(self, X): @wrap_output_data def predict_proba(self, X): - if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=False) return dispatch( self, "predict_proba", @@ -154,8 +148,6 @@ def predict_proba(self, X): @wrap_output_data def predict_log_proba(self, X): - if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=False) return dispatch( self, "predict_log_proba", @@ -168,8 +160,6 @@ def predict_log_proba(self, X): @wrap_output_data def score(self, X, y, sample_weight=None): - if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=False) return dispatch( self, "score", @@ -208,6 +198,11 @@ def _onedal_gpu_fit_supported(self, method_name, *data): f"sklearn.linear_model.{class_name}.fit" ) + target_type = ( + type_of_target(y, input_name="y") + if sklearn_check_version("1.1") + else type_of_target(y) + ) dal_ready = patching_status.and_conditions( [ (self.penalty == "l2", "Only l2 penalty is supported."), @@ -226,7 +221,7 @@ def _onedal_gpu_fit_supported(self, method_name, *data): (self.l1_ratio is None, "l1 ratio is not supported."), (sample_weight is None, "Sample weight is not supported."), ( - type_of_target(y) == "binary", + target_type == "binary", "Only binary classification is supported", ), ] @@ -313,24 +308,17 @@ def _initialize_onedal_estimator(self): } self._onedal_estimator = onedal_LogisticRegression(**onedal_params) - def _onedal_fit(self, X, y, sample_weight, queue=None): + def _onedal_fit(self, X, y, sample_weight=None, queue=None): if queue is None or queue.sycl_device.is_cpu: return self._onedal_cpu_fit(X, y, sample_weight) assert sample_weight is None - check_params = { - "X": X, - "y": y, - "dtype": [np.float64, np.float32], - "accept_sparse": False, - "multi_output": False, - "force_all_finite": True, - } - if sklearn_check_version("1.2"): - X, y = self._validate_data(**check_params) + if sklearn_check_version("1.0"): + X, y = self._validate_data(X, y, dtype=[np.float64, np.float32]) else: - X, y = check_X_y(**check_params) + X, y = check_X_y(X, y, dtype=[np.float64, np.float32]) + self._initialize_onedal_estimator() try: self._onedal_estimator.fit(X, y, queue=queue) @@ -348,7 +336,12 @@ def _onedal_predict(self, X, queue=None): if queue is None or queue.sycl_device.is_cpu: return daal4py_predict(self, X, "computeClassLabels") - X = self._validate_data(X, accept_sparse=False, reset=False) + check_is_fitted(self) + if sklearn_check_version("1.0"): + X = self._validate_data(X, reset=False, dtype=[np.float64, np.float32]) + else: + X = check_array(X, dtype=[np.float64, np.float32]) + assert hasattr(self, "_onedal_estimator") return self._onedal_estimator.predict(X, queue=queue) @@ -356,7 +349,12 @@ def _onedal_predict_proba(self, X, queue=None): if queue is None or queue.sycl_device.is_cpu: return daal4py_predict(self, X, 
"computeClassProbabilities") - X = self._validate_data(X, accept_sparse=False, reset=False) + check_is_fitted(self) + if sklearn_check_version("1.0"): + X = self._validate_data(X, reset=False, dtype=[np.float64, np.float32]) + else: + X = check_array(X, dtype=[np.float64, np.float32]) + assert hasattr(self, "_onedal_estimator") return self._onedal_estimator.predict_proba(X, queue=queue) @@ -364,7 +362,12 @@ def _onedal_predict_log_proba(self, X, queue=None): if queue is None or queue.sycl_device.is_cpu: return daal4py_predict(self, X, "computeClassLogProbabilities") - X = self._validate_data(X, accept_sparse=False, reset=False) + check_is_fitted(self) + if sklearn_check_version("1.0"): + X = self._validate_data(X, reset=False, dtype=[np.float64, np.float32]) + else: + X = check_array(X, dtype=[np.float64, np.float32]) + assert hasattr(self, "_onedal_estimator") return self._onedal_estimator.predict_log_proba(X, queue=queue) From a040364c0fb2284ab00e8e9611bc5071e8f26d87 Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Thu, 27 Jun 2024 13:59:20 +0200 Subject: [PATCH 41/75] BUG: fixing circular import in daal4py/sklearnex device_offloading (#1832) * BUG: fixing circular import in daal4py/sklearnex device_offloading * adding _config for onedal4py, just for exposing some sklearnex's config settings into onedal4py level * remove circular import in daal4py/onedal4py/sklearnex: * removing _device_offloading module from daal4py, since after KMeans OOP #1770 and adding ENH: Moving sklearnex Ridge Regression support to oneDAL #1843 there is no need for GPU offloading via daal4py syc_context. most of device_ofloading functionality moved to onedal4py level. * created _config module in onedal4py, for exposing some sklearnex config setting into onedal4py level, reused it on sklearnex level * sklearnex depends on onedal4py _config and _device_oflload modules. 
* Added ElasticNet, Lasso, Ridge into sklearnex patching map --- daal4py/sklearn/_device_offload.py | 100 ---------- daal4py/sklearn/cluster/dbscan.py | 3 - daal4py/sklearn/cluster/k_means.py | 8 - daal4py/sklearn/decomposition/_pca.py | 5 - daal4py/sklearn/ensemble/_forest.py | 7 - .../linear_model/_coordinate_descent.py | 12 -- daal4py/sklearn/linear_model/_linear.py | 3 - daal4py/sklearn/linear_model/_ridge.py | 6 - daal4py/sklearn/linear_model/logistic_path.py | 7 - daal4py/sklearn/manifold/_t_sne.py | 3 - daal4py/sklearn/metrics/_pairwise.py | 2 - daal4py/sklearn/metrics/_ranking.py | 2 - daal4py/sklearn/model_selection/_split.py | 2 - daal4py/sklearn/neighbors/_classification.py | 4 - daal4py/sklearn/neighbors/_regression.py | 3 - daal4py/sklearn/neighbors/_unsupervised.py | 2 - daal4py/sklearn/svm/svm.py | 2 - daal4py/sklearn/tree/decision_tree.py | 4 - onedal/_config.py | 53 ++++++ onedal/_device_offload.py | 167 ++++++++++++++--- onedal/common/_policy.py | 27 +-- sklearnex/_config.py | 18 +- sklearnex/_device_offload.py | 177 +----------------- sklearnex/cluster/dbscan.py | 1 - sklearnex/cluster/k_means.py | 6 + .../covariance/incremental_covariance.py | 1 - sklearnex/dispatcher.py | 84 +++++++++ sklearnex/linear_model/coordinate_descent.py | 9 + sklearnex/linear_model/ridge.py | 5 + sklearnex/manifold/t_sne.py | 4 + sklearnex/metrics/pairwise.py | 5 + sklearnex/metrics/ranking.py | 3 + sklearnex/model_selection/split.py | 3 + sklearnex/tests/test_config.py | 4 + 34 files changed, 332 insertions(+), 410 deletions(-) delete mode 100644 daal4py/sklearn/_device_offload.py create mode 100644 onedal/_config.py diff --git a/daal4py/sklearn/_device_offload.py b/daal4py/sklearn/_device_offload.py deleted file mode 100644 index 80d9595396..0000000000 --- a/daal4py/sklearn/_device_offload.py +++ /dev/null @@ -1,100 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from functools import wraps - -try: - from sklearnex._config import get_config - from sklearnex._device_offload import ( - _copy_to_usm, - _get_global_queue, - _transfer_to_host, - ) - - _sklearnex_available = True -except ImportError: - import logging - - logging.warning( - "Device support is limited in daal4py patching. " - "Use Intel(R) Extension for Scikit-learn* " - "for full experience." 
- ) - _sklearnex_available = False - - -def _get_host_inputs(*args, **kwargs): - q = _get_global_queue() - q, hostargs = _transfer_to_host(q, *args) - q, hostvalues = _transfer_to_host(q, *kwargs.values()) - hostkwargs = dict(zip(kwargs.keys(), hostvalues)) - return q, hostargs, hostkwargs - - -def _extract_usm_iface(*args, **kwargs): - allargs = (*args, *kwargs.values()) - if len(allargs) == 0: - return None - return getattr(allargs[0], "__sycl_usm_array_interface__", None) - - -def _run_on_device(func, queue, obj=None, *args, **kwargs): - def dispatch_by_obj(obj, func, *args, **kwargs): - if obj is not None: - return func(obj, *args, **kwargs) - return func(*args, **kwargs) - - if queue is not None: - from daal4py.oneapi import _get_in_sycl_ctxt, sycl_context - - if _get_in_sycl_ctxt() is False: - host_offload = get_config()["allow_fallback_to_host"] - - with sycl_context( - "gpu" if queue.sycl_device.is_gpu else "cpu", - host_offload_on_fail=host_offload, - ): - return dispatch_by_obj(obj, func, *args, **kwargs) - return dispatch_by_obj(obj, func, *args, **kwargs) - - -def support_usm_ndarray(freefunc=False): - def decorator(func): - def wrapper_impl(obj, *args, **kwargs): - if _sklearnex_available: - usm_iface = _extract_usm_iface(*args, **kwargs) - q, hostargs, hostkwargs = _get_host_inputs(*args, **kwargs) - result = _run_on_device(func, q, obj, *hostargs, **hostkwargs) - if usm_iface is not None and hasattr(result, "__array_interface__"): - return _copy_to_usm(q, result) - return result - return _run_on_device(func, None, obj, *args, **kwargs) - - if freefunc: - - @wraps(func) - def wrapper_free(*args, **kwargs): - return wrapper_impl(None, *args, **kwargs) - - return wrapper_free - - @wraps(func) - def wrapper_with_self(self, *args, **kwargs): - return wrapper_impl(self, *args, **kwargs) - - return wrapper_with_self - - return decorator diff --git a/daal4py/sklearn/cluster/dbscan.py b/daal4py/sklearn/cluster/dbscan.py index ebfb7f7f8a..ac312d7df4 100644 --- a/daal4py/sklearn/cluster/dbscan.py +++ b/daal4py/sklearn/cluster/dbscan.py @@ -24,7 +24,6 @@ import daal4py -from .._device_offload import support_usm_ndarray from .._n_jobs_support import control_n_jobs from .._utils import PatchingConditionsChain, getFPType, make2d, sklearn_check_version @@ -83,7 +82,6 @@ def __init__( self.p = p self.n_jobs = n_jobs - @support_usm_ndarray() def fit(self, X, y=None, sample_weight=None): if sklearn_check_version("1.2"): self._validate_params() @@ -160,7 +158,6 @@ def fit(self, X, y=None, sample_weight=None): return self return super().fit(X, y, sample_weight=sample_weight) - @support_usm_ndarray() def fit_predict(self, X, y=None, sample_weight=None): return super().fit_predict(X, y, sample_weight) diff --git a/daal4py/sklearn/cluster/k_means.py b/daal4py/sklearn/cluster/k_means.py index d95e09b024..103318004e 100755 --- a/daal4py/sklearn/cluster/k_means.py +++ b/daal4py/sklearn/cluster/k_means.py @@ -34,7 +34,6 @@ import daal4py -from .._device_offload import support_usm_ndarray from .._n_jobs_support import control_n_jobs from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version @@ -575,31 +574,24 @@ def __init__( algorithm=algorithm, ) - @support_usm_ndarray() def fit(self, X, y=None, sample_weight=None): return _fit(self, X, y=y, sample_weight=sample_weight) if sklearn_check_version("1.5"): - @support_usm_ndarray() def predict(self, X): return _predict(self, X) else: - @support_usm_ndarray() def predict( self, X, sample_weight="deprecated" if 
sklearn_check_version("1.3") else None ): return _predict(self, X, sample_weight=sample_weight) - @support_usm_ndarray() def fit_predict(self, X, y=None, sample_weight=None): return super().fit_predict(X, y, sample_weight) - score = support_usm_ndarray()(KMeans_original.score) - fit.__doc__ = KMeans_original.fit.__doc__ predict.__doc__ = KMeans_original.predict.__doc__ fit_predict.__doc__ = KMeans_original.fit_predict.__doc__ - score.__doc__ = KMeans_original.score.__doc__ diff --git a/daal4py/sklearn/decomposition/_pca.py b/daal4py/sklearn/decomposition/_pca.py index 0eb4d90b4c..deabba7e5c 100644 --- a/daal4py/sklearn/decomposition/_pca.py +++ b/daal4py/sklearn/decomposition/_pca.py @@ -25,7 +25,6 @@ import daal4py -from .._device_offload import support_usm_ndarray from .._n_jobs_support import control_n_jobs from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version @@ -376,7 +375,6 @@ def _transform_daal4py(self, X, whiten=False, scale_eigenvalues=True, check_X=Tr if sklearn_check_version("1.3"): - @support_usm_ndarray() @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -400,7 +398,6 @@ def fit(self, X, y=None): else: - @support_usm_ndarray() def fit(self, X, y=None): """Fit the model with X. @@ -431,7 +428,6 @@ def fit(self, X, y=None): self._fit(X) return self - @support_usm_ndarray() def transform(self, X): """ Apply dimensionality reduction to X. @@ -466,7 +462,6 @@ def transform(self, X): ) return PCA_original.transform(self, X) - @support_usm_ndarray() def fit_transform(self, X, y=None): """ Fit the model with X and apply the dimensionality reduction on X. diff --git a/daal4py/sklearn/ensemble/_forest.py b/daal4py/sklearn/ensemble/_forest.py index 6e4524a9d3..e3dc4b9d85 100755 --- a/daal4py/sklearn/ensemble/_forest.py +++ b/daal4py/sklearn/ensemble/_forest.py @@ -14,7 +14,6 @@ # limitations under the License. # ============================================================================== -import logging import numbers import warnings from math import ceil @@ -43,7 +42,6 @@ sklearn_check_version, ) -from .._device_offload import support_usm_ndarray from .._n_jobs_support import control_n_jobs from ..utils.validation import _daal_num_features @@ -400,7 +398,6 @@ def __init__( self.minBinSize = minBinSize self.binningStrategy = binningStrategy - @support_usm_ndarray() def fit(self, X, y, sample_weight=None): """ Build a forest of trees from the training set (X, y). @@ -530,7 +527,6 @@ def fit(self, X, y, sample_weight=None): return self return super().fit(X, y, sample_weight=sample_weight) - @support_usm_ndarray() def predict(self, X): """ Predict class for X. @@ -582,7 +578,6 @@ def predict(self, X): ) return self._daal_predict_classifier(X) - @support_usm_ndarray() def predict_proba(self, X): """ Predict class probabilities for X. @@ -1037,7 +1032,6 @@ def __init__( self.minBinSize = minBinSize self.binningStrategy = binningStrategy - @support_usm_ndarray() def fit(self, X, y, sample_weight=None): """ Build a forest of trees from the training set (X, y). @@ -1172,7 +1166,6 @@ def fit(self, X, y, sample_weight=None): return self return super().fit(X, y, sample_weight=sample_weight) - @support_usm_ndarray() def predict(self, X): """ Predict class for X. 
diff --git a/daal4py/sklearn/linear_model/_coordinate_descent.py b/daal4py/sklearn/linear_model/_coordinate_descent.py index 93f5472db5..081d5652d5 100755 --- a/daal4py/sklearn/linear_model/_coordinate_descent.py +++ b/daal4py/sklearn/linear_model/_coordinate_descent.py @@ -46,8 +46,6 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.preprocessing import normalize -from .._device_offload import support_usm_ndarray - def _daal4py_check(self, X, y, check_input): _fptype = getFPType(X) @@ -688,11 +686,9 @@ def __init__( selection=selection, ) - @support_usm_ndarray() def fit(self, X, y, sample_weight=None, check_input=True): return _fit(self, X, y, sample_weight=sample_weight, check_input=check_input) - @support_usm_ndarray() def predict(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -736,11 +732,8 @@ def dual_gap_(self, value): def dual_gap_(self): self._gap = None - score = support_usm_ndarray()(ElasticNet_original.score) - fit.__doc__ = ElasticNet_original.fit.__doc__ predict.__doc__ = ElasticNet_original.predict.__doc__ - score.__doc__ = ElasticNet_original.score.__doc__ @control_n_jobs(decorated_methods=["fit", "predict"]) @@ -808,11 +801,9 @@ def __init__( selection=selection, ) - @support_usm_ndarray() def fit(self, X, y, sample_weight=None, check_input=True): return _fit(self, X, y, sample_weight, check_input) - @support_usm_ndarray() def predict(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -853,8 +844,5 @@ def dual_gap_(self, value): def dual_gap_(self): self._gap = None - score = support_usm_ndarray()(Lasso_original.score) - fit.__doc__ = Lasso_original.fit.__doc__ predict.__doc__ = Lasso_original.predict.__doc__ - score.__doc__ = Lasso_original.score.__doc__ diff --git a/daal4py/sklearn/linear_model/_linear.py b/daal4py/sklearn/linear_model/_linear.py index acf949d815..2b82b6f8a6 100644 --- a/daal4py/sklearn/linear_model/_linear.py +++ b/daal4py/sklearn/linear_model/_linear.py @@ -19,7 +19,6 @@ from sklearn.linear_model import LinearRegression as LinearRegression_original from sklearn.utils import check_array -from .._device_offload import support_usm_ndarray from .._utils import sklearn_check_version from ..utils.base import _daal_validate_data from ..utils.validation import _daal_check_array @@ -238,7 +237,6 @@ def __init__( positive=positive, ) - @support_usm_ndarray() def fit(self, X, y, sample_weight=None): if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): self._normalize = _deprecate_normalize( @@ -267,7 +265,6 @@ def fit(self, X, y, sample_weight=None): return super(LinearRegression, self).fit(X, y=y, sample_weight=sample_weight) return _fit_linear(self, X, y, sample_weight=sample_weight) - @support_usm_ndarray() def predict(self, X): return _predict_linear(self, X) diff --git a/daal4py/sklearn/linear_model/_ridge.py b/daal4py/sklearn/linear_model/_ridge.py index 7718d91605..037f458407 100644 --- a/daal4py/sklearn/linear_model/_ridge.py +++ b/daal4py/sklearn/linear_model/_ridge.py @@ -25,7 +25,6 @@ import daal4py -from .._device_offload import support_usm_ndarray from .._n_jobs_support import control_n_jobs from .._utils import ( PatchingConditionsChain, @@ -314,16 +313,11 @@ def __init__( self.solver = solver self.random_state = random_state - @support_usm_ndarray() def fit(self, X, y, sample_weight=None): return _fit_ridge(self, X, y, sample_weight=sample_weight) - @support_usm_ndarray() def predict(self, X): return _predict_ridge(self, X) - score = 
support_usm_ndarray()(Ridge_original.score) - fit.__doc__ = Ridge_original.fit.__doc__ predict.__doc__ = Ridge_original.predict.__doc__ - score.__doc__ = Ridge_original.score.__doc__ diff --git a/daal4py/sklearn/linear_model/logistic_path.py b/daal4py/sklearn/linear_model/logistic_path.py index 8ab97ea9d2..519279effb 100755 --- a/daal4py/sklearn/linear_model/logistic_path.py +++ b/daal4py/sklearn/linear_model/logistic_path.py @@ -73,8 +73,6 @@ from sklearn.linear_model._logistic import _logistic_regression_path as lr_path_original from sklearn.preprocessing import LabelBinarizer, LabelEncoder -from .._device_offload import support_usm_ndarray - # Code adapted from sklearn.linear_model.logistic version 0.21 def __logistic_regression_path( @@ -880,7 +878,6 @@ def daal4py_predict(self, X, resultsToEvaluate): return LogisticRegression_original.predict_log_proba(self, X) -@support_usm_ndarray() def logistic_regression_path( X, y, @@ -997,7 +994,6 @@ def __init__( self.n_jobs = n_jobs self.l1_ratio = l1_ratio - @support_usm_ndarray() def fit(self, X, y, sample_weight=None): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) @@ -1005,15 +1001,12 @@ def fit(self, X, y, sample_weight=None): self._validate_params() return daal4py_fit(self, X, y, sample_weight) - @support_usm_ndarray() def predict(self, X): return daal4py_predict(self, X, "computeClassLabels") - @support_usm_ndarray() def predict_log_proba(self, X): return daal4py_predict(self, X, "computeClassLogProbabilities") - @support_usm_ndarray() def predict_proba(self, X): return daal4py_predict(self, X, "computeClassProbabilities") diff --git a/daal4py/sklearn/manifold/_t_sne.py b/daal4py/sklearn/manifold/_t_sne.py index 1b06e7bdd9..614a576b3a 100755 --- a/daal4py/sklearn/manifold/_t_sne.py +++ b/daal4py/sklearn/manifold/_t_sne.py @@ -35,7 +35,6 @@ sklearn_check_version, ) -from .._device_offload import support_usm_ndarray from .._n_jobs_support import control_n_jobs from ..neighbors import NearestNeighbors @@ -47,11 +46,9 @@ class TSNE(BaseTSNE): if sklearn_check_version("1.2"): _parameter_constraints: dict = {**BaseTSNE._parameter_constraints} - @support_usm_ndarray() def fit_transform(self, X, y=None): return super().fit_transform(X, y) - @support_usm_ndarray() def fit(self, X, y=None): return super().fit(X, y) diff --git a/daal4py/sklearn/metrics/_pairwise.py b/daal4py/sklearn/metrics/_pairwise.py index 02a53458fa..432c0d60a1 100755 --- a/daal4py/sklearn/metrics/_pairwise.py +++ b/daal4py/sklearn/metrics/_pairwise.py @@ -45,7 +45,6 @@ def _precompute_metric_params(*args, **kwrds): import daal4py from daal4py.sklearn.utils.validation import _daal_check_array -from .._device_offload import support_usm_ndarray from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version if sklearn_check_version("1.3"): @@ -66,7 +65,6 @@ def _daal4py_correlation_distance_dense(X): return res.correlationDistance -@support_usm_ndarray(freefunc=True) def pairwise_distances( X, Y=None, metric="euclidean", *, n_jobs=None, force_all_finite=True, **kwds ): diff --git a/daal4py/sklearn/metrics/_ranking.py b/daal4py/sklearn/metrics/_ranking.py index c541703148..10343efaee 100644 --- a/daal4py/sklearn/metrics/_ranking.py +++ b/daal4py/sklearn/metrics/_ranking.py @@ -29,7 +29,6 @@ import daal4py as d4p -from .._device_offload import support_usm_ndarray from .._utils import PatchingConditionsChain, get_patch_message, sklearn_check_version from ..utils.validation import _assert_all_finite @@ -119,7 +118,6 @@ def 
_daal_type_of_target(y): return result -@support_usm_ndarray(freefunc=True) def roc_auc_score( y_true, y_score, diff --git a/daal4py/sklearn/model_selection/_split.py b/daal4py/sklearn/model_selection/_split.py index 07c2de72c1..f914f278a0 100644 --- a/daal4py/sklearn/model_selection/_split.py +++ b/daal4py/sklearn/model_selection/_split.py @@ -25,7 +25,6 @@ import daal4py as d4p from daal4py.sklearn._utils import PatchingConditionsChain -from .._device_offload import support_usm_ndarray from .._utils import sklearn_check_version try: @@ -63,7 +62,6 @@ def get_dtypes(data): return None -@support_usm_ndarray(freefunc=True) def train_test_split(*arrays, **options): n_arrays = len(arrays) if n_arrays == 0: diff --git a/daal4py/sklearn/neighbors/_classification.py b/daal4py/sklearn/neighbors/_classification.py index 0a2fc14dca..391b403999 100644 --- a/daal4py/sklearn/neighbors/_classification.py +++ b/daal4py/sklearn/neighbors/_classification.py @@ -24,7 +24,6 @@ ) from sklearn.utils.validation import check_array -from .._device_offload import support_usm_ndarray from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version from ._base import KNeighborsMixin, NeighborsBase, parse_auto_method, prediction_algorithm @@ -124,15 +123,12 @@ def __init__( weights if sklearn_check_version("1.0") else _check_weights(weights) ) - @support_usm_ndarray() def fit(self, X, y): return NeighborsBase._fit(self, X, y) - @support_usm_ndarray() def predict(self, X): return daal4py_classifier_predict(self, X, BaseKNeighborsClassifier.predict) - @support_usm_ndarray() def predict_proba(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) diff --git a/daal4py/sklearn/neighbors/_regression.py b/daal4py/sklearn/neighbors/_regression.py index 2fd0ee7c94..c779875c81 100644 --- a/daal4py/sklearn/neighbors/_regression.py +++ b/daal4py/sklearn/neighbors/_regression.py @@ -19,7 +19,6 @@ from sklearn.base import RegressorMixin from sklearn.neighbors._regression import KNeighborsRegressor as BaseKNeighborsRegressor -from .._device_offload import support_usm_ndarray from .._utils import sklearn_check_version from ._base import KNeighborsMixin, NeighborsBase @@ -63,11 +62,9 @@ def __init__( def _more_tags(self): return BaseKNeighborsRegressor._more_tags(self) - @support_usm_ndarray() def fit(self, X, y): return NeighborsBase._fit(self, X, y) - @support_usm_ndarray() def predict(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) diff --git a/daal4py/sklearn/neighbors/_unsupervised.py b/daal4py/sklearn/neighbors/_unsupervised.py index 0f4855dbea..5945ebf8f0 100644 --- a/daal4py/sklearn/neighbors/_unsupervised.py +++ b/daal4py/sklearn/neighbors/_unsupervised.py @@ -19,7 +19,6 @@ from sklearn.neighbors import NearestNeighbors as BaseNearestNeighbors from sklearn.utils.validation import _deprecate_positional_args -from .._device_offload import support_usm_ndarray from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin @@ -50,7 +49,6 @@ def __init__( n_jobs=n_jobs, ) - @support_usm_ndarray() def fit(self, X, y=None): return NeighborsBase._fit(self, X) diff --git a/daal4py/sklearn/svm/svm.py b/daal4py/sklearn/svm/svm.py index 40d2c70685..38c4f84b6f 100644 --- a/daal4py/sklearn/svm/svm.py +++ b/daal4py/sklearn/svm/svm.py @@ -16,8 +16,6 @@ from __future__ import print_function -import warnings - import numpy as np import sklearn.svm._base as svm_base import sklearn.svm._classes as svm_classes diff --git 
a/daal4py/sklearn/tree/decision_tree.py b/daal4py/sklearn/tree/decision_tree.py index ad669ada4b..bdc81bf642 100644 --- a/daal4py/sklearn/tree/decision_tree.py +++ b/daal4py/sklearn/tree/decision_tree.py @@ -28,7 +28,6 @@ import daal4py as d4p -from .._device_offload import support_usm_ndarray from .._utils import getFPType, make2d @@ -141,7 +140,6 @@ def get_depth(self): ts = self._get_tree_state() return ts.max_depth - @support_usm_ndarray() def fit(self, X, y, sample_weight=None, pruning_set=None): """Build a decision tree classifier from the training set (X, y). @@ -283,14 +281,12 @@ def _daal4py_predict(self, X): res = alg.compute(X, self.daal_model_) return res.prediction.ravel() - @support_usm_ndarray() def predict(self, X, check_input=True): check_is_fitted(self, "daal_model_") X = self._validate_X_predict(X, check_input) y = self._daal4py_predict(X) return self.classes_.take(np.asarray(y, dtype=np.intp), axis=0) - @support_usm_ndarray() def predict_proba(self, X, check_input=True): check_is_fitted(self, "daal_model_") X = self._validate_X_predict(X, check_input) diff --git a/onedal/_config.py b/onedal/_config.py new file mode 100644 index 0000000000..8c93929d87 --- /dev/null +++ b/onedal/_config.py @@ -0,0 +1,53 @@ +# ============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tools to expose some sklearnex's config settings to onedal4py level.""" + +import threading + +_default_global_config = { + "target_offload": "auto", + "allow_fallback_to_host": False, +} + +_threadlocal = threading.local() + + +def _get_onedal_threadlocal_config(): + if not hasattr(_threadlocal, "global_config"): + _threadlocal.global_config = _default_global_config.copy() + return _threadlocal.global_config + + +def _get_config(copy=True): + """Retrieve current values for configuration set + by :func:`sklearnex.set_config` + Parameters + ---------- + copy : bool, default=True + If False, the values ​​of the global config are returned, + which can further be overwritten. + Returns + ------- + config : dict + Keys are parameter names `target_offload` and + `allow_fallback_to_host` that can be passed + to :func:`sklearnex.set_config`. + """ + onedal_config = _get_onedal_threadlocal_config() + if copy: + onedal_config = onedal_config.copy() + return onedal_config diff --git a/onedal/_device_offload.py b/onedal/_device_offload.py index fcb9927b4e..c31979e35c 100644 --- a/onedal/_device_offload.py +++ b/onedal/_device_offload.py @@ -14,28 +14,135 @@ # limitations under the License. 
# ============================================================================== +import logging +from collections.abc import Iterable from functools import wraps +import numpy as np + +from ._config import _get_config + try: - import dpnp + from dpctl import SyclQueue + from dpctl.memory import MemoryUSMDevice, as_usm_memory + from dpctl.tensor import usm_ndarray - dpnp_available = True + dpctl_available = True except ImportError: - dpnp_available = False + dpctl_available = False try: - from sklearnex._device_offload import ( - _copy_to_usm, - _get_global_queue, - _transfer_to_host, - ) + import dpnp - _sklearnex_available = True + dpnp_available = True except ImportError: - import logging + dpnp_available = False + - logging.warning("Device support requires " "Intel(R) Extension for Scikit-learn*.") - _sklearnex_available = False +class DummySyclQueue: + """This class is designed to act like dpctl.SyclQueue + to allow device dispatching in scenarios when dpctl is not available""" + + class DummySyclDevice: + def __init__(self, filter_string): + self._filter_string = filter_string + self.is_cpu = "cpu" in filter_string + self.is_gpu = "gpu" in filter_string + self.has_aspect_fp64 = self.is_cpu + + if not (self.is_cpu): + logging.warning( + "Device support is limited. " + "Please install dpctl for full experience" + ) + + def get_filter_string(self): + return self._filter_string + + def __init__(self, filter_string): + self.sycl_device = self.DummySyclDevice(filter_string) + + +def _copy_to_usm(queue, array): + if not dpctl_available: + raise RuntimeError( + "dpctl need to be installed to work " "with __sycl_usm_array_interface__" + ) + + if hasattr(array, "__array__"): + + try: + mem = MemoryUSMDevice(array.nbytes, queue=queue) + mem.copy_from_host(array.tobytes()) + return usm_ndarray(array.shape, array.dtype, buffer=mem) + except ValueError as e: + # ValueError will raise if device does not support the dtype + # retry with float32 (needed for fp16 and fp64 support issues) + # try again as float32, if it is a float32 just raise the error. 
+ if array.dtype == np.float32: + raise e + return _copy_to_usm(queue, array.astype(np.float32)) + else: + if isinstance(array, Iterable): + array = [_copy_to_usm(queue, i) for i in array] + return array + + +def _transfer_to_host(queue, *data): + has_usm_data, has_host_data = False, False + + host_data = [] + for item in data: + usm_iface = getattr(item, "__sycl_usm_array_interface__", None) + if usm_iface is not None: + if not dpctl_available: + raise RuntimeError( + "dpctl need to be installed to work " + "with __sycl_usm_array_interface__" + ) + if queue is not None: + if queue.sycl_device != usm_iface["syclobj"].sycl_device: + raise RuntimeError( + "Input data shall be located " "on single target device" + ) + else: + queue = usm_iface["syclobj"] + + buffer = as_usm_memory(item).copy_to_host() + order = "C" + if usm_iface["strides"] is not None: + if usm_iface["strides"][0] < usm_iface["strides"][1]: + order = "F" + item = np.ndarray( + shape=usm_iface["shape"], + dtype=usm_iface["typestr"], + buffer=buffer, + order=order, + ) + has_usm_data = True + else: + has_host_data = True + + mismatch_host_item = usm_iface is None and item is not None and has_usm_data + mismatch_usm_item = usm_iface is not None and has_host_data + + if mismatch_host_item or mismatch_usm_item: + raise RuntimeError("Input data shall be located on single target device") + + host_data.append(item) + return queue, host_data + + +def _get_global_queue(): + target = _get_config()["target_offload"] + + QueueClass = DummySyclQueue if not dpctl_available else SyclQueue + + if target != "auto": + if isinstance(target, QueueClass): + return target + return QueueClass(target) + return None def _get_host_inputs(*args, **kwargs): @@ -59,24 +166,30 @@ def _run_on_device(func, obj=None, *args, **kwargs): return func(*args, **kwargs) -def support_usm_ndarray(freefunc=False): +if dpnp_available: + + def _convert_to_dpnp(array): + if isinstance(array, usm_ndarray): + return dpnp.array(array, copy=False) + elif isinstance(array, Iterable): + for i in range(len(array)): + array[i] = _convert_to_dpnp(array[i]) + return array + + +def support_usm_ndarray(freefunc=False, queue_param=True): def decorator(func): def wrapper_impl(obj, *args, **kwargs): - if _sklearnex_available: - usm_iface = _extract_usm_iface(*args, **kwargs) - data_queue, hostargs, hostkwargs = _get_host_inputs(*args, **kwargs) + usm_iface = _extract_usm_iface(*args, **kwargs) + data_queue, hostargs, hostkwargs = _get_host_inputs(*args, **kwargs) + if queue_param: hostkwargs["queue"] = data_queue - result = _run_on_device(func, obj, *hostargs, **hostkwargs) - if usm_iface is not None and hasattr(result, "__array_interface__"): - result = _copy_to_usm(data_queue, result) - if ( - dpnp_available - and len(args) > 0 - and isinstance(args[0], dpnp.ndarray) - ): - result = dpnp.array(result, copy=False) - return result - return _run_on_device(func, obj, *args, **kwargs) + result = _run_on_device(func, obj, *hostargs, **hostkwargs) + if usm_iface is not None and hasattr(result, "__array_interface__"): + result = _copy_to_usm(data_queue, result) + if dpnp_available and len(args) > 0 and isinstance(args[0], dpnp.ndarray): + result = _convert_to_dpnp(result) + return result if freefunc: diff --git a/onedal/common/_policy.py b/onedal/common/_policy.py index d5991606a6..90705854f6 100644 --- a/onedal/common/_policy.py +++ b/onedal/common/_policy.py @@ -18,10 +18,6 @@ from onedal import _backend, _is_dpc_backend -oneapi_is_available = "daal4py.oneapi" in sys.modules -if 
oneapi_is_available: - from daal4py.oneapi import _get_sycl_ctxt, sycl_execution_context - def _get_policy(queue, *data): data_queue = _get_queue(*data) @@ -46,33 +42,18 @@ def _get_queue(*data): return None -class _Daal4PyContextReset: - def __init__(self): - self._d4p_context = None - if oneapi_is_available: - self._d4p_context = _get_sycl_ctxt() - - def __del__(self): - if self._d4p_context: - self._d4p_context.apply() - - class _HostInteropPolicy(_backend.host_policy): def __init__(self): super().__init__() - self._d4p_interop = _Daal4PyContextReset() if _is_dpc_backend: + from onedal._device_offload import DummySyclQueue class _DataParallelInteropPolicy(_backend.data_parallel_policy): def __init__(self, queue): self._queue = queue - self._d4p_interop = _Daal4PyContextReset() - if "sklearnex" in sys.modules: - from sklearnex._device_offload import DummySyclQueue - - if isinstance(queue, DummySyclQueue): - super().__init__(self._queue.sycl_device.get_filter_string()) - return + if isinstance(queue, DummySyclQueue): + super().__init__(self._queue.sycl_device.get_filter_string()) + return super().__init__(self._queue) diff --git a/sklearnex/_config.py b/sklearnex/_config.py index cf65a66b18..bbab9be2bb 100644 --- a/sklearnex/_config.py +++ b/sklearnex/_config.py @@ -14,24 +14,12 @@ # limitations under the License. # ============================================================================== -import threading from contextlib import contextmanager from sklearn import get_config as skl_get_config from sklearn import set_config as skl_set_config -_default_global_config = { - "target_offload": "auto", - "allow_fallback_to_host": False, -} - -_threadlocal = threading.local() - - -def _get_sklearnex_threadlocal_config(): - if not hasattr(_threadlocal, "global_config"): - _threadlocal.global_config = _default_global_config.copy() - return _threadlocal.global_config +from onedal._config import _get_config as onedal_get_config def get_config(): @@ -46,7 +34,7 @@ def get_config(): set_config : Set global configuration. """ sklearn = skl_get_config() - sklearnex = _get_sklearnex_threadlocal_config().copy() + sklearnex = onedal_get_config() return {**sklearn, **sklearnex} @@ -70,7 +58,7 @@ def set_config(target_offload=None, allow_fallback_to_host=None, **sklearn_confi """ skl_set_config(**sklearn_configs) - local_config = _get_sklearnex_threadlocal_config() + local_config = onedal_get_config(copy=False) if target_offload is not None: local_config["target_offload"] = target_offload diff --git a/sklearnex/_device_offload.py b/sklearnex/_device_offload.py index 7f409f521e..fbb862d41e 100644 --- a/sklearnex/_device_offload.py +++ b/sklearnex/_device_offload.py @@ -14,141 +14,21 @@ # limitations under the License. 
# ============================================================================== -import logging -import sys -from collections.abc import Iterable from functools import wraps -import numpy as np +from onedal._device_offload import ( + _copy_to_usm, + _get_global_queue, + _transfer_to_host, + dpnp_available, +) -try: - from dpctl import SyclQueue - from dpctl.memory import MemoryUSMDevice, as_usm_memory - from dpctl.tensor import usm_ndarray - - dpctl_available = True -except ImportError: - dpctl_available = False - -try: +if dpnp_available: import dpnp + from onedal._device_offload import _convert_to_dpnp - dpnp_available = True -except ImportError: - dpnp_available = False from ._config import get_config -from ._utils import get_patch_message - -oneapi_is_available = "daal4py.oneapi" in sys.modules -if oneapi_is_available: - from daal4py.oneapi import _get_device_name_sycl_ctxt, _get_sycl_ctxt_params - - -class DummySyclQueue: - """This class is designed to act like dpctl.SyclQueue - to allow device dispatching in scenarios when dpctl is not available""" - - class DummySyclDevice: - def __init__(self, filter_string): - self._filter_string = filter_string - self.is_cpu = "cpu" in filter_string - self.is_gpu = "gpu" in filter_string - # TODO: check for possibility of fp64 support - # on other devices in this dummy class - self.has_aspect_fp64 = self.is_cpu - - if not (self.is_cpu): - logging.warning( - "Device support is limited. " - "Please install dpctl for full experience" - ) - - def get_filter_string(self): - return self._filter_string - - def __init__(self, filter_string): - self.sycl_device = self.DummySyclDevice(filter_string) - - -def _get_device_info_from_daal4py(): - if oneapi_is_available: - return _get_device_name_sycl_ctxt(), _get_sycl_ctxt_params() - return None, dict() - - -def _get_global_queue(): - target = get_config()["target_offload"] - d4p_target, _ = _get_device_info_from_daal4py() - if d4p_target == "host": - d4p_target = "cpu" - - QueueClass = DummySyclQueue if not dpctl_available else SyclQueue - - if target != "auto": - if d4p_target is not None and d4p_target != target: - if not isinstance(target, str): - if d4p_target not in target.sycl_device.get_filter_string(): - raise RuntimeError( - "Cannot use target offload option " - "inside daal4py.oneapi.sycl_context" - ) - else: - raise RuntimeError( - "Cannot use target offload option " - "inside daal4py.oneapi.sycl_context" - ) - if isinstance(target, QueueClass): - return target - return QueueClass(target) - if d4p_target is not None: - return QueueClass(d4p_target) - return None - - -def _transfer_to_host(queue, *data): - has_usm_data, has_host_data = False, False - - host_data = [] - for item in data: - usm_iface = getattr(item, "__sycl_usm_array_interface__", None) - if usm_iface is not None: - if not dpctl_available: - raise RuntimeError( - "dpctl need to be installed to work " - "with __sycl_usm_array_interface__" - ) - if queue is not None: - if queue.sycl_device != usm_iface["syclobj"].sycl_device: - raise RuntimeError( - "Input data shall be located " "on single target device" - ) - else: - queue = usm_iface["syclobj"] - - buffer = as_usm_memory(item).copy_to_host() - order = "C" - if usm_iface["strides"] is not None: - if usm_iface["strides"][0] < usm_iface["strides"][1]: - order = "F" - item = np.ndarray( - shape=usm_iface["shape"], - dtype=usm_iface["typestr"], - buffer=buffer, - order=order, - ) - has_usm_data = True - else: - has_host_data = True - - mismatch_host_item = usm_iface is None and item is 
not None and has_usm_data - mismatch_usm_item = usm_iface is not None and has_host_data - - if mismatch_host_item or mismatch_usm_item: - raise RuntimeError("Input data shall be located on single target device") - - host_data.append(item) - return queue, host_data def _get_backend(obj, queue, method_name, *data): @@ -162,10 +42,7 @@ def _get_backend(obj, queue, method_name, *data): else: return "sklearn", None, patching_status - _, d4p_options = _get_device_info_from_daal4py() - allow_fallback_to_host = get_config()["allow_fallback_to_host"] or d4p_options.get( - "host_offload_on_fail", False - ) + allow_fallback_to_host = get_config()["allow_fallback_to_host"] if gpu_device: patching_status = obj._onedal_gpu_supported(method_name, *data) @@ -203,42 +80,6 @@ def dispatch(obj, method_name, branches, *args, **kwargs): ) -def _copy_to_usm(queue, array): - if not dpctl_available: - raise RuntimeError( - "dpctl need to be installed to work " "with __sycl_usm_array_interface__" - ) - - if hasattr(array, "__array__"): - - try: - mem = MemoryUSMDevice(array.nbytes, queue=queue) - mem.copy_from_host(array.tobytes()) - return usm_ndarray(array.shape, array.dtype, buffer=mem) - except ValueError as e: - # ValueError will raise if device does not support the dtype - # retry with float32 (needed for fp16 and fp64 support issues) - # try again as float32, if it is a float32 just raise the error. - if array.dtype == np.float32: - raise e - return _copy_to_usm(queue, array.astype(np.float32)) - else: - if isinstance(array, Iterable): - array = [_copy_to_usm(queue, i) for i in array] - return array - - -if dpnp_available: - - def _convert_to_dpnp(array): - if isinstance(array, usm_ndarray): - return dpnp.array(array, copy=False) - elif isinstance(array, Iterable): - for i in range(len(array)): - array[i] = _convert_to_dpnp(array[i]) - return array - - def wrap_output_data(func): @wraps(func) def wrapper(self, *args, **kwargs): diff --git a/sklearnex/cluster/dbscan.py b/sklearnex/cluster/dbscan.py index db67f7cf6b..f8d080cfbe 100755 --- a/sklearnex/cluster/dbscan.py +++ b/sklearnex/cluster/dbscan.py @@ -17,7 +17,6 @@ import numbers from abc import ABC -import numpy as np from scipy import sparse as sp from sklearn.cluster import DBSCAN as sklearn_DBSCAN from sklearn.utils.validation import _check_sample_weight diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 41171730b6..eac4c22bce 100755 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -15,3 +15,9 @@ # =============================================================================== from daal4py.sklearn.cluster import KMeans +from onedal._device_offload import support_usm_ndarray + +KMeans.fit = support_usm_ndarray(queue_param=False)(KMeans.fit) +KMeans.fit_predict = support_usm_ndarray(queue_param=False)(KMeans.fit_predict) +KMeans.predict = support_usm_ndarray(queue_param=False)(KMeans.predict) +KMeans.score = support_usm_ndarray(queue_param=False)(KMeans.score) diff --git a/sklearnex/covariance/incremental_covariance.py b/sklearnex/covariance/incremental_covariance.py index 63b1316fc9..75f1f8bd7e 100644 --- a/sklearnex/covariance/incremental_covariance.py +++ b/sklearnex/covariance/incremental_covariance.py @@ -25,7 +25,6 @@ from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import daal_check_version, sklearn_check_version -from onedal._device_offload import support_usm_ndarray from onedal.covariance import ( IncrementalEmpiricalCovariance as 
onedal_IncrementalEmpiricalCovariance, ) diff --git a/sklearnex/dispatcher.py b/sklearnex/dispatcher.py index 2a4a085ac0..8bac516041 100644 --- a/sklearnex/dispatcher.py +++ b/sklearnex/dispatcher.py @@ -111,6 +111,9 @@ def get_patch_map_core(preview=False): import sklearn.decomposition as decomposition_module import sklearn.ensemble as ensemble_module import sklearn.linear_model as linear_model_module + import sklearn.manifold as manifold_module + import sklearn.metrics as metrics_module + import sklearn.model_selection as model_selection_module import sklearn.neighbors as neighbors_module import sklearn.svm as svm_module @@ -138,11 +141,18 @@ def get_patch_map_core(preview=False): from .ensemble import ExtraTreesRegressor as ExtraTreesRegressor_sklearnex from .ensemble import RandomForestClassifier as RandomForestClassifier_sklearnex from .ensemble import RandomForestRegressor as RandomForestRegressor_sklearnex + from .linear_model import ElasticNet as ElasticNet_sklearnex from .linear_model import ( IncrementalLinearRegression as IncrementalLinearRegression_sklearnex, ) + from .linear_model import Lasso as Lasso_sklearnex from .linear_model import LinearRegression as LinearRegression_sklearnex from .linear_model import LogisticRegression as LogisticRegression_sklearnex + from .linear_model import Ridge as Ridge_sklearnex + from .manifold import TSNE as TSNE_sklearnex + from .metrics import pairwise_distances as pairwise_distances_sklearnex + from .metrics import roc_auc_score as roc_auc_score_sklearnex + from .model_selection import train_test_split as train_test_split_sklearnex from .neighbors import KNeighborsClassifier as KNeighborsClassifier_sklearnex from .neighbors import KNeighborsRegressor as KNeighborsRegressor_sklearnex from .neighbors import LocalOutlierFactor as LocalOutlierFactor_sklearnex @@ -168,6 +178,32 @@ def get_patch_map_core(preview=False): mapping["nusvr"] = [[(svm_module, "NuSVR", NuSVR_sklearnex), None]] mapping["nusvc"] = [[(svm_module, "NuSVC", NuSVC_sklearnex), None]] + # ElasticNet + mapping.pop("elasticnet") + mapping["elasticnet"] = [ + [ + ( + linear_model_module, + "ElasticNet", + ElasticNet_sklearnex, + ), + None, + ] + ] + + # Lasso + mapping.pop("lasso") + mapping["lasso"] = [ + [ + ( + linear_model_module, + "Lasso", + Lasso_sklearnex, + ), + None, + ] + ] + # Linear Regression mapping.pop("linear") mapping.pop("linearregression") @@ -201,6 +237,54 @@ def get_patch_map_core(preview=False): ] mapping["logisticregression"] = mapping["log_reg"] + # Ridge + mapping.pop("ridge") + mapping["ridge"] = [ + [ + ( + linear_model_module, + "Ridge", + Ridge_sklearnex, + ), + None, + ] + ] + + # manifold + mapping.pop("tsne") + mapping["tsne"] = [ + [ + (manifold_module, "TSNE", TSNE_sklearnex), + None, + ] + ] + + # metrics + mapping.pop("distances") + mapping.pop("roc_auc_score") + mapping["distances"] = [ + [ + (metrics_module, "pairwise_distances", pairwise_distances_sklearnex), + None, + ] + ] + mapping["pairwise_distances"] = mapping["distances"] + mapping["roc_auc_score"] = [ + [ + (metrics_module, "roc_auc_score", roc_auc_score_sklearnex), + None, + ] + ] + + # model_selection + mapping.pop("train_test_split") + mapping["train_test_split"] = [ + [ + (model_selection_module, "train_test_split", train_test_split_sklearnex), + None, + ] + ] + # kNN mapping.pop("knn_classifier") mapping.pop("kneighborsclassifier") diff --git a/sklearnex/linear_model/coordinate_descent.py b/sklearnex/linear_model/coordinate_descent.py index 3fd6abd7b3..e7a4b5861e 100644 --- 
a/sklearnex/linear_model/coordinate_descent.py +++ b/sklearnex/linear_model/coordinate_descent.py @@ -15,3 +15,12 @@ # =============================================================================== from daal4py.sklearn.linear_model import ElasticNet, Lasso +from onedal._device_offload import support_usm_ndarray + +ElasticNet.fit = support_usm_ndarray(queue_param=False)(ElasticNet.fit) +ElasticNet.predict = support_usm_ndarray(queue_param=False)(ElasticNet.predict) +ElasticNet.score = support_usm_ndarray(queue_param=False)(ElasticNet.score) + +Lasso.fit = support_usm_ndarray(queue_param=False)(Lasso.fit) +Lasso.predict = support_usm_ndarray(queue_param=False)(Lasso.predict) +Lasso.score = support_usm_ndarray(queue_param=False)(Lasso.score) diff --git a/sklearnex/linear_model/ridge.py b/sklearnex/linear_model/ridge.py index 59222deaa8..97eae002b8 100644 --- a/sklearnex/linear_model/ridge.py +++ b/sklearnex/linear_model/ridge.py @@ -15,3 +15,8 @@ # =============================================================================== from daal4py.sklearn.linear_model import Ridge +from onedal._device_offload import support_usm_ndarray + +Ridge.fit = support_usm_ndarray(queue_param=False)(Ridge.fit) +Ridge.predict = support_usm_ndarray(queue_param=False)(Ridge.predict) +Ridge.score = support_usm_ndarray(queue_param=False)(Ridge.score) diff --git a/sklearnex/manifold/t_sne.py b/sklearnex/manifold/t_sne.py index d74c91ffee..3e5c99f43f 100755 --- a/sklearnex/manifold/t_sne.py +++ b/sklearnex/manifold/t_sne.py @@ -15,3 +15,7 @@ # =============================================================================== from daal4py.sklearn.manifold import TSNE +from onedal._device_offload import support_usm_ndarray + +TSNE.fit = support_usm_ndarray(queue_param=False)(TSNE.fit) +TSNE.fit_transform = support_usm_ndarray(queue_param=False)(TSNE.fit_transform) diff --git a/sklearnex/metrics/pairwise.py b/sklearnex/metrics/pairwise.py index e42ca6ae65..041e3b5b8d 100755 --- a/sklearnex/metrics/pairwise.py +++ b/sklearnex/metrics/pairwise.py @@ -15,3 +15,8 @@ # =============================================================================== from daal4py.sklearn.metrics import pairwise_distances +from onedal._device_offload import support_usm_ndarray + +pairwise_distances = support_usm_ndarray(freefunc=True, queue_param=False)( + pairwise_distances +) diff --git a/sklearnex/metrics/ranking.py b/sklearnex/metrics/ranking.py index 7b424b8419..b282bb3f94 100755 --- a/sklearnex/metrics/ranking.py +++ b/sklearnex/metrics/ranking.py @@ -15,3 +15,6 @@ # =============================================================================== from daal4py.sklearn.metrics import roc_auc_score +from onedal._device_offload import support_usm_ndarray + +roc_auc_score = support_usm_ndarray(freefunc=True, queue_param=False)(roc_auc_score) diff --git a/sklearnex/model_selection/split.py b/sklearnex/model_selection/split.py index 1a2adaa4c7..db20ca17b8 100755 --- a/sklearnex/model_selection/split.py +++ b/sklearnex/model_selection/split.py @@ -15,3 +15,6 @@ # =============================================================================== from daal4py.sklearn.model_selection import train_test_split +from onedal._device_offload import support_usm_ndarray + +train_test_split = support_usm_ndarray(freefunc=True, queue_param=False)(train_test_split) diff --git a/sklearnex/tests/test_config.py b/sklearnex/tests/test_config.py index fc8fd2df3d..5ff30587d7 100644 --- a/sklearnex/tests/test_config.py +++ b/sklearnex/tests/test_config.py @@ -16,6 +16,7 @@ 
import sklearn +import onedal import sklearnex @@ -33,7 +34,10 @@ def test_set_config_works(): ) config = sklearnex.get_config() + onedal_config = onedal._config._get_config() assert config["target_offload"] == "cpu:0" assert config["allow_fallback_to_host"] assert config["assume_finite"] + assert onedal_config["target_offload"] == "cpu:0" + assert onedal_config["allow_fallback_to_host"] sklearnex.set_config(**default_config) From 23a0197af1a5e5a06f2be3c370120e5d0df49847 Mon Sep 17 00:00:00 2001 From: Ben Moore Date: Thu, 27 Jun 2024 05:49:36 -0700 Subject: [PATCH 42/75] Update INSTALL.md (#1883) Remove option for Anaconda main channel as the Intel license has now expired. --- INSTALL.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 63930c611b..9717d497c0 100755 --- a/INSTALL.md +++ b/INSTALL.md @@ -26,7 +26,6 @@ To install Intel(R) Extension for Scikit-learn*, use one of the following scenar - [Install from Anaconda Cloud](#install-from-anaconda-cloud) - [Install via Anaconda Cloud from Conda-Forge Channel](#install-via-anaconda-cloud-from-conda-forge-channel) - [Install via Anaconda Cloud from Intel Channel](#install-via-anaconda-cloud-from-intel-channel) - - [Install via Anaconda Cloud from Main Channel](#install-via-anaconda-cloud-from-main-channel) - [Build from Sources](#build-from-sources) - [Prerequisites](#prerequisites) - [Configure the Build with Environment Variables](#configure-the-build-with-environment-variables) @@ -52,7 +51,6 @@ Applicable for: * PyPI * Anaconda Cloud from Conda-Forge Channel * Anaconda Cloud from Intel Channel -* Anaconda Cloud from Main Channel @@ -128,16 +126,6 @@ We recommend this installation for the users of Intel® Distribution for Python. conda install scikit-learn-intelex ``` -### Install via Anaconda Cloud from Main Channel - -> **_NOTE:_** You may not find the latest version on the Anaconda Main channel since it usually lags on versions deployed. - -- Install into a newly created environment (recommended): - - ```bash - conda create -n env python=3.10 scikit-learn-intelex - ``` - > **_NOTE:_** If you do not specify the version of Python, the latest one is downloaded. 
- Install into your current environment: From 421e377615e599137e9c98ca51e90da350baad6b Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Thu, 27 Jun 2024 09:54:28 -0700 Subject: [PATCH 43/75] Update dependency pybind11 to v2.13.1 (#1899) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- dependencies-dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies-dev b/dependencies-dev index 650c1f5eb1..043e53ae71 100644 --- a/dependencies-dev +++ b/dependencies-dev @@ -2,5 +2,5 @@ Cython==3.0.10 Jinja2==3.1.4 numpy==1.19.5 ; python_version < '3.9' numpy==2.0.0 ; python_version >= '3.9' -pybind11==2.13.0 +pybind11==2.13.1 cmake==3.29.6 From da93490586e6a605750703c1c8d42bc4e46c0350 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Thu, 27 Jun 2024 11:09:32 -0700 Subject: [PATCH 44/75] Update dependency shap to v0.46.0 (#1900) * Update dependency shap to v0.46.0 * Checking numpy 2.0 compatibility --------- Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com> --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 64af54f1bf..42ca0e9ff1 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -13,5 +13,5 @@ xgboost==2.1.0 lightgbm==4.4.0 catboost==1.2.5 ; python_version < '3.11' # TODO: Remove 3.11 condition when catboost supports numpy 2.0 shap==0.44.1 ; python_version == '3.8' -shap==0.45.1 ; python_version >= '3.9' and python_version < '3.11' # TODO: Remove 3.12 condition when shap/numba support numpy 2.0 +shap==0.46.0 ; python_version >= '3.9' array-api-strict==1.1.1 ; python_version >= '3.9' From a4d6c70422fa63c63f795991feb3f6b3779cad7e Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Fri, 28 Jun 2024 11:05:37 +0200 Subject: [PATCH 45/75] TEST: enabled different dataframe testing for linear models: `ElasticNet`, `Lasso`, `Ridge` (#1891) * TEST: enabled different dataframe testing for linear models * enabled dataframe testing for ElasticNet, Lasso, Ridge * Added comments about linear models GPU support --- sklearnex/linear_model/coordinate_descent.py | 4 ++++ sklearnex/linear_model/ridge.py | 2 ++ sklearnex/linear_model/tests/test_linear.py | 15 ++++++++++++--- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/sklearnex/linear_model/coordinate_descent.py b/sklearnex/linear_model/coordinate_descent.py index e7a4b5861e..f117d9c086 100644 --- a/sklearnex/linear_model/coordinate_descent.py +++ b/sklearnex/linear_model/coordinate_descent.py @@ -17,10 +17,14 @@ from daal4py.sklearn.linear_model import ElasticNet, Lasso from onedal._device_offload import support_usm_ndarray +# Note: `sklearnex.linear_model.ElasticNet` only has functional +# sycl GPU support. No GPU device will be offloaded. ElasticNet.fit = support_usm_ndarray(queue_param=False)(ElasticNet.fit) ElasticNet.predict = support_usm_ndarray(queue_param=False)(ElasticNet.predict) ElasticNet.score = support_usm_ndarray(queue_param=False)(ElasticNet.score) +# Note: `sklearnex.linear_model.Lasso` only has functional +# sycl GPU support. No GPU device will be offloaded.
Lasso.fit = support_usm_ndarray(queue_param=False)(Lasso.fit) Lasso.predict = support_usm_ndarray(queue_param=False)(Lasso.predict) Lasso.score = support_usm_ndarray(queue_param=False)(Lasso.score) diff --git a/sklearnex/linear_model/ridge.py b/sklearnex/linear_model/ridge.py index 97eae002b8..53a0a0702f 100644 --- a/sklearnex/linear_model/ridge.py +++ b/sklearnex/linear_model/ridge.py @@ -17,6 +17,8 @@ from daal4py.sklearn.linear_model import Ridge from onedal._device_offload import support_usm_ndarray +# Note: `sklearnex.linear_model.Ridge` only has functional +# sycl GPU support. No GPU device will be offloaded. Ridge.fit = support_usm_ndarray(queue_param=False)(Ridge.fit) Ridge.predict = support_usm_ndarray(queue_param=False)(Ridge.predict) Ridge.score = support_usm_ndarray(queue_param=False)(Ridge.score) diff --git a/sklearnex/linear_model/tests/test_linear.py b/sklearnex/linear_model/tests/test_linear.py index b46d2ab315..b87a701960 100644 --- a/sklearnex/linear_model/tests/test_linear.py +++ b/sklearnex/linear_model/tests/test_linear.py @@ -57,32 +57,41 @@ def test_sklearnex_import_linear(dataframe, queue, dtype, macro_block): assert_allclose(_as_numpy(linreg.coef_), [1.0, 2.0], rtol=tol) -def test_sklearnex_import_ridge(): +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import_ridge(dataframe, queue): from sklearnex.linear_model import Ridge X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) y = np.dot(X, np.array([1, 2])) + 3 + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) ridgereg = Ridge().fit(X, y) assert "daal4py" in ridgereg.__module__ assert_allclose(ridgereg.intercept_, 4.5) assert_allclose(ridgereg.coef_, [0.8, 1.4]) -def test_sklearnex_import_lasso(): +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import_lasso(dataframe, queue): from sklearnex.linear_model import Lasso X = [[0, 0], [1, 1], [2, 2]] y = [0, 1, 2] + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) lasso = Lasso(alpha=0.1).fit(X, y) assert "daal4py" in lasso.__module__ assert_allclose(lasso.intercept_, 0.15) assert_allclose(lasso.coef_, [0.85, 0.0]) -def test_sklearnex_import_elastic(): +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import_elastic(dataframe, queue): from sklearnex.linear_model import ElasticNet X, y = make_regression(n_features=2, random_state=0) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) elasticnet = ElasticNet(random_state=0).fit(X, y) assert "daal4py" in elasticnet.__module__ assert_allclose(elasticnet.intercept_, 1.451, atol=1e-3) From 08f941803009f3b6f0dad401529a28a20b8f1492 Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Fri, 28 Jun 2024 18:55:33 +0200 Subject: [PATCH 46/75] CI: bump up dpnp/dpctl versions (#1903) update dpnp to 0.15 version update dpctl to 0.17 version --- .ci/pipeline/build-and-test-lnx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml index f67a7607a4..7f9c8ae3a2 100644 --- a/.ci/pipeline/build-and-test-lnx.yml +++ b/.ci/pipeline/build-and-test-lnx.yml @@ -46,7 +46,7 @@ steps: bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION) pip install --upgrade -r 
requirements-test.txt pip install $(python .ci/scripts/get_compatible_scipy_version.py) - if [ $(echo $(PYTHON_VERSION) | grep '3.9\|3.10') ] && [ $(SKLEARN_VERSION) != "1.0" ]; then conda install -q -y -c intel dpctl=0.16.0 dpnp=0.14.0; fi + if [ $(echo $(PYTHON_VERSION) | grep '3.9\|3.10\|3.11') ] && [ $(SKLEARN_VERSION) != "1.0" ]; then conda install -q -y -c intel dpctl=0.17.0 dpnp=0.15.0; fi pip list displayName: "Install testing requirements" - script: | From 066444016912942bb3642b2a97c1f19c714f025f Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Mon, 1 Jul 2024 12:38:43 +0200 Subject: [PATCH 47/75] MAINT: update pyproject.toml file (#1905) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 290e5f2056..3255e3fa58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ [tool.black] line-length = 90 -target-version = ['py37', 'py38', 'py39', 'py310', 'py311'] +target-version = ['py39', 'py310', 'py311', 'py312'] extend-ignore = 'E203' [tool.isort] From cf0abbf57104ca8e2c0698dca78c39c4943a8314 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 2 Jul 2024 19:26:53 +0200 Subject: [PATCH 48/75] [enhancement] address new warning in LocalOutlierFactor in sklearn 1.6 (#1894) * Update _lof.py * Add explaining comment with sklearn PR link --------- Co-authored-by: Alexander Andreev --- sklearnex/neighbors/_lof.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 29bf61df7e..0a9a72ee1e 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -97,6 +97,15 @@ def _onedal_fit(self, X, y, queue=None): self.negative_outlier_factor_, 100.0 * self.contamination ) + # adoption of warning for data with duplicated samples from + # https://github.com/scikit-learn/scikit-learn/pull/28773 + if sklearn_check_version("1.6"): + if np.min(self.negative_outlier_factor_) < -1e7 and not self.novelty: + warnings.warn( + "Duplicate values are leading to incorrect results. " + "Increase the number of neighbors for more accurate results." + ) + return self def fit(self, X, y=None): From 441873b5627286a6ef50ffc3149535d2698327ff Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Wed, 3 Jul 2024 10:54:17 +0200 Subject: [PATCH 49/75] TEST: enable import tests for dataframes testing in `sklearnex.cluster.Kmeans` (#1888) * TEST: enable import tests for dataframes testing in sklearnex.cluster.KMeans * using pytest.skip instead of device_filter * enabled GPU testing added comment about GPU offloading --- sklearnex/cluster/k_means.py | 2 ++ sklearnex/cluster/tests/test_kmeans.py | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index eac4c22bce..eb94fb5e9f 100755 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -17,6 +17,8 @@ from daal4py.sklearn.cluster import KMeans from onedal._device_offload import support_usm_ndarray +# Note: `sklearnex.cluster.KMeans` only has functional +# sycl GPU support. No GPU device will be offloaded. 
KMeans.fit = support_usm_ndarray(queue_param=False)(KMeans.fit) KMeans.fit_predict = support_usm_ndarray(queue_param=False)(KMeans.fit_predict) KMeans.predict = support_usm_ndarray(queue_param=False)(KMeans.predict) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 0424ee9e82..4555268c8b 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -15,16 +15,28 @@ # =============================================================================== import numpy as np +import pytest from numpy.testing import assert_allclose +from onedal.tests.utils._dataframes_support import ( + _as_numpy, + _convert_to_dataframe, + get_dataframes_and_queues, +) + + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import(dataframe, queue): -def test_sklearnex_import(): from sklearnex.cluster import KMeans X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) kmeans = KMeans(n_clusters=2, random_state=0).fit(X) assert "daal4py" in kmeans.__module__ - result = kmeans.predict([[0, 0], [12, 3]]) + X_test = [[0, 0], [12, 3]] + X_test = _convert_to_dataframe(X_test, sycl_queue=queue, target_df=dataframe) + result = kmeans.predict(X_test) expected = np.array([1, 0], dtype=np.int32) - assert_allclose(expected, result) + assert_allclose(expected, _as_numpy(result)) From 2cd3be447f80da891c3275c9509d7b66ec2ab339 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 3 Jul 2024 11:46:18 +0100 Subject: [PATCH 50/75] Change SVM._fit_proba implementation (#1879) * Change SVM._fit_proba implementation * Add TODO comment --- sklearnex/svm/_common.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/sklearnex/svm/_common.py b/sklearnex/svm/_common.py index e0e7d8c939..7c693365c3 100644 --- a/sklearnex/svm/_common.py +++ b/sklearnex/svm/_common.py @@ -22,7 +22,6 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import r2_score -from sklearn.model_selection import StratifiedKFold from sklearn.preprocessing import LabelEncoder from daal4py.sklearn._utils import sklearn_check_version @@ -230,6 +229,7 @@ def _compute_balanced_class_weight(self, y): return recip_freq[le.transform(classes)] def _fit_proba(self, X, y, sample_weight=None, queue=None): + # TODO: rewrite this method when probabilities output is implemented in oneDAL params = self.get_params() params["probability"] = False params["decision_function_shape"] = "ovr" @@ -240,26 +240,13 @@ def _fit_proba(self, X, y, sample_weight=None, queue=None): cfg = get_config() cfg["target_offload"] = queue with config_context(**cfg): - try: - n_splits = 5 - n_jobs = n_splits if queue is None or queue.sycl_device.is_cpu else 1 - cv = StratifiedKFold( - n_splits=n_splits, shuffle=True, random_state=self.random_state - ) - self.clf_prob = CalibratedClassifierCV( - clf_base, - ensemble=False, - cv=cv, - method="sigmoid", - ) - self.clf_prob.fit(X, y, sample_weight) - - except ValueError: - clf_base = clf_base.fit(X, y, sample_weight) - self.clf_prob = CalibratedClassifierCV( - clf_base, cv="prefit", method="sigmoid" - ) - self.clf_prob.fit(X, y, sample_weight) + clf_base.fit(X, y) + self.clf_prob = CalibratedClassifierCV( + clf_base, + ensemble=False, + cv="prefit", + method="sigmoid", + ).fit(X, y) def _save_attributes(self): 
self.support_vectors_ = self._onedal_estimator.support_vectors_ From ea8c58359cde7578042139d2ad1263b219f212cd Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Thu, 4 Jul 2024 07:28:44 -0700 Subject: [PATCH 51/75] CI: conda channel revisions for public CI (#1915) --- .ci/pipeline/build-and-test-lnx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml index 7f9c8ae3a2..93651f2656 100644 --- a/.ci/pipeline/build-and-test-lnx.yml +++ b/.ci/pipeline/build-and-test-lnx.yml @@ -24,7 +24,7 @@ steps: displayName: "System info" - script: | conda update -y -q conda - conda create -q -y -n CB -c conda-forge -c intel python=$(PYTHON_VERSION) intel::dal-devel mpich pyyaml "dpcpp-cpp-rt=2024.2.0" + conda create -q -y -n CB -c conda-forge python=$(PYTHON_VERSION) intel::dal-devel mpich pyyaml "dpcpp-cpp-rt=2024.2.0" displayName: "Conda create" - script: | . /usr/share/miniconda/etc/profile.d/conda.sh From aaad3870551d3a5f729818cdb343390d3f941196 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 4 Jul 2024 17:10:47 +0100 Subject: [PATCH 52/75] Fix for Logistic Regression loss scaling (#1908) * Fix for LogReg loss scaling * Correct deselected test name --- daal4py/sklearn/linear_model/logistic_path.py | 24 +++++++++++++------ deselected_tests.yaml | 6 +++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/daal4py/sklearn/linear_model/logistic_path.py b/daal4py/sklearn/linear_model/logistic_path.py index 519279effb..9d993eae17 100755 --- a/daal4py/sklearn/linear_model/logistic_path.py +++ b/daal4py/sklearn/linear_model/logistic_path.py @@ -424,6 +424,10 @@ def __logistic_regression_path( (classes.size, n_features + int(fit_intercept)), order="F", dtype=X.dtype ) + # Adoption of https://github.com/scikit-learn/scikit-learn/pull/26721 + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + sw_sum = len(X) if sample_weight is None else np.sum(sample_weight) + if coef is not None: # it must work both giving the bias term and not if multi_class == "ovr": @@ -590,7 +594,7 @@ def grad(x, *args): X, target, 0.0, - 1.0 / (2 * C * C_daal_multiplier), + 1.0 / (2 * C * C_daal_multiplier * sw_sum), fit_intercept, value=True, gradient=True, @@ -598,10 +602,10 @@ def grad(x, *args): ) else: if sklearn_check_version("1.1"): - l2_reg_strength = 1.0 / C + l2_reg_strength = 1.0 / (C * sw_sum) extra_args = (X, target, sample_weight, l2_reg_strength, n_threads) else: - extra_args = (X, target, 1.0 / C, sample_weight) + extra_args = (X, target, 1.0 / (C * sw_sum), sample_weight) iprint = [-1, 50, 1, 100, 101][ np.searchsorted(np.array([0, 1, 2, 3]), verbose) @@ -612,7 +616,13 @@ def grad(x, *args): method="L-BFGS-B", jac=True, args=extra_args, - options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}, + options={ + "maxiter": max_iter, + "maxls": 50, + "iprint": iprint, + "gtol": tol, + "ftol": 64 * np.finfo(float).eps, + }, ) n_iter_i = _check_optimize_result( solver, @@ -627,7 +637,7 @@ def grad(x, *args): if _dal_ready: def make_ncg_funcs(f, value=False, gradient=False, hessian=False): - daal_penaltyL2 = 1.0 / (2 * C * C_daal_multiplier) + daal_penaltyL2 = 1.0 / (2 * C * C_daal_multiplier * sw_sum) _obj_, X_, y_, n_samples = daal_extra_args_func( classes.size, w0, @@ -660,10 +670,10 @@ def _func_(x, *args): ) else: if sklearn_check_version("1.1"): - l2_reg_strength = 1.0 / C + l2_reg_strength = 1.0 / (C * sw_sum) args = (X, target, sample_weight, 
l2_reg_strength, n_threads) else: - args = (X, target, 1.0 / C, sample_weight) + args = (X, target, 1.0 / (C * sw_sum), sample_weight) w0, n_iter_i = _newton_cg( hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol diff --git a/deselected_tests.yaml b/deselected_tests.yaml index b1005cfa37..344247a9fe 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -190,6 +190,12 @@ deselected_tests: # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 - linear_model/tests/test_logistic.py::test_dtype_match + # Logistic Regression coeffs change due to fix for loss scaling + # (https://github.com/scikit-learn/scikit-learn/pull/26721) + - feature_selection/tests/test_from_model.py::test_importance_getter[estimator0-named_steps.logisticregression.coef_] + - inspection/_plot/tests/test_boundary_decision_display.py::test_class_of_interest_binary[predict_proba] + - linear_model/tests/test_sag.py::test_sag_pobj_matches_logistic_regression + # This fails on certain platforms. While weighted data does not go through DAAL, # unweighted does. Since convergence does not occur (comment in the test # suggests that) and because coefficients are slightly different, From 1c691dc66024f390b7bc735f73b8fbc003acd73c Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Thu, 4 Jul 2024 17:11:21 +0100 Subject: [PATCH 53/75] Update dependency scikit-learn to v1.5.1 (#1912) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 42ca0e9ff1..7676434e32 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,7 +5,7 @@ numpy>=1.21.6 ; python_version == '3.10' numpy>=1.23.5 ; python_version == '3.11' numpy>=2.0.0 ; python_version >= '3.12' scikit-learn==1.2.2 ; python_version == '3.8' -scikit-learn==1.5.0 ; python_version >= '3.9' +scikit-learn==1.5.1 ; python_version >= '3.9' pandas==2.0.3 ; python_version == '3.8' pandas==2.1.3 ; python_version >= '3.9' and python_version < '3.11' pandas==2.2.2 ; python_version >= '3.11' From 1b4fae2442abcf12269fcf710a13a4c706889945 Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Thu, 4 Jul 2024 19:25:44 +0200 Subject: [PATCH 54/75] TEST: using `pytest.skip` instead of `device_filter` (#1892) * TEST: using pytest.skip instead of device_filter * Update onedal/datatypes/tests/test_data.py * updated for test_random_forest * added docstrings to tests utilities added docstring for get_queues added docstring for get_dataframes_and_queues * removed incorrect comment --- onedal/datatypes/tests/test_data.py | 25 +++++++++------ onedal/ensemble/tests/test_random_forest.py | 13 ++++---- .../tests/test_knn_classification.py | 5 +-- .../primitives/tests/test_kernel_functions.py | 10 +++--- onedal/svm/tests/test_svc.py | 10 +++--- onedal/tests/utils/_dataframes_support.py | 29 +++++++++++++++++ onedal/tests/utils/_device_selection.py | 19 +++++++++++ sklearnex/ensemble/tests/test_forest.py | 21 ++++++------ sklearnex/linear_model/tests/test_logreg.py | 3 +- sklearnex/svm/tests/test_svm.py | 32 +++++++------------ 10 files changed, 107 insertions(+), 60 deletions(-) diff --git a/onedal/datatypes/tests/test_data.py b/onedal/datatypes/tests/test_data.py index 97c4b9ccf2..17e182bc7a 100644 --- a/onedal/datatypes/tests/test_data.py +++ b/onedal/datatypes/tests/test_data.py @@ -46,10 +46,11 @@ def 
_test_input_format_c_contiguous_numpy(queue, dtype): assert_allclose(expected, result) -# TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", get_queues()) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_c_contiguous_numpy(queue, dtype): + if queue and queue.sycl_device.is_gpu: + pytest.skip("Sporadic failures on GPU sycl_queue.") _test_input_format_c_contiguous_numpy(queue, dtype) @@ -67,10 +68,11 @@ def _test_input_format_f_contiguous_numpy(queue, dtype): assert_allclose(expected, result) -# TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", get_queues()) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_f_contiguous_numpy(queue, dtype): + if queue and queue.sycl_device.is_gpu: + pytest.skip("Sporadic failures on GPU sycl_queue.") _test_input_format_f_contiguous_numpy(queue, dtype) @@ -92,10 +94,11 @@ def _test_input_format_c_not_contiguous_numpy(queue, dtype): assert_allclose(expected, result) -# TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", get_queues()) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_c_not_contiguous_numpy(queue, dtype): + if queue and queue.sycl_device.is_gpu: + pytest.skip("Sporadic failures on GPU sycl_queue.") _test_input_format_c_not_contiguous_numpy(queue, dtype) @@ -115,10 +118,11 @@ def _test_input_format_c_contiguous_pandas(queue, dtype): assert_allclose(expected, result) -# TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", get_queues()) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_c_contiguous_pandas(queue, dtype): + if queue and queue.sycl_device.is_gpu: + pytest.skip("Sporadic failures on GPU sycl_queue.") _test_input_format_c_contiguous_pandas(queue, dtype) @@ -138,10 +142,11 @@ def _test_input_format_f_contiguous_pandas(queue, dtype): assert_allclose(expected, result) -# TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", get_queues()) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_f_contiguous_pandas(queue, dtype): + if queue and queue.sycl_device.is_gpu: + pytest.skip("Sporadic failures on GPU sycl_queue.") _test_input_format_f_contiguous_pandas(queue, dtype) diff --git a/onedal/ensemble/tests/test_random_forest.py b/onedal/ensemble/tests/test_random_forest.py index 2659481662..3a6937de31 100644 --- a/onedal/ensemble/tests/test_random_forest.py +++ b/onedal/ensemble/tests/test_random_forest.py @@ -37,10 +37,10 @@ def test_rf_classifier(queue): assert_allclose([1], rf.predict([[0, 0, 0, 0]], queue=queue)) -# TODO: -# fix RF regressor predict for the GPU sycl_queue. 
-@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", get_queues()) def test_rf_regression(queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("RF regressor predict for the GPU sycl_queue is buggy.") X, y = make_regression( n_samples=100, n_features=4, n_informative=2, random_state=0, shuffle=False ) @@ -79,11 +79,12 @@ def test_rf_classifier_random_splitter(queue): assert_allclose([1], rf.predict([[0, 0, 0, 0]], queue=queue)) -# TODO: -# fix RF regressor predict for the GPU sycl_queue. -@pytest.mark.skip(reason="fix RF regressor predict for the GPU sycl_queue") @pytest.mark.parametrize("queue", get_queues("gpu")) def test_rf_regression_random_splitter(queue): + # splitter_mode selection only for GPU enabled. + # For CPU only `best` mode is supported. + if queue and queue.sycl_device.is_gpu: + pytest.skip("RF regressor predict for the GPU sycl_queue is buggy.") X, y = make_regression( n_samples=100, n_features=4, n_informative=2, random_state=0, shuffle=False ) diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index 8941f49965..d29bdab345 100755 --- a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -31,9 +31,10 @@ def test_iris(queue): assert_array_equal(clf.classes_, np.sort(clf.classes_)) -# TODO: investigate failures on GPU -@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("KNN classifier pickling for the GPU sycl_queue is buggy.") iris = datasets.load_iris() clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) expected = clf.predict(iris.data, queue=queue) diff --git a/onedal/primitives/tests/test_kernel_functions.py b/onedal/primitives/tests/test_kernel_functions.py index 661f3b8698..22a8f562cb 100644 --- a/onedal/primitives/tests/test_kernel_functions.py +++ b/onedal/primitives/tests/test_kernel_functions.py @@ -26,9 +26,10 @@ ) -# TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", get_queues()) def test_dense_self_linear_kernel(queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("Linear kernel for the GPU sycl_queue is buggy.") rng = np.random.RandomState(0) X = np.array(5 * rng.random_sample((10, 4))) @@ -49,12 +50,13 @@ def _test_dense_small_linear_kernel(queue, scale, shift, dtype): assert_allclose(result, expected, rtol=tol) -# TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", get_queues()) @pytest.mark.parametrize("scale", [1.0, 2.0]) @pytest.mark.parametrize("shift", [0.0, 1.0]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_dense_small_linear_kernel(queue, scale, shift, dtype): + if queue and queue.sycl_device.is_gpu: + pytest.skip("Linear kernel for the GPU sycl_queue is buggy.") _test_dense_small_linear_kernel(queue, scale, shift, dtype) diff --git a/onedal/svm/tests/test_svc.py b/onedal/svm/tests/test_svc.py index acda4114cf..9f7eaa4810 100644 --- a/onedal/svm/tests/test_svc.py +++ b/onedal/svm/tests/test_svc.py @@ -42,11 +42,12 @@ def _test_libsvm_parameters(queue, array_constr, dtype): assert_array_equal(clf.predict(X), y) -# TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", 
get_queues()) @pytest.mark.parametrize("array_constr", [np.array]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_libsvm_parameters(queue, array_constr, dtype): + if queue and queue.sycl_device.is_gpu: + pytest.skip("Sporadic failures on GPU sycl_queue.") _test_libsvm_parameters(queue, array_constr, dtype) @@ -72,9 +73,10 @@ def test_class_weight(queue): assert_array_almost_equal(clf.predict(X, queue=queue), [2] * 6) -# TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("queue", get_queues()) def test_sample_weight(queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("Sporadic failures on GPU sycl_queue.") X = np.array([[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 2]]) y = np.array([1, 1, 1, 2, 2, 2]) diff --git a/onedal/tests/utils/_dataframes_support.py b/onedal/tests/utils/_dataframes_support.py index f6ffca4341..82da331bb8 100644 --- a/onedal/tests/utils/_dataframes_support.py +++ b/onedal/tests/utils/_dataframes_support.py @@ -59,6 +59,34 @@ def get_dataframes_and_queues( dataframe_filter_="numpy,pandas,dpnp,dpctl", device_filter_="cpu,gpu" ): + """Get supported dataframes for testing. + + This is meant to be used for testing purposes only. + + Parameters + ---------- + dataframe_filter_ : str, default="numpy,pandas,dpnp,dpctl" + Configure output pytest.params for certain dataframe formats. + device_filter_ : str, default="cpu,gpu" + Configure output pytest.params with a certain sycl queue for the dataframe, + where applicable. + + Returns + ------- + list[pytest.param] + The list of pytest params, including dataframe name (str), + sycl queue (if applicable for the test case), and test + case id (str). + + Notes + ----- + Do not use filters for disabling test cases. Use `pytest.skip` + or `pytest.xfail` instead. + + See Also + -------- + _convert_to_dataframe : Convert input object to a certain dataframe format. + """ dataframes_and_queues = [] if "numpy" in dataframe_filter_: @@ -95,6 +123,7 @@ def _as_numpy(obj, *args, **kwargs): def _convert_to_dataframe(obj, sycl_queue=None, target_df=None, *args, **kwargs): + """Convert input object to a certain dataframe format.""" if target_df is None: return obj elif target_df == "numpy": diff --git a/onedal/tests/utils/_device_selection.py b/onedal/tests/utils/_device_selection.py index dcc3236e88..931324806c 100644 --- a/onedal/tests/utils/_device_selection.py +++ b/onedal/tests/utils/_device_selection.py @@ -20,6 +20,25 @@ def get_queues(filter_="cpu,gpu"): + """Get available dpctl.SyclQueues for testing. + + This is meant to be used for testing purposes only. + + Parameters + ---------- + filter_ : str, default="cpu,gpu" + Configure output list with available dpctl.SyclQueues for testing. + + Returns + ------- + list[dpctl.SyclQueue] + The list of dpctl.SyclQueue. + + Notes + ----- + Do not use filters for disabling test cases. Use `pytest.skip` + or `pytest.xfail` instead. + """ queues = [None] if "cpu" in filter_ else [] try: diff --git a/sklearnex/ensemble/tests/test_forest.py b/sklearnex/ensemble/tests/test_forest.py index 80e0e1f61b..14cc8c000e 100644 --- a/sklearnex/ensemble/tests/test_forest.py +++ b/sklearnex/ensemble/tests/test_forest.py @@ -46,11 +46,10 @@ def test_sklearnex_import_rf_classifier(dataframe, queue): assert_allclose([1], _as_numpy(rf.predict([[0, 0, 0, 0]]))) -# TODO: fix RF regressor predict for the GPU sycl_queue.
-@pytest.mark.parametrize( - "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") -) +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) def test_sklearnex_import_rf_regression(dataframe, queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("RF regressor predict for the GPU sycl_queue is buggy.") from sklearnex.ensemble import RandomForestRegressor X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False) @@ -69,11 +68,10 @@ def test_sklearnex_import_rf_regression(dataframe, queue): assert_allclose([-6.839], pred, atol=1e-2) -# TODO: fix ET classifier predict for the GPU sycl_queue. -@pytest.mark.parametrize( - "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") -) +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) def test_sklearnex_import_et_classifier(dataframe, queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("ET classifier predict for the GPU sycl_queue is buggy.") from sklearnex.ensemble import ExtraTreesClassifier X, y = make_classification( @@ -93,11 +91,10 @@ def test_sklearnex_import_et_classifier(dataframe, queue): assert_allclose([1], _as_numpy(rf.predict([[0, 0, 0, 0]]))) -# TODO: fix ET regressor predict for the GPU sycl_queue. -@pytest.mark.parametrize( - "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") -) +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) def test_sklearnex_import_et_regression(dataframe, queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("ET regressor predict for the GPU sycl_queue is buggy.") from sklearnex.ensemble import ExtraTreesRegressor X, y = make_regression(n_features=1, random_state=0, shuffle=False) diff --git a/sklearnex/linear_model/tests/test_logreg.py b/sklearnex/linear_model/tests/test_logreg.py index 261a0f4b8f..d75913f645 100755 --- a/sklearnex/linear_model/tests/test_logreg.py +++ b/sklearnex/linear_model/tests/test_logreg.py @@ -38,8 +38,7 @@ def prepare_input(X, y, dataframe, queue): @pytest.mark.parametrize( - "dataframe,queue", - get_dataframes_and_queues(device_filter_="cpu"), + "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") ) def test_sklearnex_multiclass_classification(dataframe, queue): from sklearnex.linear_model import LogisticRegression diff --git a/sklearnex/svm/tests/test_svm.py b/sklearnex/svm/tests/test_svm.py index 603951f89a..f0d561744e 100755 --- a/sklearnex/svm/tests/test_svm.py +++ b/sklearnex/svm/tests/test_svm.py @@ -25,12 +25,10 @@ ) -# TODO: -# investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU` -@pytest.mark.parametrize( - "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") -) +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) def test_sklearnex_import_svc(dataframe, queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("SVC fit for the GPU sycl_queue is buggy.") from sklearnex.svm import SVC X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) @@ -43,12 +41,10 @@ def test_sklearnex_import_svc(dataframe, queue): assert_allclose(_as_numpy(svc.support_), [1, 3]) -# TODO: -# investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU` -@pytest.mark.parametrize( - "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") -) +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) def test_sklearnex_import_nusvc(dataframe, queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("NuSVC fit for the 
GPU sycl_queue is buggy.") from sklearnex.svm import NuSVC X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) @@ -63,12 +59,10 @@ def test_sklearnex_import_nusvc(dataframe, queue): assert_allclose(_as_numpy(svc.support_), [0, 1, 3, 4]) -# TODO: -# investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU` -@pytest.mark.parametrize( - "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") -) +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) def test_sklearnex_import_svr(dataframe, queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("SVR fit for the GPU sycl_queue is buggy.") from sklearnex.svm import SVR X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) @@ -81,12 +75,10 @@ def test_sklearnex_import_svr(dataframe, queue): assert_allclose(_as_numpy(svc.support_), [1, 3]) -# TODO: -# investigate failure for `dpnp.ndarrays` and `dpctl.tensors` on `GPU` -@pytest.mark.parametrize( - "dataframe,queue", get_dataframes_and_queues(device_filter_="cpu") -) +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) def test_sklearnex_import_nusvr(dataframe, queue): + if queue and queue.sycl_device.is_gpu: + pytest.skip("NuSVR fit for the GPU sycl_queue is buggy.") from sklearnex.svm import NuSVR X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) From 01def265ba59d7d4e1eb2e5944d938e274d1bde8 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Thu, 4 Jul 2024 18:26:22 +0100 Subject: [PATCH 55/75] Update dependency cmake to v3.30.0 (#1914) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- dependencies-dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies-dev b/dependencies-dev index 043e53ae71..f7bff516ac 100644 --- a/dependencies-dev +++ b/dependencies-dev @@ -3,4 +3,4 @@ Jinja2==3.1.4 numpy==1.19.5 ; python_version < '3.9' numpy==2.0.0 ; python_version >= '3.9' pybind11==2.13.1 -cmake==3.29.6 +cmake==3.30.0 From 68bdeab1f334d8f4770f7f86bbadaccccb348ba7 Mon Sep 17 00:00:00 2001 From: Samir Nasibli Date: Fri, 5 Jul 2024 15:29:39 +0200 Subject: [PATCH 56/75] FIX: fix for test_on_gold_data_unbiased (#1917) --- onedal/covariance/tests/test_incremental_covariance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/covariance/tests/test_incremental_covariance.py b/onedal/covariance/tests/test_incremental_covariance.py index 27cab31fbc..c7f04ca8f0 100644 --- a/onedal/covariance/tests/test_incremental_covariance.py +++ b/onedal/covariance/tests/test_incremental_covariance.py @@ -27,8 +27,8 @@ def test_on_gold_data_unbiased(queue, dtype): from onedal.covariance import IncrementalEmpiricalCovariance X = np.array([[0, 1], [0, 1]]) - X_split = np.array_split(X, 2) X = X.astype(dtype) + X_split = np.array_split(X, 2) inccov = IncrementalEmpiricalCovariance() for i in range(2): From aec758a21944c68e37a6eea86474189344267ee8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 7 Jul 2024 10:56:22 +0100 Subject: [PATCH 57/75] Bump certifi from 2024.2.2 to 2024.7.4 (#1920) Bumps [certifi](https://github.com/certifi/python-certifi) from 2024.2.2 to 2024.7.4. - [Commits](https://github.com/certifi/python-certifi/compare/2024.02.02...2024.07.04) --- updated-dependencies: - dependency-name: certifi dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-doc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-doc.txt b/requirements-doc.txt index 17abb1a46c..a001cd9c5b 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -5,7 +5,7 @@ Babel==2.14.0 backcall==0.2.0 beautifulsoup4==4.12.3 bleach==6.1.0 -certifi==2024.2.2 +certifi==2024.7.4 charset-normalizer==3.3.2 click==8.1.7 decorator==5.1.1 From c44e9f591141649d94407a231968cb6981491010 Mon Sep 17 00:00:00 2001 From: olegkkruglov <102592747+olegkkruglov@users.noreply.github.com> Date: Mon, 8 Jul 2024 02:54:08 -0700 Subject: [PATCH 58/75] ENH: `BasicStatistics` API change (#1644) * Refactored and changed interface for BasicStatistics * Extend tests for BasicStatistics --- examples/sklearnex/basic_statistics_spmd.py | 6 +- onedal/basic_statistics/basic_statistics.cpp | 79 ++-- onedal/basic_statistics/basic_statistics.py | 93 ++-- .../incremental_basic_statistics.py | 56 +-- .../tests/test_basic_statistics.py | 400 ++++++++++++------ .../test_incremental_basic_statistics.py | 64 +-- onedal/cluster/kmeans.py | 3 +- .../spmd/basic_statistics/basic_statistics.py | 6 + .../basic_statistics/basic_statistics.py | 117 ++++- .../tests/test_basic_statistics.py | 251 +++++++++++ .../test_incremental_basic_statistics.py | 2 +- 11 files changed, 747 insertions(+), 330 deletions(-) create mode 100644 sklearnex/basic_statistics/tests/test_basic_statistics.py diff --git a/examples/sklearnex/basic_statistics_spmd.py b/examples/sklearnex/basic_statistics_spmd.py index 29864aea62..909c842cb9 100644 --- a/examples/sklearnex/basic_statistics_spmd.py +++ b/examples/sklearnex/basic_statistics_spmd.py @@ -58,7 +58,7 @@ def generate_data(par, size, seed=777): gtr_std = np.std(weighted_data, axis=0) bss = BasicStatisticsSpmd(["mean", "standard_deviation"]) -res = bss.compute(dpt_data, dpt_weights) +bss.fit(dpt_data, dpt_weights) -print(f"Computed mean on rank {rank}:\n", res["mean"]) -print(f"Computed std on rank {rank}:\n", res["standard_deviation"]) +print(f"Computed mean on rank {rank}:\n", bss.mean) +print(f"Computed std on rank {rank}:\n", bss.standard_deviation) diff --git a/onedal/basic_statistics/basic_statistics.cpp b/onedal/basic_statistics/basic_statistics.cpp index 35805a78ac..80a35dc17e 100644 --- a/onedal/basic_statistics/basic_statistics.cpp +++ b/onedal/basic_statistics/basic_statistics.cpp @@ -129,26 +129,21 @@ struct params2desc_incremental { }; template -struct init_compute_ops_dispatcher {}; - -template -struct init_compute_ops_dispatcher { - void operator()(py::module_& m) { - using Task = dal::basic_statistics::task::compute; - - m.def("train", - [](const Policy& policy, - const py::dict& params, - const table& data, - const table& weights) { - using namespace dal::basic_statistics; - using input_t = compute_input; - - compute_ops ops(policy, input_t{ data, weights }, params2desc{}); - return fptype2t{ method2t{ Task{}, ops } }(params); - }); - } -}; +void init_compute_ops(py::module& m) { + m.def("compute", []( + const Policy& policy, + const py::dict& params, + const table& data, + const table& weights) { + using namespace dal::basic_statistics; + using input_t = compute_input; + + compute_ops ops(policy, input_t{ data, weights }, params2desc{}); + return fptype2t{ method2t{ Task{}, ops } }(params); + } + ); +} + template void init_partial_compute_ops(py::module& m) { @@ -177,28 +172,23 @@ void 
init_finalize_compute_ops(pybind11::module_& m) { }); } -template -void init_compute_ops(py::module& m) { - init_compute_ops_dispatcher{}(m); -} - template void init_compute_result(py::module_& m) { using namespace dal::basic_statistics; using result_t = compute_result; - auto cls = py::class_(m, "compute_result") - .def(py::init()) - .DEF_ONEDAL_PY_PROPERTY(min, result_t) - .DEF_ONEDAL_PY_PROPERTY(max, result_t) - .DEF_ONEDAL_PY_PROPERTY(sum, result_t) - .DEF_ONEDAL_PY_PROPERTY(mean, result_t) - .DEF_ONEDAL_PY_PROPERTY(variance, result_t) - .DEF_ONEDAL_PY_PROPERTY(variation, result_t) - .DEF_ONEDAL_PY_PROPERTY(sum_squares, result_t) - .DEF_ONEDAL_PY_PROPERTY(standard_deviation, result_t) - .DEF_ONEDAL_PY_PROPERTY(sum_squares_centered, result_t) - .DEF_ONEDAL_PY_PROPERTY(second_order_raw_moment, result_t); + py::class_(m, "compute_result") + .def(py::init()) + .DEF_ONEDAL_PY_PROPERTY(min, result_t) + .DEF_ONEDAL_PY_PROPERTY(max, result_t) + .DEF_ONEDAL_PY_PROPERTY(sum, result_t) + .DEF_ONEDAL_PY_PROPERTY(mean, result_t) + .DEF_ONEDAL_PY_PROPERTY(variance, result_t) + .DEF_ONEDAL_PY_PROPERTY(variation, result_t) + .DEF_ONEDAL_PY_PROPERTY(sum_squares, result_t) + .DEF_ONEDAL_PY_PROPERTY(standard_deviation, result_t) + .DEF_ONEDAL_PY_PROPERTY(sum_squares_centered, result_t) + .DEF_ONEDAL_PY_PROPERTY(second_order_raw_moment, result_t); } template @@ -230,21 +220,18 @@ ONEDAL_PY_INIT_MODULE(basic_statistics) { using namespace dal::basic_statistics; auto sub = m.def_submodule("basic_statistics"); - using task_list = types; #ifdef ONEDAL_DATA_PARALLEL_SPMD - ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_spmd, task_list); + ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_spmd, task::compute); #else // ONEDAL_DATA_PARALLEL_SPMD - ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_list, task_list); - ONEDAL_PY_INSTANTIATE(init_partial_compute_ops, sub, policy_list, task_list); - ONEDAL_PY_INSTANTIATE(init_finalize_compute_ops, sub, policy_list, task_list); - ONEDAL_PY_INSTANTIATE(init_compute_result, sub, task_list); - ONEDAL_PY_INSTANTIATE(init_partial_compute_result, sub, task_list); + ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_list, task::compute); + ONEDAL_PY_INSTANTIATE(init_partial_compute_ops, sub, policy_list, task::compute); + ONEDAL_PY_INSTANTIATE(init_finalize_compute_ops, sub, policy_list, task::compute); + ONEDAL_PY_INSTANTIATE(init_compute_result, sub, task::compute); + ONEDAL_PY_INSTANTIATE(init_partial_compute_result, sub, task::compute); #endif // ONEDAL_DATA_PARALLEL_SPMD } -ONEDAL_PY_TYPE2STR(dal::basic_statistics::task::compute, "compute"); - #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230100 } // namespace oneapi::dal::python diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index c1c121d932..4430338044 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -14,19 +14,18 @@ # limitations under the License. 
# ============================================================================== +import warnings from abc import ABCMeta, abstractmethod -from numbers import Number import numpy as np -from onedal import _backend - from ..common._base import BaseEstimator from ..datatypes import _convert_to_supported, from_table, to_table from ..utils import _is_csr +from ..utils.validation import _check_array -class BaseBasicStatistics(metaclass=ABCMeta): +class BaseBasicStatistics(BaseEstimator, metaclass=ABCMeta): @abstractmethod def __init__(self, result_options, algorithm): self.options = result_options @@ -63,59 +62,67 @@ def _get_onedal_params(self, is_csr, dtype=np.float32): "result_option": options, } - def _compute_raw( - self, data_table, weights_table, module, policy, dtype=np.float32, is_csr=False - ): - params = self._get_onedal_params(is_csr, dtype) - - result = module.train(policy, params, data_table, weights_table) - options = self._get_result_options(self.options) - options = options.split("|") +class BasicStatistics(BaseBasicStatistics): + """ + Basic Statistics oneDAL implementation. + """ - return {opt: getattr(result, opt) for opt in options} + def __init__(self, result_options="all", algorithm="by_default"): + super().__init__(result_options, algorithm) - def _compute(self, data, weights, module, queue): - policy = self._get_policy(queue, data, weights) + def fit(self, data, sample_weight=None, queue=None): + policy = self._get_policy(queue, data, sample_weight) is_csr = _is_csr(data) - if not (data is None) and not is_csr: - data = np.asarray(data) - if not (weights is None): - weights = np.asarray(weights) + if data is not None and not is_csr: + data = _check_array(data, ensure_2d=False) + if sample_weight is not None: + sample_weight = _check_array(sample_weight, ensure_2d=False) - data, weights = _convert_to_supported(policy, data, weights) - - data_table, weights_table = to_table(data, weights) + data, sample_weight = _convert_to_supported(policy, data, sample_weight) + is_single_dim = data.ndim == 1 + data_table, weights_table = to_table(data, sample_weight) dtype = data.dtype - res = self._compute_raw(data_table, weights_table, module, policy, dtype, is_csr) + raw_result = self._compute_raw(data_table, weights_table, policy, dtype, is_csr) + for opt, raw_value in raw_result.items(): + value = from_table(raw_value).ravel() + if is_single_dim: + setattr(self, opt, value[0]) + else: + setattr(self, opt, value) - return {k: from_table(v).ravel() for k, v in res.items()} + return self + + def compute(self, data, weights=None, queue=None): + warnings.warn( + "Method `compute` was deprecated in version 2024.7 and will be " + "removed in 2025.0. Use `fit` instead." + ) + is_csr = _is_csr(data) -class BasicStatistics(BaseEstimator, BaseBasicStatistics): - """ - Basic Statistics oneDAL implementation. 
- """ + if data is not None: + data = _check_array(data, ensure_2d=False) + if weights is not None: + weights = _check_array(weights, ensure_2d=False) - def __init__(self, result_options="all", *, algorithm="by_default", **kwargs): - super().__init__(result_options, algorithm) + policy = self._get_policy(queue, data, weights) + data, weights = _convert_to_supported(policy, data, weights) + data_table, weights_table = to_table(data, weights) + dtype = data.dtype + res = self._compute_raw(data_table, weights_table, policy, dtype, is_csr) - def compute(self, data, weights=None, queue=None): - return super()._compute( - data, weights, self._get_backend("basic_statistics", "compute", None), queue - ) + return {k: from_table(v).ravel() for k, v in res.items()} - def compute_raw( + def _compute_raw( self, data_table, weights_table, policy, dtype=np.float32, is_csr=False ): - return super()._compute_raw( - data_table, - weights_table, - self._get_backend("basic_statistics", "compute", None), - policy, - dtype, - is_csr, - ) + module = self._get_backend("basic_statistics") + params = self._get_onedal_params(is_csr, dtype) + result = module.compute(policy, params, data_table, weights_table) + options = self._get_result_options(self.options).split("|") + + return {opt: getattr(result, opt) for opt in options} diff --git a/onedal/basic_statistics/incremental_basic_statistics.py b/onedal/basic_statistics/incremental_basic_statistics.py index eb77625628..cbc7019321 100644 --- a/onedal/basic_statistics/incremental_basic_statistics.py +++ b/onedal/basic_statistics/incremental_basic_statistics.py @@ -14,56 +14,12 @@ # limitations under the License. # ============================================================================== -from abc import ABCMeta, abstractmethod - import numpy as np from daal4py.sklearn._utils import get_dtype -from onedal import _backend -from ..common._policy import _get_policy from ..datatypes import _convert_to_supported, from_table, to_table - - -class BaseBasicStatistics(metaclass=ABCMeta): - @abstractmethod - def __init__(self, result_options, algorithm): - self.options = result_options - self.algorithm = algorithm - - @staticmethod - def get_all_result_options(): - return [ - "min", - "max", - "sum", - "mean", - "variance", - "variation", - "sum_squares", - "standard_deviation", - "sum_squares_centered", - "second_order_raw_moment", - ] - - def _get_policy(self, queue, *data): - return _get_policy(queue, *data) - - def _get_result_options(self, options): - if options == "all": - options = self.get_all_result_options() - if isinstance(options, list): - options = "|".join(options) - assert isinstance(options, str) - return options - - def _get_onedal_params(self, dtype=np.float32): - options = self._get_result_options(self.options) - return { - "fptype": "float" if dtype == np.float32 else "double", - "method": self.algorithm, - "result_option": options, - } +from .basic_statistics import BaseBasicStatistics class IncrementalBasicStatistics(BaseBasicStatistics): @@ -110,11 +66,11 @@ class IncrementalBasicStatistics(BaseBasicStatistics): def __init__(self, result_options="all"): super().__init__(result_options, algorithm="by_default") - module = _backend.basic_statistics.compute + module = self._get_backend("basic_statistics") self._partial_result = module.partial_compute_result() def _reset(self): - module = _backend.basic_statistics.compute + module = self._get_backend("basic_statistics") self._partial_result = module.partial_train_result() def partial_fit(self, X, 
weights=None, queue=None): @@ -146,7 +102,8 @@ def partial_fit(self, X, weights=None, queue=None): self._onedal_params = self._get_onedal_params(dtype) X_table, weights_table = to_table(X, weights) - self._partial_result = _backend.basic_statistics.compute.partial_compute( + module = self._get_backend("basic_statistics") + self._partial_result = module.partial_compute( self._policy, self._onedal_params, self._partial_result, @@ -169,7 +126,8 @@ def finalize_fit(self, queue=None): self : object Returns the instance itself. """ - result = _backend.basic_statistics.compute.finalize_compute( + module = self._get_backend("basic_statistics") + result = module.finalize_compute( self._policy, self._onedal_params, self._partial_result ) options = self._get_result_options(self.options).split("|") diff --git a/onedal/basic_statistics/tests/test_basic_statistics.py b/onedal/basic_statistics/tests/test_basic_statistics.py index ff373bb92c..f7e14bc1de 100644 --- a/onedal/basic_statistics/tests/test_basic_statistics.py +++ b/onedal/basic_statistics/tests/test_basic_statistics.py @@ -14,155 +14,301 @@ # limitations under the License. # ============================================================================== +import numpy as np +import pytest +from numpy.testing import assert_allclose from scipy import sparse as sp from daal4py.sklearn._utils import daal_check_version +from onedal.basic_statistics import BasicStatistics +from onedal.tests.utils._device_selection import get_queues -if daal_check_version((2023, "P", 100)): - import numpy as np - import pytest - from numpy.testing import assert_allclose - - from onedal.basic_statistics import BasicStatistics - from onedal.tests.utils._device_selection import get_queues - - options_and_tests = [ - ("sum", np.sum, (1e-5, 1e-7)), - ("min", np.min, (1e-5, 1e-7)), - ("max", np.max, (1e-5, 1e-7)), - ("mean", np.mean, (1e-5, 1e-7)), - ("standard_deviation", np.std, (3e-5, 3e-5)), - ] - - options_and_tests_csr = [ - ("sum", "sum", (5e-6, 1e-9)), - ("min", "min", (0, 0)), - # There is a bug in oneDAL's max computations on GPU - # ("max", "max", (0, 0)), - ("mean", "mean", (5e-6, 1e-9)), - ] - - @pytest.mark.parametrize("queue", get_queues()) - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) - def test_basic_uniform(queue, dtype): - seed = 42 - s_count, f_count = 70000, 29 - - gen = np.random.default_rng(seed) - data = gen.uniform(low=-0.5, high=+0.6, size=(s_count, f_count)) - data = data.astype(dtype=dtype) - - alg = BasicStatistics(result_options="mean") - res = alg.compute(data, queue=queue) - - res_mean = res["mean"] - gtr_mean = np.mean(data, axis=0) - tol = 2e-5 if res_mean.dtype == np.float32 else 1e-7 - assert_allclose(gtr_mean, res_mean, rtol=tol) - - @pytest.mark.parametrize("queue", get_queues()) - @pytest.mark.parametrize("option", options_and_tests) - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) - def test_option_uniform(queue, option, dtype): - seed = 77 - s_count, f_count = 19999, 31 - result_option, function, tols = option - fp32tol, fp64tol = tols +def expected_sum(X): + return np.sum(X, axis=0) - gen = np.random.default_rng(seed) - data = gen.uniform(low=-0.3, high=+0.7, size=(s_count, f_count)) - data = data.astype(dtype=dtype) - alg = BasicStatistics(result_options=result_option) - res = alg.compute(data, queue=queue) +def expected_max(X): + return np.max(X, axis=0) - res, gtr = res[result_option], function(data, axis=0) - tol = fp32tol if res.dtype == np.float32 else fp64tol - assert_allclose(gtr, res, rtol=tol) 
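The refactor in this patch replaces the dictionary-returning `compute` with a scikit-learn-style `fit` that stores each requested statistic as an attribute, keeping `compute` only as a deprecated entry point. A minimal usage sketch of the new interface, using made-up data rather than anything taken from the patch, could look like this:

    import numpy as np
    from onedal.basic_statistics import BasicStatistics

    X = np.random.default_rng(0).uniform(size=(100, 5))

    # New interface: fit() computes the requested options and exposes them as attributes.
    bs = BasicStatistics(result_options=["mean", "variance"]).fit(X)
    print(bs.mean, bs.variance)

    # Old interface: compute() still returns a dict keyed by result option,
    # but now emits a deprecation warning pointing to fit().
    res = BasicStatistics(result_options="mean").compute(X)
    print(res["mean"])

The rewritten tests below exercise this attribute-based access pattern.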
+def expected_min(X): + return np.min(X, axis=0) - @pytest.mark.parametrize("queue", get_queues()) - @pytest.mark.parametrize("option", options_and_tests) - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) - def test_option_weighted(queue, option, dtype): - seed = 999 - s_count, f_count = 1024, 127 - result_option, function, tols = option - fp32tol, fp64tol = tols - fp32tol, fp64tol = 30 * fp32tol, 50 * fp64tol +def expected_mean(X): + return np.mean(X, axis=0) + + +def expected_standard_deviation(X): + return np.std(X, axis=0) + + +def expected_variance(X): + return np.var(X, axis=0) + + +def expected_variation(X): + return expected_standard_deviation(X) / expected_mean(X) + + +def expected_sum_squares(X): + return np.sum(np.square(X), axis=0) + + +def expected_sum_squares_centered(X): + return np.sum(np.square(X - expected_mean(X)), axis=0) - gen = np.random.default_rng(seed) - data = gen.uniform(low=-5.0, high=+9.0, size=(s_count, f_count)) - weights = gen.uniform(low=-0.5, high=+1.0, size=s_count) - data = data.astype(dtype=dtype) +def expected_standard_deviation(X): + return np.sqrt(expected_variance(X)) + + +def expected_second_order_raw_moment(X): + return np.mean(np.square(X), axis=0) + + +options_and_tests = [ + ("sum", expected_sum, (5e-4, 1e-7)), + ("min", expected_min, (1e-7, 1e-7)), + ("max", expected_max, (1e-7, 1e-7)), + ("mean", expected_mean, (5e-7, 1e-7)), + ("variance", expected_variance, (2e-3, 2e-3)), + ("variation", expected_variation, (5e-2, 5e-2)), + ("sum_squares", expected_sum_squares, (2e-4, 1e-7)), + ("sum_squares_centered", expected_sum_squares_centered, (2e-4, 1e-7)), + ("standard_deviation", expected_standard_deviation, (2e-3, 2e-3)), + ("second_order_raw_moment", expected_second_order_raw_moment, (1e-6, 1e-7)), +] + +options_and_tests_csr = [ + ("sum", "sum", (5e-6, 1e-9)), + ("min", "min", (0, 0)), + ("max", "max", (0, 0)), + ("mean", "mean", (5e-6, 1e-9)), +] + + +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("option", options_and_tests) +@pytest.mark.parametrize("row_count", [100, 1000]) +@pytest.mark.parametrize("column_count", [10, 100]) +@pytest.mark.parametrize("weighted", [True, False]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_single_option_on_random_data( + queue, option, row_count, column_count, weighted, dtype +): + result_option, function, tols = option + fp32tol, fp64tol = tols + seed = 77 + gen = np.random.default_rng(seed) + data = gen.uniform(low=-0.3, high=+0.7, size=(row_count, column_count)) + data = data.astype(dtype=dtype) + if weighted: + weights = gen.uniform(low=-0.5, high=+1.0, size=row_count) weights = weights.astype(dtype=dtype) + else: + weights = None - alg = BasicStatistics(result_options=result_option) - res = alg.compute(data, weights, queue=queue) + basicstat = BasicStatistics(result_options=result_option) - weighted = np.diag(weights) @ data - res, gtr = res[result_option], function(weighted, axis=0) + result = basicstat.fit(data, sample_weight=weights, queue=queue) - tol = fp32tol if res.dtype == np.float32 else fp64tol - assert_allclose(gtr, res, rtol=tol) - - @pytest.mark.skipif(not hasattr(sp, "random_array"), reason="requires scipy>=1.12.0") - @pytest.mark.parametrize("queue", get_queues()) - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) - def test_basic_csr(queue, dtype): - seed = 42 - s_count, f_count = 5000, 3008 - - gen = np.random.default_rng(seed) - - data = sp.random_array( - shape=(s_count, f_count), - density=0.01, - 
format="csr", - dtype=dtype, - random_state=gen, - ) + res = getattr(result, result_option) + if weighted: + weighted_data = np.diag(weights) @ data + gtr = function(weighted_data) + else: + gtr = function(data) + + tol = fp32tol if res.dtype == np.float32 else fp64tol + assert_allclose(gtr, res, atol=tol) + + +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("row_count", [100, 1000]) +@pytest.mark.parametrize("column_count", [10, 100]) +@pytest.mark.parametrize("weighted", [True, False]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_multiple_options_on_random_data(queue, row_count, column_count, weighted, dtype): + seed = 42 + gen = np.random.default_rng(seed) + data = gen.uniform(low=-0.3, high=+0.7, size=(row_count, column_count)) + data = data.astype(dtype=dtype) + + if weighted: + weights = gen.uniform(low=-0.5, high=+1.0, size=row_count) + weights = weights.astype(dtype=dtype) + else: + weights = None + + basicstat = BasicStatistics(result_options=["mean", "max", "sum"]) - alg = BasicStatistics(result_options="mean") - res = alg.compute(data, queue=queue) - - res_mean = res["mean"] - gtr_mean = data.mean(axis=0) - tol = 5e-6 if res_mean.dtype == np.float32 else 1e-9 - assert_allclose(gtr_mean, res_mean, rtol=tol) - - @pytest.mark.skipif(not hasattr(sp, "random_array"), reason="requires scipy>=1.12.0") - @pytest.mark.parametrize("queue", get_queues()) - @pytest.mark.parametrize("option", options_and_tests_csr) - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) - def test_options_csr(queue, option, dtype): - seed = 42 - s_count, f_count = 20046, 4007 - - gen = np.random.default_rng(seed) - - data = sp.random_array( - shape=(s_count, f_count), - density=0.002, - format="csr", - dtype=dtype, - random_state=gen, + result = basicstat.fit(data, sample_weight=weights, queue=queue) + + res_mean, res_max, res_sum = result.mean, result.max, result.sum + if weighted: + weighted_data = np.diag(weights) @ data + gtr_mean, gtr_max, gtr_sum = ( + expected_mean(weighted_data), + expected_max(weighted_data), + expected_sum(weighted_data), + ) + else: + gtr_mean, gtr_max, gtr_sum = ( + expected_mean(data), + expected_max(data), + expected_sum(data), ) + tol = 5e-4 if res_mean.dtype == np.float32 else 1e-7 + assert_allclose(gtr_mean, res_mean, atol=tol) + assert_allclose(gtr_max, res_max, atol=tol) + assert_allclose(gtr_sum, res_sum, atol=tol) + + +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("row_count", [100, 1000]) +@pytest.mark.parametrize("column_count", [10, 100]) +@pytest.mark.parametrize("weighted", [True, False]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_all_option_on_random_data(queue, row_count, column_count, weighted, dtype): + seed = 77 + gen = np.random.default_rng(seed) + data = gen.uniform(low=-0.3, high=+0.7, size=(row_count, column_count)) + data = data.astype(dtype=dtype) + if weighted: + weights = gen.uniform(low=-0.5, high=+1.0, size=row_count) + weights = weights.astype(dtype=dtype) + else: + weights = None + + basicstat = BasicStatistics(result_options="all") + + result = basicstat.fit(data, sample_weight=weights, queue=queue) + + if weighted: + weighted_data = np.diag(weights) @ data + + for option in options_and_tests: result_option, function, tols = option fp32tol, fp64tol = tols + res = getattr(result, result_option) + if weighted: + gtr = function(weighted_data) + else: + gtr = function(data) + tol = fp32tol if res.dtype == np.float32 else fp64tol + 
assert_allclose(gtr, res, atol=tol) + + +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("option", options_and_tests) +@pytest.mark.parametrize("data_size", [100, 1000]) +@pytest.mark.parametrize("weighted", [True, False]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_1d_input_on_random_data(queue, option, data_size, weighted, dtype): + result_option, function, tols = option + fp32tol, fp64tol = tols + seed = 77 + gen = np.random.default_rng(seed) + data = gen.uniform(low=-0.3, high=+0.7, size=data_size) + data = data.astype(dtype=dtype) + if weighted: + weights = gen.uniform(low=-0.5, high=+1.0, size=data_size) + weights = weights.astype(dtype=dtype) + else: + weights = None - alg = BasicStatistics(result_options=result_option) - res = alg.compute(data, queue=queue) + basicstat = BasicStatistics(result_options=result_option) - res = res[result_option] - func = getattr(data, function) - gtr = func(axis=0) - if type(gtr).__name__ != "ndarray": - gtr = gtr.toarray().flatten() - tol = fp32tol if res.dtype == np.float32 else fp64tol + result = basicstat.fit(data, sample_weight=weights, queue=queue) + + res = getattr(result, result_option) + if weighted: + weighted_data = weights * data + gtr = function(weighted_data) + else: + gtr = function(data) + + tol = fp32tol if res.dtype == np.float32 else fp64tol + assert_allclose(gtr, res, atol=tol) + + +@pytest.mark.skipif(not hasattr(sp, "random_array"), reason="requires scipy>=1.12.0") +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_basic_csr(queue, dtype): + seed = 42 + row_count, column_count = 5000, 3008 + + gen = np.random.default_rng(seed) + + data = sp.random_array( + shape=(row_count, column_count), + density=0.01, + format="csr", + dtype=dtype, + random_state=gen, + ) + + basicstat = BasicStatistics(result_options="mean") + result = basicstat.fit(data, queue=queue) + + res_mean = result.mean + gtr_mean = data.mean(axis=0) + tol = 5e-6 if res_mean.dtype == np.float32 else 1e-9 + assert_allclose(gtr_mean, res_mean, rtol=tol) + + +@pytest.mark.skipif(not hasattr(sp, "random_array"), reason="requires scipy>=1.12.0") +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("option", options_and_tests_csr) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_options_csr(queue, option, dtype): + result_option, function, tols = option + fp32tol, fp64tol = tols + + if result_option == "max": + pytest.skip("There is a bug in oneDAL's max computations on GPU") + + seed = 42 + row_count, column_count = 20046, 4007 + + gen = np.random.default_rng(seed) + + data = sp.random_array( + shape=(row_count, column_count), + density=0.002, + format="csr", + dtype=dtype, + random_state=gen, + ) + + basicstat = BasicStatistics(result_options=result_option) + result = basicstat.fit(data, queue=queue) + + res = getattr(result, result_option) + func = getattr(data, function) + gtr = func(axis=0) + if type(gtr).__name__ != "ndarray": + gtr = gtr.toarray().flatten() + tol = fp32tol if res.dtype == np.float32 else fp64tol + + assert_allclose(gtr, res, rtol=tol) + + +def test_warning(): + basicstat = BasicStatistics() + data = np.array([0, 1]) + + with pytest.warns( + UserWarning, + match="Method `compute` was deprecated in version 2024.7 and will be removed in 2025.0. 
Use `fit` instead.", + ) as warn_record: + basicstat.compute(data) - assert_allclose(gtr, res, rtol=tol) + if daal_check_version((2025, "P", 0)): + assert len(warn_record) == 0 + else: + assert len(warn_record) == 1 diff --git a/onedal/basic_statistics/tests/test_incremental_basic_statistics.py b/onedal/basic_statistics/tests/test_incremental_basic_statistics.py index ba46d5bbd5..2050630e49 100644 --- a/onedal/basic_statistics/tests/test_incremental_basic_statistics.py +++ b/onedal/basic_statistics/tests/test_incremental_basic_statistics.py @@ -19,67 +19,15 @@ from numpy.testing import assert_allclose from onedal.basic_statistics import IncrementalBasicStatistics +from onedal.basic_statistics.tests.test_basic_statistics import ( + expected_max, + expected_mean, + expected_sum, + options_and_tests, +) from onedal.tests.utils._device_selection import get_queues -def expected_sum(X): - return np.sum(X, axis=0) - - -def expected_max(X): - return np.max(X, axis=0) - - -def expected_min(X): - return np.min(X, axis=0) - - -def expected_mean(X): - return np.mean(X, axis=0) - - -def expected_standard_deviation(X): - return np.std(X, axis=0) - - -def expected_variance(X): - return np.var(X, axis=0) - - -def expected_variation(X): - return expected_standard_deviation(X) / expected_mean(X) - - -def expected_sum_squares(X): - return np.sum(np.square(X), axis=0) - - -def expected_sum_squares_centered(X): - return np.sum(np.square(X - expected_mean(X)), axis=0) - - -def expected_standard_deviation(X): - return np.sqrt(expected_variance(X)) - - -def expected_second_order_raw_moment(X): - return np.mean(np.square(X), axis=0) - - -options_and_tests = [ - ("sum", expected_sum, (3e-4, 1e-7)), - ("min", expected_min, (1e-7, 1e-7)), - ("max", expected_max, (1e-7, 1e-7)), - ("mean", expected_mean, (3e-7, 1e-7)), - ("variance", expected_variance, (2e-3, 2e-3)), - ("variation", expected_variation, (5e-2, 5e-2)), - ("sum_squares", expected_sum_squares, (2e-4, 1e-7)), - ("sum_squares_centered", expected_sum_squares_centered, (2e-4, 1e-7)), - ("standard_deviation", expected_standard_deviation, (2e-3, 2e-3)), - ("second_order_raw_moment", expected_second_order_raw_moment, (1e-6, 1e-7)), -] - - @pytest.mark.parametrize("queue", get_queues()) @pytest.mark.parametrize("weighted", [True, False]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 8def0d2234..6193f65a13 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -90,8 +90,7 @@ def _tolerance(self, rtol, X_table, policy, dtype=np.float32): is_sparse = False dummy = to_table(None) bs = self._get_basic_statistics_backend("variance") - - res = bs.compute_raw(X_table, dummy, policy, dtype, is_sparse) + res = bs._compute_raw(X_table, dummy, policy, dtype, is_sparse) mean_var = from_table(res["variance"]).mean() return mean_var * rtol diff --git a/onedal/spmd/basic_statistics/basic_statistics.py b/onedal/spmd/basic_statistics/basic_statistics.py index 27e37b1abc..8103c570b5 100644 --- a/onedal/spmd/basic_statistics/basic_statistics.py +++ b/onedal/spmd/basic_statistics/basic_statistics.py @@ -14,6 +14,8 @@ # limitations under the License. 
# ============================================================================== +import warnings + from onedal.basic_statistics import BasicStatistics as BasicStatistics_Batch from ..._device_offload import support_usm_ndarray @@ -24,3 +26,7 @@ class BasicStatistics(BaseEstimatorSPMD, BasicStatistics_Batch): @support_usm_ndarray() def compute(self, data, weights=None, queue=None): return super().compute(data, weights=weights, queue=queue) + + @support_usm_ndarray() + def fit(self, data, sample_weight=None, queue=None): + return super().fit(data, sample_weight=sample_weight, queue=queue) diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py index 08be4eb6ff..b48b3c24ec 100644 --- a/sklearnex/basic_statistics/basic_statistics.py +++ b/sklearnex/basic_statistics/basic_statistics.py @@ -14,4 +14,119 @@ # limitations under the License. # ============================================================================== -from onedal.basic_statistics import BasicStatistics +import numpy as np +from sklearn.base import BaseEstimator + +from daal4py.sklearn._n_jobs_support import control_n_jobs +from onedal.basic_statistics import BasicStatistics as onedal_BasicStatistics + +from .._device_offload import dispatch +from .._utils import PatchingConditionsChain + + +@control_n_jobs(decorated_methods=["fit"]) +class BasicStatistics(BaseEstimator): + """ + Estimator for basic statistics. + Allows to compute basic statistics for provided data. + Parameters + ---------- + result_options: string or list, default='all' + List of statistics to compute + + Attributes (are existing only if corresponding result option exists) + ---------- + min : ndarray of shape (n_features,) + Minimum of each feature over all samples. + max : ndarray of shape (n_features,) + Maximum of each feature over all samples. + sum : ndarray of shape (n_features,) + Sum of each feature over all samples. + mean : ndarray of shape (n_features,) + Mean of each feature over all samples. + variance : ndarray of shape (n_features,) + Variance of each feature over all samples. + variation : ndarray of shape (n_features,) + Variation of each feature over all samples. + sum_squares : ndarray of shape (n_features,) + Sum of squares for each feature over all samples. + standard_deviation : ndarray of shape (n_features,) + Standard deviation of each feature over all samples. + sum_squares_centered : ndarray of shape (n_features,) + Centered sum of squares for each feature over all samples. + second_order_raw_moment : ndarray of shape (n_features,) + Second order moment of each feature over all samples. 
+ """ + + def __init__(self, result_options="all"): + self.options = result_options + + _onedal_basic_statistics = staticmethod(onedal_BasicStatistics) + + def _save_attributes(self): + assert hasattr(self, "_onedal_estimator") + + if self.options == "all": + result_options = onedal_BasicStatistics.get_all_result_options() + else: + result_options = self.options + + if isinstance(result_options, str): + setattr(self, result_options, getattr(self._onedal_estimator, result_options)) + elif isinstance(result_options, list): + for option in result_options: + setattr(self, option, getattr(self._onedal_estimator, option)) + + def _onedal_supported(self, method_name, *data): + patching_status = PatchingConditionsChain( + f"sklearnex.basic_statistics.{self.__class__.__name__}.{method_name}" + ) + return patching_status + + _onedal_cpu_supported = _onedal_supported + _onedal_gpu_supported = _onedal_supported + + def _onedal_fit(self, X, sample_weight=None, queue=None): + onedal_params = { + "result_options": self.options, + } + + if not hasattr(self, "_onedal_estimator"): + self._onedal_estimator = self._onedal_basic_statistics(**onedal_params) + self._onedal_estimator.fit(X, sample_weight, queue) + self._save_attributes() + + def compute(self, data, weights=None, queue=None): + return self._onedal_estimator.compute(data, weights, queue) + + def fit(self, X, y=None, *, sample_weight=None): + """Compute statistics with X, using minibatches of size batch_size. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data for compute, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Weights for compute weighted statistics, where `n_samples` is the number of samples. + + Returns + ------- + self : object + Returns the instance itself. + """ + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": None, + }, + X, + sample_weight, + ) + return self diff --git a/sklearnex/basic_statistics/tests/test_basic_statistics.py b/sklearnex/basic_statistics/tests/test_basic_statistics.py new file mode 100644 index 0000000000..8abbd6db1d --- /dev/null +++ b/sklearnex/basic_statistics/tests/test_basic_statistics.py @@ -0,0 +1,251 @@ +# ============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from onedal.basic_statistics.tests.test_basic_statistics import ( + expected_max, + expected_mean, + expected_sum, + options_and_tests, +) +from onedal.tests.utils._dataframes_support import ( + _convert_to_dataframe, + get_dataframes_and_queues, +) +from sklearnex.basic_statistics import BasicStatistics + + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import_basic_statistics(dataframe, queue): + X = np.array([[0, 0], [1, 1]]) + X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + + weights = np.array([1, 0.5]) + weights_df = _convert_to_dataframe(weights, sycl_queue=queue, target_df=dataframe) + + result = BasicStatistics().fit(X_df) + + expected_mean = np.array([0.5, 0.5]) + expected_min = np.array([0, 0]) + expected_max = np.array([1, 1]) + + assert_allclose(expected_mean, result.mean) + assert_allclose(expected_max, result.max) + assert_allclose(expected_min, result.min) + + result = BasicStatistics().fit(X_df, sample_weight=weights_df) + + expected_weighted_mean = np.array([0.25, 0.25]) + expected_weighted_min = np.array([0, 0]) + expected_weighted_max = np.array([0.5, 0.5]) + + assert_allclose(expected_weighted_mean, result.mean) + assert_allclose(expected_weighted_min, result.min) + assert_allclose(expected_weighted_max, result.max) + + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("weighted", [True, False]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_multiple_options_on_gold_data(dataframe, queue, weighted, dtype): + X = np.array([[0, 0], [1, 1]]) + X = X.astype(dtype=dtype) + X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + if weighted: + weights = np.array([1, 0.5]) + weights = weights.astype(dtype=dtype) + weights_df = _convert_to_dataframe(weights, sycl_queue=queue, target_df=dataframe) + basicstat = BasicStatistics() + + if weighted: + result = basicstat.fit(X_df, sample_weight=weights_df) + else: + result = basicstat.fit(X_df) + + if weighted: + expected_weighted_mean = np.array([0.25, 0.25]) + expected_weighted_min = np.array([0, 0]) + expected_weighted_max = np.array([0.5, 0.5]) + assert_allclose(expected_weighted_mean, result.mean) + assert_allclose(expected_weighted_max, result.max) + assert_allclose(expected_weighted_min, result.min) + else: + expected_mean = np.array([0.5, 0.5]) + expected_min = np.array([0, 0]) + expected_max = np.array([1, 1]) + assert_allclose(expected_mean, result.mean) + assert_allclose(expected_max, result.max) + assert_allclose(expected_min, result.min) + + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("option", options_and_tests) +@pytest.mark.parametrize("row_count", [100, 1000]) +@pytest.mark.parametrize("column_count", [10, 100]) +@pytest.mark.parametrize("weighted", [True, False]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_single_option_on_random_data( + dataframe, queue, option, row_count, column_count, weighted, dtype +): + result_option, function, tols = option + fp32tol, fp64tol = tols + seed = 77 + gen = np.random.default_rng(seed) + X = gen.uniform(low=-0.3, high=+0.7, size=(row_count, column_count)) + X = X.astype(dtype=dtype) + X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + if weighted: + 
weights = gen.uniform(low=-0.5, high=1.0, size=row_count) + weights = weights.astype(dtype=dtype) + weights_df = _convert_to_dataframe(weights, sycl_queue=queue, target_df=dataframe) + basicstat = BasicStatistics(result_options=result_option) + + if weighted: + result = basicstat.fit(X_df, sample_weight=weights_df) + else: + result = basicstat.fit(X_df) + + res = getattr(result, result_option) + if weighted: + weighted_data = np.diag(weights) @ X + gtr = function(weighted_data) + else: + gtr = function(X) + + tol = fp32tol if res.dtype == np.float32 else fp64tol + assert_allclose(gtr, res, atol=tol) + + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("row_count", [100, 1000]) +@pytest.mark.parametrize("column_count", [10, 100]) +@pytest.mark.parametrize("weighted", [True, False]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_multiple_options_on_random_data( + dataframe, queue, row_count, column_count, weighted, dtype +): + seed = 77 + gen = np.random.default_rng(seed) + X = gen.uniform(low=-0.3, high=+0.7, size=(row_count, column_count)) + X = X.astype(dtype=dtype) + X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + if weighted: + weights = gen.uniform(low=-0.5, high=1.0, size=row_count) + weights = weights.astype(dtype=dtype) + weights_df = _convert_to_dataframe(weights, sycl_queue=queue, target_df=dataframe) + basicstat = BasicStatistics(result_options=["mean", "max", "sum"]) + + if weighted: + result = basicstat.fit(X_df, sample_weight=weights_df) + else: + result = basicstat.fit(X_df) + + res_mean, res_max, res_sum = result.mean, result.max, result.sum + if weighted: + weighted_data = np.diag(weights) @ X + gtr_mean, gtr_max, gtr_sum = ( + expected_mean(weighted_data), + expected_max(weighted_data), + expected_sum(weighted_data), + ) + else: + gtr_mean, gtr_max, gtr_sum = ( + expected_mean(X), + expected_max(X), + expected_sum(X), + ) + + tol = 5e-4 if res_mean.dtype == np.float32 else 1e-7 + assert_allclose(gtr_mean, res_mean, atol=tol) + assert_allclose(gtr_max, res_max, atol=tol) + assert_allclose(gtr_sum, res_sum, atol=tol) + + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("row_count", [100, 1000]) +@pytest.mark.parametrize("column_count", [10, 100]) +@pytest.mark.parametrize("weighted", [True, False]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_all_option_on_random_data( + dataframe, queue, row_count, column_count, weighted, dtype +): + seed = 77 + gen = np.random.default_rng(seed) + X = gen.uniform(low=-0.3, high=+0.7, size=(row_count, column_count)) + X = X.astype(dtype=dtype) + X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + if weighted: + weights = gen.uniform(low=-0.5, high=+1.0, size=row_count) + weights = weights.astype(dtype=dtype) + weights_df = _convert_to_dataframe(weights, sycl_queue=queue, target_df=dataframe) + basicstat = BasicStatistics(result_options="all") + + if weighted: + result = basicstat.fit(X_df, sample_weight=weights_df) + else: + result = basicstat.fit(X_df) + + if weighted: + weighted_data = np.diag(weights) @ X + + for option in options_and_tests: + result_option, function, tols = option + fp32tol, fp64tol = tols + res = getattr(result, result_option) + if weighted: + gtr = function(weighted_data) + else: + gtr = function(X) + tol = fp32tol if res.dtype == np.float32 else fp64tol + assert_allclose(gtr, res, atol=tol) + + 
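A note on the ground-truth convention shared by these tests and the onedal tests earlier in this patch: sample weighting is emulated by scaling every row of X by its weight before applying the plain NumPy reference functions, and `np.diag(weights) @ X` is simply a dense spelling of that row-wise scaling. A quick sanity check with made-up data:

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.uniform(size=(6, 3))
    w = rng.uniform(size=6)

    # A diagonal matrix product and an elementwise broadcast over rows
    # express the same row-wise scaling.
    assert np.allclose(np.diag(w) @ X, w[:, None] * X)

The 1-D test that follows uses the equivalent elementwise form, `weights * data`, for the same reason.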
+@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("option", options_and_tests) +@pytest.mark.parametrize("data_size", [100, 1000]) +@pytest.mark.parametrize("weighted", [True, False]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_1d_input_on_random_data(dataframe, queue, option, data_size, weighted, dtype): + result_option, function, tols = option + fp32tol, fp64tol = tols + seed = 77 + gen = np.random.default_rng(seed) + X = gen.uniform(low=-0.3, high=+0.7, size=data_size) + X = X.astype(dtype=dtype) + X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + if weighted: + weights = gen.uniform(low=-0.5, high=1.0, size=data_size) + weights = weights.astype(dtype=dtype) + weights_df = _convert_to_dataframe(weights, sycl_queue=queue, target_df=dataframe) + basicstat = BasicStatistics(result_options=result_option) + + if weighted: + result = basicstat.fit(X_df, sample_weight=weights_df) + else: + result = basicstat.fit(X_df) + + res = getattr(result, result_option) + if weighted: + weighted_data = weights * X + gtr = function(weighted_data) + else: + gtr = function(X) + + tol = fp32tol if res.dtype == np.float32 else fp64tol + assert_allclose(gtr, res, atol=tol) diff --git a/sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py b/sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py index 2b2b42e76b..0931e4b524 100644 --- a/sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py +++ b/sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py @@ -18,7 +18,7 @@ import pytest from numpy.testing import assert_allclose -from onedal.basic_statistics.tests.test_incremental_basic_statistics import ( +from onedal.basic_statistics.tests.test_basic_statistics import ( expected_max, expected_mean, expected_sum, From 02c20738b341f47f0f038ac31c03bcce515573a1 Mon Sep 17 00:00:00 2001 From: Khalil Date: Tue, 9 Jul 2024 13:03:32 +0200 Subject: [PATCH 59/75] Fix: Multivariate Ridge Regression coefficients (#1898) --- daal4py/sklearn/linear_model/_ridge.py | 4 +- .../sklearn/linear_model/tests/test_ridge.py | 69 +++++++++++++++++++ deselected_tests.yaml | 5 -- sklearnex/linear_model/tests/test_linear.py | 16 +++++ 4 files changed, 88 insertions(+), 6 deletions(-) create mode 100644 daal4py/sklearn/linear_model/tests/test_ridge.py diff --git a/daal4py/sklearn/linear_model/_ridge.py b/daal4py/sklearn/linear_model/_ridge.py index 037f458407..ef985ced81 100644 --- a/daal4py/sklearn/linear_model/_ridge.py +++ b/daal4py/sklearn/linear_model/_ridge.py @@ -48,11 +48,12 @@ def _daal4py_fit(self, X, y_): ridge_params = np.asarray(self.alpha, dtype=X.dtype) if ridge_params.size != 1 and ridge_params.size != y.shape[1]: + # incorrect order of parameters in the error message is intentional to match sklearn raise ValueError( "Number of targets and number of penalties do not correspond: " f"{ridge_params.size} != {y.shape[1]}" ) - ridge_params = ridge_params.reshape((1, -1)) + ridge_params = ridge_params.reshape((-1, 1)) ridge_alg = daal4py.ridge_regression_training( fptype=_fptype, @@ -60,6 +61,7 @@ def _daal4py_fit(self, X, y_): interceptFlag=(self.fit_intercept is True), ridgeParameters=ridge_params, ) + try: ridge_res = ridge_alg.compute(X, y) except RuntimeError: diff --git a/daal4py/sklearn/linear_model/tests/test_ridge.py b/daal4py/sklearn/linear_model/tests/test_ridge.py new file mode 100644 index 0000000000..c1aea00f6c --- /dev/null +++ 
b/daal4py/sklearn/linear_model/tests/test_ridge.py @@ -0,0 +1,69 @@ +# ============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import numpy +import pytest +from sklearn.datasets import make_regression + + +def _test_multivariate_ridge_coefficients(ridge_class, random_state): + X, y = make_regression( + n_samples=10, n_features=5, n_targets=3, random_state=random_state + ) + alpha = 3 + numpy.random.rand(3) * 5 + + # computing coefficients using daal4py Ridge + model = ridge_class(fit_intercept=False, alpha=alpha) + model.fit(X, y) + + # computing coefficients manually + n_features, n_targets = X.shape[1], y.shape[1] + betas = numpy.zeros((n_targets, n_features)) + + identity_matrix = numpy.eye(n_features) + + for j in range(n_targets): + y_j = y[:, j] + inverse_term = numpy.linalg.inv(numpy.dot(X.T, X) + alpha[j] * identity_matrix) + beta_j = numpy.dot(inverse_term, numpy.dot(X.T, y_j)) + betas[j, :] = beta_j + + # asserting that the coefficients are close + numpy.testing.assert_allclose(model.coef_, betas, rtol=1e-3, atol=1e-3) + + +def _test_multivariate_ridge_alpha_shape(ridge_class, random_state): + X, y = make_regression( + n_samples=10, n_features=5, n_targets=3, random_state=random_state + ) + wrong_shape_alpha = numpy.random.rand(5) + # asserting exception if alpha has wrong shape + with pytest.raises(ValueError): + ridge_class(alpha=wrong_shape_alpha).fit(X, y) + + +def test_multivariate_ridge_coefficients(): + from daal4py.sklearn.linear_model._ridge import Ridge + + random_state = 0 + _test_multivariate_ridge_coefficients(Ridge, random_state) + + +def test_multivariate_ridge_alpha_shape(): + from daal4py.sklearn.linear_model._ridge import Ridge + + random_state = 0 + _test_multivariate_ridge_alpha_shape(Ridge, random_state) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 344247a9fe..be59cf0fdd 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -214,11 +214,6 @@ deselected_tests: # sufficient accuracy (similar to previous cases) - linear_model/tests/test_coordinate_descent.py::test_enet_sample_weight_consistency >=0.23 - # The regression coefficients for multi-target problem differ from scikit-learn for small datasets. - # Coefficients matches for first label only and the coefficients are close for larger datasets. - # See: https://github.com/IntelPython/daal4py/issues/275 - - linear_model/tests/test_ridge.py::test_ridge_cv_individual_penalties >=0.24 - # Different interpretation of trees compared to scikit-learn # Looks like we need to align tree traversal. 
This problem will be fixed - ensemble/tests/test_forest.py::test_min_samples_leaf diff --git a/sklearnex/linear_model/tests/test_linear.py b/sklearnex/linear_model/tests/test_linear.py index b87a701960..81a71bd6de 100644 --- a/sklearnex/linear_model/tests/test_linear.py +++ b/sklearnex/linear_model/tests/test_linear.py @@ -20,6 +20,10 @@ from sklearn.datasets import make_regression from daal4py.sklearn._utils import daal_check_version +from daal4py.sklearn.linear_model.tests.test_ridge import ( + _test_multivariate_ridge_alpha_shape, + _test_multivariate_ridge_coefficients, +) from onedal.tests.utils._dataframes_support import ( _as_numpy, _convert_to_dataframe, @@ -124,3 +128,15 @@ def test_sklearnex_reconstruct_model(dataframe, queue, dtype): tol = 1e-5 if _as_numpy(y_pred).dtype == np.float32 else 1e-7 assert_allclose(gtr, _as_numpy(y_pred), rtol=tol) + + +def test_sklearnex_multivariate_ridge_coefs(): + from sklearnex.linear_model import Ridge + + _test_multivariate_ridge_coefficients(Ridge, random_state=0) + + +def test_sklearnex_multivariate_ridge_alpha_shape(): + from sklearnex.linear_model import Ridge + + _test_multivariate_ridge_alpha_shape(Ridge, random_state=0) From f42a09f02dacce15a7c3007bdd00f24bd3891ed0 Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Tue, 9 Jul 2024 08:32:23 -0700 Subject: [PATCH 60/75] FIX: align sklearnex `BasicStatistics._onedal_fit` with other algos (#1922) * FIX: align sklearnex stats _onedal_fit with other algos * formatting correction * ensure 2d false --- sklearnex/basic_statistics/basic_statistics.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py index b48b3c24ec..c2aa3b428c 100644 --- a/sklearnex/basic_statistics/basic_statistics.py +++ b/sklearnex/basic_statistics/basic_statistics.py @@ -16,8 +16,11 @@ import numpy as np from sklearn.base import BaseEstimator +from sklearn.utils import check_array +from sklearn.utils.validation import _check_sample_weight from daal4py.sklearn._n_jobs_support import control_n_jobs +from daal4py.sklearn._utils import sklearn_check_version from onedal.basic_statistics import BasicStatistics as onedal_BasicStatistics from .._device_offload import dispatch @@ -87,6 +90,14 @@ def _onedal_supported(self, method_name, *data): _onedal_gpu_supported = _onedal_supported def _onedal_fit(self, X, sample_weight=None, queue=None): + if sklearn_check_version("1.0"): + X = self._validate_data(X, dtype=[np.float64, np.float32], ensure_2d=False) + else: + X = check_array(X, dtype=[np.float64, np.float32]) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + onedal_params = { "result_options": self.options, } From fd4a889fa363674590b15c8e4c1299d95b32c077 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Tue, 9 Jul 2024 13:54:22 -0700 Subject: [PATCH 61/75] Update dependency zipp to v3.19.1 [SECURITY] (#1928) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- requirements-doc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-doc.txt b/requirements-doc.txt index a001cd9c5b..a8dae26814 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -71,5 +71,5 @@ typing-extensions==4.9.0 urllib3==2.2.2 wcwidth==0.2.13 webencodings==0.5.1 -zipp==3.17.0 +zipp==3.19.1 setuptools>=65.5.1 # not directly required, pinned by Snyk 
to avoid a vulnerability From e918ec368796f2f038eb9a9597b68daa2fc3fdee Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Jul 2024 13:54:42 -0700 Subject: [PATCH 62/75] Bump zipp from 3.17.0 to 3.19.1 (#1927) Bumps [zipp](https://github.com/jaraco/zipp) from 3.17.0 to 3.19.1. - [Release notes](https://github.com/jaraco/zipp/releases) - [Changelog](https://github.com/jaraco/zipp/blob/main/NEWS.rst) - [Commits](https://github.com/jaraco/zipp/compare/v3.17.0...v3.19.1) --- updated-dependencies: - dependency-name: zipp dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> From a2f4f5d25736d7d48eb11a613fc44dd8b17f2962 Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Wed, 10 Jul 2024 10:17:03 -0700 Subject: [PATCH 63/75] CI: deselect failing NuSVC tests (#1929) --- deselected_tests.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index be59cf0fdd..f839e8c62d 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -153,6 +153,8 @@ deselected_tests: # scikit-learn expects an exception for sparse matrices with 64-bit integer indices, # scikit-learn-intelex works correctly with 64-bit integer indices - tests/test_common.py::test_estimators[NuSVC()-check_estimator_sparse_data] + - tests/test_common.py::test_estimators[NuSVC()-check_estimator_sparse_array] + - tests/test_common.py::test_estimators[NuSVC()-check_estimator_sparse_matrix] - utils/tests/test_estimator_checks.py::test_xfail_ignored_in_check_estimator # SVC._dual_coef_ is changing after fitting, but the result of prediction is still the same From f3d39a535b01adb7a546350914ef7ab2ea82446e Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Wed, 10 Jul 2024 21:31:08 -0700 Subject: [PATCH 64/75] CI: conda channel reconfiguration (#1932) * CI: miniconda to miniforge * CI: conda channel reconfiguration * additional intel cleanup * remove defaults * removed remove intel * miniconda * additional yml intel removals * trying with updated intel channel * forcing numpy<2 for dpnp compatibility * forcing numpy<2 for dpnp compatibility * trying without dpnp for 3.10 * remove dpctl on 3.10 as well --- .ci/pipeline/build-and-test-lnx.yml | 6 ++++-- .ci/pipeline/build-and-test-win.yml | 2 +- .ci/pipeline/nightly.yml | 2 +- .ci/pipeline/release.yml | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml index 93651f2656..c503b942fc 100644 --- a/.ci/pipeline/build-and-test-lnx.yml +++ b/.ci/pipeline/build-and-test-lnx.yml @@ -23,8 +23,10 @@ steps: bash .ci/scripts/describe_system.sh displayName: "System info" - script: | + conda config --add channels conda-forge + conda config --set channel_priority strict conda update -y -q conda - conda create -q -y -n CB -c conda-forge python=$(PYTHON_VERSION) intel::dal-devel mpich pyyaml "dpcpp-cpp-rt=2024.2.0" + conda create -q -y -n CB -c conda-forge python=$(PYTHON_VERSION) dal-devel mpich pyyaml "dpcpp-cpp-rt=2024.2.0" displayName: "Conda create" - script: | . 
/usr/share/miniconda/etc/profile.d/conda.sh @@ -46,7 +48,7 @@ steps: bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION) pip install --upgrade -r requirements-test.txt pip install $(python .ci/scripts/get_compatible_scipy_version.py) - if [ $(echo $(PYTHON_VERSION) | grep '3.9\|3.10\|3.11') ] && [ $(SKLEARN_VERSION) != "1.0" ]; then conda install -q -y -c intel dpctl=0.17.0 dpnp=0.15.0; fi + if [ $(echo $(PYTHON_VERSION) | grep '3.9\|3.11') ] && [ $(SKLEARN_VERSION) != "1.0" ]; then conda install -q -y -c https://software.repos.intel.com/python/conda/ dpctl=0.17.0 dpnp=0.15.0; fi pip list displayName: "Install testing requirements" - script: | diff --git a/.ci/pipeline/build-and-test-win.yml b/.ci/pipeline/build-and-test-win.yml index 0d1ae6fd89..13b7986aaa 100644 --- a/.ci/pipeline/build-and-test-win.yml +++ b/.ci/pipeline/build-and-test-win.yml @@ -16,7 +16,7 @@ steps: - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" displayName: Add conda to PATH - - script: conda create -q -y -n CB -c conda-forge -c intel python=$(PYTHON_VERSION) intel::dal-devel impi-devel clang-format pyyaml + - script: conda create -q -y -n CB -c conda-forge python=$(PYTHON_VERSION) dal-devel impi-devel clang-format pyyaml displayName: 'Create Anaconda environment' - script: | call activate CB diff --git a/.ci/pipeline/nightly.yml b/.ci/pipeline/nightly.yml index f425e14787..56a58eba78 100644 --- a/.ci/pipeline/nightly.yml +++ b/.ci/pipeline/nightly.yml @@ -64,7 +64,7 @@ jobs: conda config --append channels conda-forge conda config --remove channels defaults conda update -y -q conda - conda create -y -q -n CB -c intel -c conda-forge python=$(python.version) dal-devel impi-devel + conda create -y -q -n CB -c conda-forge python=$(python.version) dal-devel impi-devel displayName: 'Conda create' - script: | bash .ci/scripts/describe_system.sh diff --git a/.ci/pipeline/release.yml b/.ci/pipeline/release.yml index c4f611ddb8..f7557db732 100644 --- a/.ci/pipeline/release.yml +++ b/.ci/pipeline/release.yml @@ -45,7 +45,7 @@ jobs: displayName: 'Sklearn testing' - job: GeneratorConda steps: - - bash: python .ci/scripts/gen_release_jobs.py --channels intel conda-forge + - bash: python .ci/scripts/gen_release_jobs.py --channels conda-forge name: MatrixGen - job: ReleaseConda dependsOn: GeneratorConda From 04dbc5e1a412ac76f3578f76508df87e46487910 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 12 Jul 2024 01:10:13 +0200 Subject: [PATCH 65/75] [CI, testing] add SVM C and nu parameter check prevents seg fault sklearn < 1.2 (#1930) * Update svc.py * Update svr.py * Update nusvc.py * Update nusvc.py * Update nusvr.py * Update svc.py * match sklearn error messages * add comments * formatting * Update sklearnex/svm/svc.py Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com> * Update sklearnex/svm/nusvr.py Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com> --------- Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com> --- sklearnex/svm/nusvc.py | 11 +++++++++++ sklearnex/svm/nusvr.py | 11 +++++++++++ sklearnex/svm/svc.py | 11 +++++++++++ sklearnex/svm/svr.py | 11 +++++++++++ 4 files changed, 44 insertions(+) diff --git a/sklearnex/svm/nusvc.py b/sklearnex/svm/nusvc.py index d892dd598e..421546a203 100644 --- a/sklearnex/svm/nusvc.py +++ b/sklearnex/svm/nusvc.py @@ -83,6 +83,17 @@ def __init__( def fit(self, X, y, sample_weight=None): if sklearn_check_version("1.2"): self._validate_params() + elif self.nu <= 0 or self.nu > 1: + # else if added 
to correct issues with + # sklearn tests: + # svm/tests/test_sparse.py::test_error + # svm/tests/test_svm.py::test_bad_input + # for sklearn versions < 1.2 (i.e. without + # validate_params parameter checking) + # Without this, a segmentation fault with + # Windows fatal exception: access violation + # occurs + raise ValueError("nu <= 0 or nu > 1") if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) dispatch( diff --git a/sklearnex/svm/nusvr.py b/sklearnex/svm/nusvr.py index 5c2c1a1dee..2945175398 100644 --- a/sklearnex/svm/nusvr.py +++ b/sklearnex/svm/nusvr.py @@ -65,6 +65,17 @@ def __init__( def fit(self, X, y, sample_weight=None): if sklearn_check_version("1.2"): self._validate_params() + elif self.nu <= 0 or self.nu > 1: + # else if added to correct issues with + # sklearn tests: + # svm/tests/test_sparse.py::test_error + # svm/tests/test_svm.py::test_bad_input + # for sklearn versions < 1.2 (i.e. without + # validate_params parameter checking) + # Without this, a segmentation fault with + # Windows fatal exception: access violation + # occurs + raise ValueError("nu <= 0 or nu > 1") if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) dispatch( diff --git a/sklearnex/svm/svc.py b/sklearnex/svm/svc.py index b0e44a5bb1..337f44ba4b 100644 --- a/sklearnex/svm/svc.py +++ b/sklearnex/svm/svc.py @@ -85,6 +85,17 @@ def __init__( def fit(self, X, y, sample_weight=None): if sklearn_check_version("1.2"): self._validate_params() + elif self.C <= 0: + # else if added to correct issues with + # sklearn tests: + # svm/tests/test_sparse.py::test_error + # svm/tests/test_svm.py::test_bad_input + # for sklearn versions < 1.2 (i.e. without + # validate_params parameter checking) + # Without this, a segmentation fault with + # Windows fatal exception: access violation + # occurs + raise ValueError("C <= 0") if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) dispatch( diff --git a/sklearnex/svm/svr.py b/sklearnex/svm/svr.py index ed6c5baa23..1b16a5aa7e 100644 --- a/sklearnex/svm/svr.py +++ b/sklearnex/svm/svr.py @@ -65,6 +65,17 @@ def __init__( def fit(self, X, y, sample_weight=None): if sklearn_check_version("1.2"): self._validate_params() + elif self.C <= 0: + # else if added to correct issues with + # sklearn tests: + # svm/tests/test_sparse.py::test_error + # svm/tests/test_svm.py::test_bad_input + # for sklearn versions < 1.2 (i.e. 
without + # validate_params parameter checking) + # Without this, a segmentation fault with + # Windows fatal exception: access violation + # occurs + raise ValueError("C <= 0") if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) dispatch( From 7a0b5240ee471c9771f95826d7ac7f692c50c344 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 15 Jul 2024 09:09:37 +0200 Subject: [PATCH 66/75] [fix] remove `_test_type_and_finiteness` method from `LinearRegression` and `LogisticRegression` (#1923) * Update logistic_regression.py * Update linear.py * Update linear.py * Update logistic_regression.py * Update logistic_regression.py * Update linear.py * Update linear.py * Update logistic_regression.py * Update linear.py * Update logistic_regression.py * Update linear.py * Update logistic_regression.py --- sklearnex/linear_model/linear.py | 36 ++----------------- sklearnex/linear_model/logistic_regression.py | 32 +---------------- 2 files changed, 3 insertions(+), 65 deletions(-) diff --git a/sklearnex/linear_model/linear.py b/sklearnex/linear_model/linear.py index 83f93b40c9..aa31f105dd 100644 --- a/sklearnex/linear_model/linear.py +++ b/sklearnex/linear_model/linear.py @@ -27,7 +27,6 @@ from .._device_offload import dispatch, wrap_output_data from .._utils import PatchingConditionsChain, get_patch_message, register_hyperparameters -from ..utils.validation import _assert_all_finite if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): from sklearn.linear_model._base import _deprecate_normalize @@ -138,19 +137,6 @@ def score(self, X, y, sample_weight=None): sample_weight=sample_weight, ) - def _test_type_and_finiteness(self, X_in): - X = X_in if isinstance(X_in, np.ndarray) else np.asarray(X_in) - - dtype = X.dtype - if "complex" in str(type(dtype)): - return False - - try: - _assert_all_finite(X) - except BaseException: - return False - return True - def _onedal_fit_supported(self, method_name, *data): assert method_name == "fit" assert len(data) == 3 @@ -174,7 +160,7 @@ def _onedal_fit_supported(self, method_name, *data): # Check if equations are well defined is_underdetermined = n_samples < (n_features + int(self.fit_intercept)) - dal_ready = patching_status.and_conditions( + patching_status.and_conditions( [ (sample_weight is None, "Sample weight is not supported."), ( @@ -193,17 +179,6 @@ def _onedal_fit_supported(self, method_name, *data): ), ] ) - if not dal_ready: - return patching_status - - if not patching_status.and_condition( - self._test_type_and_finiteness(X), "Input X is not supported." - ): - return patching_status - - patching_status.and_condition( - self._test_type_and_finiteness(y), "Input y is not supported." - ) return patching_status @@ -217,19 +192,13 @@ def _onedal_predict_supported(self, method_name, *data): model_is_sparse = issparse(self.coef_) or ( self.fit_intercept and issparse(self.intercept_) ) - dal_ready = patching_status.and_conditions( + patching_status.and_conditions( [ (n_samples > 0, "Number of samples is less than 1."), (not issparse(data[0]), "Sparse input is not supported."), (not model_is_sparse, "Sparse coefficients are not supported."), ] ) - if not dal_ready: - return patching_status - - patching_status.and_condition( - self._test_type_and_finiteness(data[0]), "Input X is not supported." 
- ) return patching_status @@ -257,7 +226,6 @@ def _onedal_fit(self, X, y, sample_weight, queue=None): "accept_sparse": ["csr", "csc", "coo"], "y_numeric": True, "multi_output": True, - "force_all_finite": False, } if sklearn_check_version("1.2"): X, y = self._validate_data(**check_params) diff --git a/sklearnex/linear_model/logistic_regression.py b/sklearnex/linear_model/logistic_regression.py index 3f7a23bd5e..107a442213 100644 --- a/sklearnex/linear_model/logistic_regression.py +++ b/sklearnex/linear_model/logistic_regression.py @@ -38,7 +38,6 @@ from .._device_offload import dispatch, wrap_output_data from .._utils import PatchingConditionsChain, get_patch_message - from ..utils.validation import _assert_all_finite class BaseLogisticRegression(ABC): def _save_attributes(self): @@ -177,17 +176,6 @@ def _onedal_score(self, X, y, sample_weight=None, queue=None): y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) - def _test_type_and_finiteness(self, X_in): - X = np.asarray(X_in) - - if np.iscomplexobj(X): - return False - try: - _assert_all_finite(X) - except BaseException: - return False - return True - def _onedal_gpu_fit_supported(self, method_name, *data): assert method_name == "fit" assert len(data) == 3 @@ -203,7 +191,7 @@ def _onedal_gpu_fit_supported(self, method_name, *data): if sklearn_check_version("1.1") else type_of_target(y) ) - dal_ready = patching_status.and_conditions( + patching_status.and_conditions( [ (self.penalty == "l2", "Only l2 penalty is supported."), (self.dual == False, "dual=True is not supported."), @@ -227,18 +215,6 @@ def _onedal_gpu_fit_supported(self, method_name, *data): ] ) - if not dal_ready: - return patching_status - - if not patching_status.and_condition( - self._test_type_and_finiteness(X), "Input X is not supported." - ): - return patching_status - - patching_status.and_condition( - self._test_type_and_finiteness(y), "Input y is not supported." - ) - return patching_status def _onedal_gpu_predict_supported(self, method_name, *data): @@ -272,12 +248,6 @@ def _onedal_gpu_predict_supported(self, method_name, *data): ), ] ) - if not dal_ready: - return patching_status - - patching_status.and_condition( - self._test_type_and_finiteness(*data), "Input X is not supported." - ) return patching_status From bbec31aba525dc15606cac2b38af373c0cbadfc9 Mon Sep 17 00:00:00 2001 From: olegkkruglov <102592747+olegkkruglov@users.noreply.github.com> Date: Mon, 15 Jul 2024 04:55:29 -0700 Subject: [PATCH 67/75] Remove accidentally pushed comment from IncPCA example (#1937) --- examples/sklearnex/incremental_pca_dpctl.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/sklearnex/incremental_pca_dpctl.py b/examples/sklearnex/incremental_pca_dpctl.py index 1f76a583ec..0e176b139a 100644 --- a/examples/sklearnex/incremental_pca_dpctl.py +++ b/examples/sklearnex/incremental_pca_dpctl.py @@ -27,8 +27,6 @@ # We do partial_fit for each batch and then print final result. 
X_1 = dpt.asarray([[-1, -1], [-2, -1]], sycl_queue=queue) -# print(dir(X_1)) -# print(X_1.sycl_device) result = incpca.partial_fit(X_1) X_2 = dpt.asarray([[-3, -2], [1, 1]], sycl_queue=queue) From 4d28ee5c268d1bb3896eadbe560c2cd17e950728 Mon Sep 17 00:00:00 2001 From: msa <111298646+md-shafiul-alam@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:02:15 -0400 Subject: [PATCH 68/75] pca reselect test (#1934) --- deselected_tests.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index f839e8c62d..9db9b886da 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -753,9 +753,6 @@ gpu: - tests/test_common.py::test_estimators[ExtraTreesClassifier()-check_sample_weights_invariance(kind=zeros)] - tests/test_common.py::test_estimators[ExtraTreesRegressor()-check_sample_weights_invariance(kind=ones)] - # Single value incorrect in some CI runs with Linux and GPU hardware, requires further analysis - - tests/test_common.py::test_estimators[PCA()-check_methods_subset_invariance] - # RuntimeError: Device support is not implemented, failing as result of fallback to cpu false # NearestNeighbors - cluster/tests/test_dbscan.py From e7187e475a7c9027824469c04c3a966fd6134407 Mon Sep 17 00:00:00 2001 From: olegkkruglov <102592747+olegkkruglov@users.noreply.github.com> Date: Mon, 15 Jul 2024 09:30:01 -0700 Subject: [PATCH 69/75] ENH: Renamed layout check macro for np.array, changed default layout for oneDAL table (#1924) --- onedal/datatypes/data_conversion.cpp | 7 +++++-- onedal/datatypes/numpy_helpers.hpp | 3 ++- src/daal4py.cpp | 8 ++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/onedal/datatypes/data_conversion.cpp b/onedal/datatypes/data_conversion.cpp index 7723c64232..ad866d832b 100644 --- a/onedal/datatypes/data_conversion.cpp +++ b/onedal/datatypes/data_conversion.cpp @@ -80,8 +80,11 @@ inline dal::homogen_table convert_to_homogen_impl(PyArrayObject *np_data) { // TODO: check safe cast from int to std::int64_t column_count = static_cast(array_size(np_data, 1)); } + // If both array_is_behaved_C(np_data) and array_is_behaved_F(np_data) are true + // (for example, if the array has only one column), then row-major layout will be chosen + // which is default on oneDAL side. const auto layout = - array_is_behaved_F(np_data) ? dal::data_layout::column_major : dal::data_layout::row_major; + array_is_behaved_C(np_data) ? 
dal::data_layout::row_major : dal::data_layout::column_major; auto res_table = dal::homogen_table(data_pointer, row_count, column_count, @@ -152,7 +155,7 @@ dal::table convert_to_table(PyObject *obj) { } if (is_array(obj)) { PyArrayObject *ary = reinterpret_cast(obj); - if (array_is_behaved(ary) || array_is_behaved_F(ary)) { + if (array_is_behaved_C(ary) || array_is_behaved_F(ary)) { #define MAKE_HOMOGEN_TABLE(CType) res = convert_to_homogen_impl(ary); SET_NPY_FEATURE(array_type(ary), array_type_sizeof(ary), diff --git a/onedal/datatypes/numpy_helpers.hpp b/onedal/datatypes/numpy_helpers.hpp index b43801259d..04ebeb13cd 100644 --- a/onedal/datatypes/numpy_helpers.hpp +++ b/onedal/datatypes/numpy_helpers.hpp @@ -119,7 +119,8 @@ #define is_array(a) ((a) && PyArray_Check(a)) #define array_type(a) PyArray_TYPE((PyArrayObject *)a) #define array_type_sizeof(a) PyArray_ITEMSIZE((PyArrayObject *)a) -#define array_is_behaved(a) (PyArray_ISCARRAY_RO((PyArrayObject *)a) && array_type(a) < NPY_OBJECT) +#define array_is_behaved_C(a) \ + (PyArray_ISCARRAY_RO((PyArrayObject *)a) && array_type(a) < NPY_OBJECT) #define array_is_behaved_F(a) \ (PyArray_ISFARRAY_RO((PyArrayObject *)a) && array_type(a) < NPY_OBJECT) #define array_is_native(a) (PyArray_ISNOTSWAPPED((PyArrayObject *)a)) diff --git a/src/daal4py.cpp b/src/daal4py.cpp index 61f0f61742..b92d5dece1 100755 --- a/src/daal4py.cpp +++ b/src/daal4py.cpp @@ -35,7 +35,7 @@ #define is_array(a) ((a) && PyArray_Check(a)) #define array_type(a) PyArray_TYPE((PyArrayObject *)a) -#define array_is_behaved(a) (PyArray_ISCARRAY_RO((PyArrayObject *)a) && array_type(a) < NPY_OBJECT) +#define array_is_behaved_C(a) (PyArray_ISCARRAY_RO((PyArrayObject *)a) && array_type(a) < NPY_OBJECT) #define array_is_behaved_F(a) (PyArray_ISFARRAY_RO((PyArrayObject *)a) && array_type(a) < NPY_OBJECT) #define array_is_native(a) (PyArray_ISNOTSWAPPED((PyArrayObject *)a)) #define array_numdims(a) PyArray_NDIM((PyArrayObject *)a) @@ -316,7 +316,7 @@ static daal::data_management::NumericTablePtr _make_hnt(PyObject * nda) daal::data_management::NumericTablePtr ptr; PyArrayObject * array = reinterpret_cast(nda); - assert(is_array(nda) && array_is_behaved(array)); + assert(is_array(nda) && array_is_behaved_C(array)); if (array_numdims(array) == 2) { @@ -415,7 +415,7 @@ daal::data_management::NumericTablePtr make_nt(PyObject * obj) { // we got a numpy array PyArrayObject * ary = reinterpret_cast(obj); - if (array_is_behaved(ary)) + if (array_is_behaved_C(ary)) { #define MAKENT_(_T) ptr = _make_hnt<_T>(obj) SET_NPY_FEATURE(PyArray_DESCR(ary)->type, MAKENT_, throw std::invalid_argument("Found unsupported array type")); @@ -492,7 +492,7 @@ daal::data_management::NumericTablePtr make_nt(PyObject * obj) throw std::runtime_error(std::string("Found wrong dimensionality (") + std::to_string(PyArray_NDIM(ary)) + ") of array in list when constructing SOA table (must be 1d)"); } - if (!array_is_behaved(ary)) + if (!array_is_behaved_C(ary)) { throw std::runtime_error(std::string("Cannot operate on column: ") + std::to_string(i) + " because it is non-contiguous. 
Please make it contiguous before passing it to daal4py\n"); } From 1ce26d4e253ef825880c739cf110c46936fcd27b Mon Sep 17 00:00:00 2001 From: olegkkruglov <102592747+olegkkruglov@users.noreply.github.com> Date: Tue, 16 Jul 2024 15:53:09 -0700 Subject: [PATCH 70/75] Remove exit code check to get meaningful logs (#1942) --- tests/test_examples_sklearnex.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_examples_sklearnex.py b/tests/test_examples_sklearnex.py index 9a5cdaf762..43ed6eef2b 100644 --- a/tests/test_examples_sklearnex.py +++ b/tests/test_examples_sklearnex.py @@ -46,12 +46,16 @@ def testit(self): ["python", os.path.join(examples_path, file)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, - check=True, + check=False, ) # nosec exit_code = process.returncode # Assert that the exit code is 0 - self.assertEqual(exit_code, 0) + self.assertEqual( + exit_code, + 0, + msg=f"Example has failed, the example's output:\n{process.stdout.decode()}\n{process.stderr.decode()}", + ) setattr(TestsklearnexExamples, "test_" + os.path.splitext(file)[0], testit) print("Generating tests for " + os.path.splitext(file)[0]) From 69d147fc986f14083741049d5ea02d7ac4d04eea Mon Sep 17 00:00:00 2001 From: olegkkruglov <102592747+olegkkruglov@users.noreply.github.com> Date: Tue, 16 Jul 2024 23:38:11 -0700 Subject: [PATCH 71/75] FIX: prevent `support_usm_ndarray` from changing queue if explicitly provided. (#1940) --- onedal/_device_offload.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/onedal/_device_offload.py b/onedal/_device_offload.py index c31979e35c..3e1aa37861 100644 --- a/onedal/_device_offload.py +++ b/onedal/_device_offload.py @@ -178,11 +178,27 @@ def _convert_to_dpnp(array): def support_usm_ndarray(freefunc=False, queue_param=True): + """ + Handles USMArray input. Puts SYCLQueue from data to decorated function arguments. + Converts output of decorated function to dpctl.tensor/dpnp.ndarray if input was of this type. + + Parameters + ---------- + freefunc (bool) : Set to True if decorates free function. + queue_param (bool) : Set to False if the decorated function has no `queue` parameter + + Notes + ----- + Queue will not be changed if provided explicitly. 
+ """ + def decorator(func): def wrapper_impl(obj, *args, **kwargs): usm_iface = _extract_usm_iface(*args, **kwargs) data_queue, hostargs, hostkwargs = _get_host_inputs(*args, **kwargs) - if queue_param: + if queue_param and not ( + "queue" in hostkwargs and hostkwargs["queue"] is not None + ): hostkwargs["queue"] = data_queue result = _run_on_device(func, obj, *hostargs, **hostkwargs) if usm_iface is not None and hasattr(result, "__array_interface__"): From e0aefb2f7d2157ffafc5849e0ac1ebe34f4e10fe Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 17 Jul 2024 09:41:24 +0200 Subject: [PATCH 72/75] Update _forest.py (#1939) --- sklearnex/ensemble/_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/ensemble/_forest.py b/sklearnex/ensemble/_forest.py index c5e4369715..0db236ff63 100644 --- a/sklearnex/ensemble/_forest.py +++ b/sklearnex/ensemble/_forest.py @@ -70,7 +70,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): X, y = self._validate_data( X, y, - multi_output=False, + multi_output=True, accept_sparse=False, dtype=[np.float64, np.float32], force_all_finite=False, From e3fb9ef54d691e29db54cbfbcc2b7303b526eed6 Mon Sep 17 00:00:00 2001 From: Khalil Date: Fri, 19 Jul 2024 20:29:08 +0200 Subject: [PATCH 73/75] ENH: Adding Ridge Regression support into `sklearnex.preview` (#1843) * onedal4py linear model now handles alpha parameter for ridge regression * added ridge to preview (early edition), dummy test to verify against stock sklearn, updated patch map * cpu execution now falls back to daal fit ridge and gpu to onedal, fixed onedal version checks * added preview ridge to sklearnex setup script, fixed patch mapping, adjusted preview patching test * code refactoring, added score method supporting onedal predict, expanded ridge init arg list * refactoring, sklearn input matching, and solver checker * added daal version check in preview ridge, updated tests with score method * fixed daal version checks in linear model and patching test * fix ridge test numpy for older python * score now checks for model being fitted, added tests Signed-off-by: Khalil Asadzade * bumped up onedal version checks * added tests for preview ridge, removed sklearn conformance * added onedal4py tests * minor refactoring * internalized is_numeric_scalar function * removed forced fallback to sklearn during fit * fix + refactor: onedal4py no intercept ridge test * separated Ridge to its own class in onedal4py, adjusted in sklearnex preview * refactor: numpy assert close in preview ridge test * replaced finiteness checker and extended tests, minor refactoring * increased precision requirement for diabetes tests in onedal4py regressions * removed complex numbers check from *_supported verification * removed unused _sparse attribute * changed numeric scalar check implementation, fixed copy_x usage --------- Signed-off-by: Khalil Asadzade --- onedal/linear_model/__init__.py | 9 +- onedal/linear_model/linear_model.cpp | 6 + onedal/linear_model/linear_model.py | 113 +++++- .../tests/test_linear_regression.py | 2 +- onedal/linear_model/tests/test_ridge.py | 95 +++++ setup_sklearnex.py | 1 + sklearnex/dispatcher.py | 10 + sklearnex/preview/__init__.py | 2 +- sklearnex/preview/linear_model/__init__.py | 19 + sklearnex/preview/linear_model/ridge.py | 373 ++++++++++++++++++ .../preview/linear_model/tests/test_ridge.py | 102 +++++ sklearnex/tests/test_monkeypatch.py | 16 +- 12 files changed, 735 insertions(+), 13 deletions(-) create mode 100644 
onedal/linear_model/tests/test_ridge.py create mode 100644 sklearnex/preview/linear_model/__init__.py create mode 100644 sklearnex/preview/linear_model/ridge.py create mode 100644 sklearnex/preview/linear_model/tests/test_ridge.py diff --git a/onedal/linear_model/__init__.py b/onedal/linear_model/__init__.py index 5b95bd4088..998e4a62d7 100755 --- a/onedal/linear_model/__init__.py +++ b/onedal/linear_model/__init__.py @@ -15,7 +15,12 @@ # =============================================================================== from .incremental_linear_model import IncrementalLinearRegression -from .linear_model import LinearRegression +from .linear_model import LinearRegression, Ridge from .logistic_regression import LogisticRegression -__all__ = ["IncrementalLinearRegression", "LinearRegression", "LogisticRegression"] +__all__ = [ + "IncrementalLinearRegression", + "LinearRegression", + "LogisticRegression", + "Ridge", +] diff --git a/onedal/linear_model/linear_model.cpp b/onedal/linear_model/linear_model.cpp index 083cf9130c..b51dd69a8c 100644 --- a/onedal/linear_model/linear_model.cpp +++ b/onedal/linear_model/linear_model.cpp @@ -85,8 +85,14 @@ struct params2desc { const auto intercept = params["intercept"].cast(); +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 + const auto alpha = params["alpha"].cast(); + auto desc = linear_regression::descriptor(intercept, alpha) + .set_result_options(get_onedal_result_options(params)); +#else auto desc = linear_regression::descriptor(intercept) .set_result_options(get_onedal_result_options(params)); +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 return desc; } }; diff --git a/onedal/linear_model/linear_model.py b/onedal/linear_model/linear_model.py index 4091933efc..cde64cd5ed 100755 --- a/onedal/linear_model/linear_model.py +++ b/onedal/linear_model/linear_model.py @@ -19,7 +19,7 @@ import numpy as np -from daal4py.sklearn._utils import get_dtype, make2d +from daal4py.sklearn._utils import daal_check_version, get_dtype, make2d from ..common._base import BaseEstimator from ..common._estimator_checks import _check_is_fitted @@ -34,19 +34,24 @@ class BaseLinearRegression(BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, fit_intercept, copy_X, algorithm): + def __init__(self, fit_intercept, copy_X, algorithm, alpha=0.0): self.fit_intercept = fit_intercept - self.algorithm = algorithm + self.alpha = alpha self.copy_X = copy_X + self.algorithm = algorithm def _get_onedal_params(self, dtype=np.float32): intercept = "intercept|" if self.fit_intercept else "" - return { + params = { "fptype": "float" if dtype == np.float32 else "double", "method": self.algorithm, "intercept": self.fit_intercept, "result_option": (intercept + "coefficients"), } + if daal_check_version((2024, "P", 600)): + params["alpha"] = self.alpha + + return params def _create_model(self, policy): module = self._get_backend("linear_model", "regression") @@ -159,7 +164,12 @@ class LinearRegression(BaseLinearRegression): """ def __init__( - self, fit_intercept=True, copy_X=False, *, algorithm="norm_eq", **kwargs + self, + fit_intercept=True, + copy_X=False, + *, + algorithm="norm_eq", + **kwargs, ): super().__init__(fit_intercept=fit_intercept, copy_X=copy_X, algorithm=algorithm) @@ -224,3 +234,96 @@ def fit(self, X, y, queue=None): self.intercept_ = self.intercept_[0] return self + + +class Ridge(BaseLinearRegression): + """ + Ridge Regression oneDAL implementation. 
+ + Parameters + ---------- + alpha : float, default=1.0 + Regularization strength; must be a positive float. Regularization + improves the conditioning of the problem and reduces the variance of + the estimates. Larger values specify stronger regularization. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + algorithm : string, default="norm_eq" + Algorithm used for computation on oneDAL side. + """ + + def __init__( + self, + alpha=1.0, + fit_intercept=True, + copy_X=False, + *, + algorithm="norm_eq", + **kwargs, + ): + super().__init__( + fit_intercept=fit_intercept, alpha=alpha, copy_X=copy_X, algorithm=algorithm + ) + + def fit(self, X, y, queue=None): + """ + Fit linear model. + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + queue : dpctl.SyclQueue + If not None, use this queue for computations. + + Returns + ------- + self : object + Fitted Estimator. + """ + module = self._get_backend("linear_model", "regression") + + X = _check_array( + X, + dtype=[np.float64, np.float32], + force_all_finite=False, + ensure_2d=False, + copy=self.copy_X, + ) + + y = np.asarray(y).astype(dtype=get_dtype(X)) + + X, y = _check_X_y(X, y, force_all_finite=False, accept_2d_y=True) + + policy = self._get_policy(queue, X, y) + + self.n_features_in_ = _num_features(X, fallback_1d=True) + + X, y = _convert_to_supported(policy, X, y) + params = self._get_onedal_params(get_dtype(X)) + X_table, y_table = to_table(X, y) + + result = module.train(policy, params, X_table, y_table) + self._onedal_model = result.model + + packed_coefficients = from_table(result.model.packed_coefficients) + self.coef_, self.intercept_ = ( + packed_coefficients[:, 1:], + packed_coefficients[:, 0], + ) + + if self.coef_.shape[0] == 1 and y.ndim == 1: + self.coef_ = self.coef_.ravel() + self.intercept_ = self.intercept_[0] + + return self diff --git a/onedal/linear_model/tests/test_linear_regression.py b/onedal/linear_model/tests/test_linear_regression.py index e2dd7ce6d0..43c095d131 100755 --- a/onedal/linear_model/tests/test_linear_regression.py +++ b/onedal/linear_model/tests/test_linear_regression.py @@ -36,7 +36,7 @@ def test_diabetes(queue, dtype): model = LinearRegression(fit_intercept=True) model.fit(X_train, y_train, queue=queue) y_pred = model.predict(X_test, queue=queue) - assert mean_squared_error(y_test, y_pred) < 2396 + assert_allclose(mean_squared_error(y_test, y_pred), 2395.567, rtol=1e-5) @pytest.mark.parametrize("queue", get_queues()) diff --git a/onedal/linear_model/tests/test_ridge.py b/onedal/linear_model/tests/test_ridge.py new file mode 100644 index 0000000000..29b40260c0 --- /dev/null +++ b/onedal/linear_model/tests/test_ridge.py @@ -0,0 +1,95 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from daal4py.sklearn._utils import daal_check_version + +if daal_check_version((2024, "P", 600)): + import numpy as np + import pytest + from numpy.testing import assert_allclose, assert_array_equal + from sklearn.datasets import load_diabetes + from sklearn.metrics import mean_squared_error + from sklearn.model_selection import train_test_split + + from onedal.linear_model import Ridge + from onedal.tests.utils._device_selection import get_queues + + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_diabetes(queue, dtype): + X, y = load_diabetes(return_X_y=True) + X, y = X.astype(dtype), y.astype(dtype) + X_train, X_test, y_train, y_test = train_test_split( + X, y, train_size=0.8, random_state=777 + ) + model = Ridge(fit_intercept=True, alpha=0.1) + model.fit(X_train, y_train, queue=queue) + y_pred = model.predict(X_test, queue=queue) + assert_allclose(mean_squared_error(y_test, y_pred), 2388.775, rtol=1e-5) + + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_pickle(queue, dtype): + X, y = load_diabetes(return_X_y=True) + X, y = X.astype(dtype), y.astype(dtype) + model = Ridge(fit_intercept=True, alpha=0.5) + model.fit(X, y, queue=queue) + expected = model.predict(X, queue=queue) + + import pickle + + dump = pickle.dumps(model) + model2 = pickle.loads(dump) + + assert isinstance(model2, model.__class__) + result = model2.predict(X, queue=queue) + + assert_array_equal(expected, result) + + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_no_intercept_results(queue, dtype): + seed = 42 + n_features, n_targets = 19, 7 + n_train_samples, n_test_samples = 3500, 1999 + + gen = np.random.default_rng(seed) + + X = gen.random(size=(n_train_samples, n_features), dtype=dtype) + y = gen.random(size=(n_train_samples, n_targets), dtype=dtype) + alpha = 0.5 + + lambda_identity = alpha * np.eye(X.shape[1]) + inverse_term = np.linalg.inv(np.dot(X.T, X) + lambda_identity) + xt_y = np.dot(X.T, y) + coef = np.dot(inverse_term, xt_y) + + model = Ridge(fit_intercept=False, alpha=alpha) + model.fit(X, y, queue=queue) + + if queue and queue.sycl_device.is_gpu: + tol = 5e-3 if model.coef_.dtype == np.float32 else 1e-5 + else: + tol = 2e-3 if model.coef_.dtype == np.float32 else 1e-5 + assert_allclose(coef, model.coef_.T, rtol=tol) + + Xt = gen.random(size=(n_test_samples, n_features), dtype=dtype) + gtr = Xt @ coef + + res = model.predict(Xt, queue=queue) + + tol = 2e-4 if res.dtype == np.float32 else 1e-7 + assert_allclose(gtr, res, rtol=tol) diff --git a/setup_sklearnex.py b/setup_sklearnex.py index 8d24519670..3a62a9ea1a 100755 --- a/setup_sklearnex.py +++ b/setup_sklearnex.py @@ -84,6 +84,7 @@ "sklearnex.preview.covariance", "sklearnex.preview.cluster", "sklearnex.preview.decomposition", + "sklearnex.preview.linear_model", "sklearnex.svm", "sklearnex.utils", ] diff --git a/sklearnex/dispatcher.py 
b/sklearnex/dispatcher.py index 8bac516041..60c56b4564 100644 --- a/sklearnex/dispatcher.py +++ b/sklearnex/dispatcher.py @@ -53,6 +53,7 @@ def get_patch_map_core(preview=False): EmpiricalCovariance as EmpiricalCovariance_sklearnex, ) from .preview.decomposition import IncrementalPCA as IncrementalPCA_sklearnex + from .preview.linear_model import Ridge as Ridge_sklearnex # Since the state of the lru_cache without preview cannot be # guaranteed to not have already enabled sklearnex algorithms @@ -90,6 +91,15 @@ def get_patch_map_core(preview=False): None, ] ] + + # Ridge + linear_model_module, _, _ = mapping["ridge"][0][0] + sklearn_obj = mapping["ridge"][0][1] + mapping.pop("ridge") + mapping["ridge"] = [ + [(linear_model_module, "Ridge", Ridge_sklearnex), sklearn_obj] + ] + return mapping from daal4py.sklearn.monkeypatch.dispatcher import _get_map_of_algorithms diff --git a/sklearnex/preview/__init__.py b/sklearnex/preview/__init__.py index 0d72651acf..f27da5fc4f 100644 --- a/sklearnex/preview/__init__.py +++ b/sklearnex/preview/__init__.py @@ -14,4 +14,4 @@ # limitations under the License. # ============================================================================== -__all__ = ["cluster", "covariance", "decomposition"] +__all__ = ["cluster", "covariance", "decomposition", "linear_model"] diff --git a/sklearnex/preview/linear_model/__init__.py b/sklearnex/preview/linear_model/__init__.py new file mode 100644 index 0000000000..70fa72fbc0 --- /dev/null +++ b/sklearnex/preview/linear_model/__init__.py @@ -0,0 +1,19 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from .ridge import Ridge + +__all__ = ["Ridge"] diff --git a/sklearnex/preview/linear_model/ridge.py b/sklearnex/preview/linear_model/ridge.py new file mode 100644 index 0000000000..d663055f87 --- /dev/null +++ b/sklearnex/preview/linear_model/ridge.py @@ -0,0 +1,373 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import logging + +from daal4py.sklearn._utils import daal_check_version, sklearn_check_version + +if daal_check_version((2024, "P", 600)): + import numbers + + import numpy as np + from scipy.sparse import issparse + from sklearn.linear_model import Ridge as sklearn_Ridge + from sklearn.metrics import r2_score + from sklearn.utils.validation import check_is_fitted, check_X_y + + from daal4py.sklearn.linear_model._ridge import _fit_ridge as daal4py_fit_ridge + + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): + from sklearn.linear_model._base import _deprecate_normalize + + from onedal.linear_model import Ridge as onedal_Ridge + from onedal.utils import _num_features, _num_samples + + from ..._device_offload import dispatch, wrap_output_data + from ..._utils import PatchingConditionsChain + + def _is_numeric_scalar(value): + """ + Determines if the provided value is a numeric scalar. + + Args: + value: The value to be checked. + + Returns: + bool: True if the value is a numeric scalar, False otherwise. + """ + return isinstance(value, numbers.Real) + + class Ridge(sklearn_Ridge): + __doc__ = sklearn_Ridge.__doc__ + + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**sklearn_Ridge._parameter_constraints} + + def __init__( + self, + alpha=1.0, + fit_intercept=True, + copy_X=True, + max_iter=None, + tol=1e-4, + solver="auto", + positive=False, + random_state=None, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + copy_X=copy_X, + max_iter=max_iter, + tol=tol, + solver=solver, + positive=positive, + random_state=random_state, + ) + + else: + + def __init__( + self, + alpha=1.0, + fit_intercept=True, + normalize="deprecated" if sklearn_check_version("1.0") else False, + copy_X=True, + max_iter=None, + tol=1e-4, + solver="auto", + positive=False, + random_state=None, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + normalize=normalize, + copy_X=copy_X, + max_iter=max_iter, + solver=solver, + tol=tol, + positive=positive, + random_state=random_state, + ) + + def fit(self, X, y, sample_weight=None): + # It is necessary to properly update coefs for predict if we + # fallback to sklearn in dispatch + if hasattr(self, "_onedal_estimator"): + del self._onedal_estimator + + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_Ridge.fit, + }, + X, + y, + sample_weight, + ) + return self + + @wrap_output_data + def predict(self, X): + check_is_fitted(self) + + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_Ridge.predict, + }, + X, + ) + + @wrap_output_data + def score(self, X, y, sample_weight=None): + check_is_fitted(self) + + return dispatch( + self, + "score", + { + "onedal": self.__class__._onedal_score, + "sklearn": sklearn_Ridge.score, + }, + X, + y, + sample_weight=sample_weight, + ) + + def _onedal_fit_supported(self, patching_status, method_name, *data): + assert method_name == "fit" + assert len(data) == 3 + X, y, sample_weight = data + + normalize_is_set = ( + hasattr(self, "normalize") + and self.normalize + and self.normalize != "deprecated" + ) + positive_is_set = hasattr(self, "positive") and self.positive + + n_samples = _num_samples(X) + n_features = _num_features(X, fallback_1d=True) + + # Check if equations are well defined + is_underdetermined = n_samples < (n_features + int(self.fit_intercept)) + + 
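            # Illustrative sketch (hypothetical shapes, not part of the committed patch):
            # the check above means the oneDAL normal-equations path is only taken when
            # n_samples >= n_features + int(fit_intercept); otherwise the failing
            # condition below makes dispatch() fall back to stock sklearn_Ridge. E.g.:
            #     n_samples, n_features, fit_intercept = 10, 12, True
            #     is_underdetermined = 10 < (12 + 1)   # True -> oneDAL is skipped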
patching_status.and_conditions( + [ + ( + self.solver == "auto", + f"'{self.solver}' solver is not supported. " + "Only 'auto' solver is supported.", + ), + ( + not issparse(X) and not issparse(y), + "Sparse input is not supported.", + ), + ( + not is_underdetermined, + "The shape of X (fitting) does not satisfy oneDAL requirements:" + "Number of features + 1 >= number of samples.", + ), + (sample_weight is None, "Sample weight is not supported."), + (not normalize_is_set, "Normalization is not supported."), + ( + not positive_is_set, + "Forced positive coefficients are not supported.", + ), + ] + ) + + return patching_status + + def _onedal_predict_supported(self, patching_status, method_name, *data): + assert method_name in ["predict", "score"] + assert len(data) <= 2 + + n_samples = _num_samples(data[0]) + model_is_sparse = issparse(self.coef_) or ( + self.fit_intercept and issparse(self.intercept_) + ) + patching_status.and_conditions( + [ + ( + self.solver == "auto", + f"'{self.solver}' solver is not supported. " + "Only 'auto' solver is supported.", + ), + (n_samples > 0, "Number of samples is less than 1."), + (not issparse(data[0]), "Sparse input is not supported."), + (not model_is_sparse, "Sparse coefficients are not supported."), + ] + ) + + return patching_status + + def _onedal_gpu_supported(self, method_name, *data): + patching_status = PatchingConditionsChain( + f"sklearn.linear_model.{self.__class__.__name__}.fit" + ) + + if method_name == "fit": + patching_status.and_condition( + _is_numeric_scalar(self.alpha), + "Non-scalar alpha is not supported for GPU.", + ) + + return self._onedal_fit_supported(patching_status, method_name, *data) + + if method_name in ["predict", "score"]: + return self._onedal_predict_supported(patching_status, method_name, *data) + + raise RuntimeError( + f"Unknown method {method_name} in {self.__class__.__name__}" + ) + + def _onedal_cpu_supported(self, method_name, *data): + patching_status = PatchingConditionsChain( + f"sklearn.linear_model.{self.__class__.__name__}.fit" + ) + + if method_name == "fit": + return self._onedal_fit_supported(patching_status, method_name, *data) + + if method_name in ["predict", "score"]: + return self._onedal_predict_supported(patching_status, method_name, *data) + + raise RuntimeError( + f"Unknown method {method_name} in {self.__class__.__name__}" + ) + + def _initialize_onedal_estimator(self): + onedal_params = { + "fit_intercept": self.fit_intercept, + "alpha": self.alpha, + "copy_X": self.copy_X, + } + self._onedal_estimator = onedal_Ridge(**onedal_params) + + def _daal_fit(self, X, y, sample_weight=None): + daal4py_fit_ridge(self, X, y, sample_weight) + self._onedal_estimator.n_features_in_ = _num_features(X, fallback_1d=True) + self._onedal_estimator.coef_ = self.coef_ + self._onedal_estimator.intercept_ = self.intercept_ + + def _onedal_fit(self, X, y, sample_weight, queue=None): + # `Sample weight` is not supported. Expected to be None value. 
+ assert sample_weight is None + + check_params = { + "X": X, + "y": y, + "dtype": [np.float64, np.float32], + "accept_sparse": ["csr", "csc", "coo"], + "y_numeric": True, + "multi_output": True, + } + if sklearn_check_version("1.2"): + X, y = self._validate_data(**check_params) + self._validate_params() + else: + X, y = check_X_y(**check_params) + + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): + self._normalize = _deprecate_normalize( + self.normalize, + default=False, + estimator_name=self.__class__.__name__, + ) + + self._initialize_onedal_estimator() + + # Falling back to daal4py if the device is CPU and alpha is array-like + # since onedal does not yet support non-scalars for alpha, thus + # should only be used for GPU/CPU with scalar alpha to not limit the functionality + cpu_device = queue is None or queue.sycl_device.is_cpu + if cpu_device and not _is_numeric_scalar(self.alpha): + self._daal_fit(X, y) + else: + self._onedal_estimator.fit(X, y, queue=queue) + + self._save_attributes() + + def _onedal_predict(self, X, queue=None): + if sklearn_check_version("1.0"): + X = self._validate_data(X, accept_sparse=False, reset=False) + + if not hasattr(self, "_onedal_estimator"): + self._initialize_onedal_estimator() + self._onedal_estimator.coef_ = self.coef_ + self._onedal_estimator.intercept_ = self.intercept_ + + res = self._onedal_estimator.predict(X, queue=queue) + return res + + def _onedal_score(self, X, y, sample_weight=None, queue=None): + return r2_score( + y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight + ) + + @property + def coef_(self): + return self._coef + + @coef_.setter + def coef_(self, value): + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.coef_ = value + # checking if the model is already fitted and if so, deleting the model + if hasattr(self._onedal_estimator, "_onedal_model"): + del self._onedal_estimator._onedal_model + self._coef = value + + @property + def intercept_(self): + return self._intercept + + @intercept_.setter + def intercept_(self, value): + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.intercept_ = value + # checking if the model is already fitted and if so, deleting the model + if hasattr(self._onedal_estimator, "_onedal_model"): + del self._onedal_estimator._onedal_model + self._intercept = value + + def _save_attributes(self): + self.n_features_in_ = self._onedal_estimator.n_features_in_ + self._coef = self._onedal_estimator.coef_ + self._intercept = self._onedal_estimator.intercept_ + + fit.__doc__ = sklearn_Ridge.fit.__doc__ + predict.__doc__ = sklearn_Ridge.predict.__doc__ + score.__doc__ = sklearn_Ridge.score.__doc__ + +else: + from daal4py.sklearn.linear_model._ridge import Ridge + from onedal._device_offload import support_usm_ndarray + + Ridge.fit = support_usm_ndarray(queue_param=False)(Ridge.fit) + Ridge.predict = support_usm_ndarray(queue_param=False)(Ridge.predict) + Ridge.score = support_usm_ndarray(queue_param=False)(Ridge.score) + + logging.warning( + "Preview Ridge requires oneDAL version >= 2024.6 but it was not found" + ) diff --git a/sklearnex/preview/linear_model/tests/test_ridge.py b/sklearnex/preview/linear_model/tests/test_ridge.py new file mode 100644 index 0000000000..b011699aaa --- /dev/null +++ b/sklearnex/preview/linear_model/tests/test_ridge.py @@ -0,0 +1,102 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import numpy +import pytest +from numpy.testing import assert_allclose +from sklearn.exceptions import NotFittedError + +from daal4py.sklearn._utils import daal_check_version +from onedal.tests.utils._dataframes_support import ( + _convert_to_dataframe, + get_dataframes_and_queues, +) + + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import_ridge(dataframe, queue): + from sklearnex.preview.linear_model import Ridge + + X = numpy.array([[1, 1], [1, 2], [2, 2], [2, 3]]) + y = numpy.dot(X, numpy.array([1, 2])) + 3 + X_c = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y_c = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) + ridge_reg = Ridge(alpha=0.5).fit(X_c, y_c) + + if daal_check_version((2024, "P", 600)): + assert "preview" in ridge_reg.__module__ + else: + assert "daal4py" in ridge_reg.__module__ + + assert_allclose(ridge_reg.intercept_, 3.86, rtol=1e-2) + assert_allclose(ridge_reg.coef_, [0.91, 1.64], rtol=1e-2) + + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("sample_size", [100, 1000]) +@pytest.mark.parametrize("feature_size", [10, 50]) +@pytest.mark.parametrize("alpha", [0.1, 0.5, 1.0]) +def test_ridge_coefficients(dataframe, queue, sample_size, feature_size, alpha): + from sklearnex.preview.linear_model import Ridge + + X = numpy.random.rand(sample_size, feature_size) + y = numpy.random.rand(sample_size) + X_c = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y_c = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) + ridge_reg = Ridge(fit_intercept=False, alpha=alpha).fit(X_c, y_c) + + # computing the coefficients manually + # using the normal equation formula: (X^T * X + lambda * I)^-1 * X^T * y + lambda_identity = alpha * numpy.eye(X.shape[1]) + inverse_term = numpy.linalg.inv(numpy.dot(X.T, X) + lambda_identity) + xt_y = numpy.dot(X.T, y) + coefficients_manual = numpy.dot(inverse_term, xt_y) + + assert_allclose(ridge_reg.coef_, coefficients_manual) + + +if daal_check_version((2024, "P", 600)): + + @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) + def test_ridge_score_before_fit(dataframe, queue): + from sklearnex.preview.linear_model import Ridge + + sample_count, feature_count = 10, 5 + + model = Ridge(fit_intercept=True, alpha=0.5) + + X, y = numpy.random.rand(sample_count, feature_count), numpy.random.rand( + sample_count + ) + X_c = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y_c = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) + + with pytest.raises(NotFittedError): + model.score(X_c, y_c) + + @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) + def test_ridge_predict_before_fit(dataframe, queue): + from sklearnex.preview.linear_model import Ridge + + sample_count, feature_count = 10, 5 + + model = Ridge(fit_intercept=True, alpha=0.5) + + X = 
numpy.random.rand(sample_count, feature_count) + X_c = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + + with pytest.raises(NotFittedError): + model.predict(X_c) diff --git a/sklearnex/tests/test_monkeypatch.py b/sklearnex/tests/test_monkeypatch.py index 52c8efa8f2..dc82a0365a 100755 --- a/sklearnex/tests/test_monkeypatch.py +++ b/sklearnex/tests/test_monkeypatch.py @@ -208,10 +208,11 @@ def get_estimators(): from sklearn.cluster import DBSCAN from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier - from sklearn.linear_model import LinearRegression + from sklearn.linear_model import LinearRegression, Ridge from sklearn.svm import SVC return ( + Ridge(), LinearRegression(), PCA(), DBSCAN(), @@ -226,9 +227,12 @@ def get_estimators(): assert _is_preview_enabled() - lr, pca, dbscan, svc, rfc = get_estimators() + ridge, lr, pca, dbscan, svc, rfc = get_estimators() assert "sklearnex" in rfc.__module__ + if daal_check_version((2024, "P", 600)): + assert "sklearnex.preview" in ridge.__module__ + if daal_check_version((2023, "P", 100)): assert "sklearnex" in lr.__module__ else: @@ -242,7 +246,8 @@ def get_estimators(): sklearnex.unpatch_sklearn() # no patching behavior - lr, pca, dbscan, svc, rfc = get_estimators() + ridge, lr, pca, dbscan, svc, rfc = get_estimators() + assert "sklearn." in ridge.__module__ and "daal4py" not in ridge.__module__ assert "sklearn." in lr.__module__ and "daal4py" not in lr.__module__ assert "sklearn." in pca.__module__ and "daal4py" not in pca.__module__ assert "sklearn." in dbscan.__module__ and "daal4py" not in dbscan.__module__ @@ -254,7 +259,10 @@ def get_estimators(): sklearnex.patch_sklearn() assert not _is_preview_enabled() - lr, pca, dbscan, svc, rfc = get_estimators() + ridge, lr, pca, dbscan, svc, rfc = get_estimators() + + assert "daal4py" in ridge.__module__ + if daal_check_version((2023, "P", 100)): assert "sklearnex" in lr.__module__ else: From 5f791029bb879261f0f3a0bbb6b397ec2c2b14b3 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:06:00 -0700 Subject: [PATCH 74/75] Update dependency cmake to v3.30.1 (#1952) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- dependencies-dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies-dev b/dependencies-dev index f7bff516ac..e107ea460a 100644 --- a/dependencies-dev +++ b/dependencies-dev @@ -3,4 +3,4 @@ Jinja2==3.1.4 numpy==1.19.5 ; python_version < '3.9' numpy==2.0.0 ; python_version >= '3.9' pybind11==2.13.1 -cmake==3.30.0 +cmake==3.30.1 From ae8a398871e6608afc013ddcabc9c70da491ce91 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:50:36 -0700 Subject: [PATCH 75/75] Update dependency pytest to v8.3.1 (#1950) * Update dependency pytest to v8.3.1 * Update requirements-test.txt --------- Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com> --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 7676434e32..a677abbb6c 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,5 @@ pytest==7.4.4 ; python_version <= '3.9' -pytest==8.2.2 ; python_version >= '3.10' +pytest==8.3.1 ; python_version >= '3.10' numpy>=1.19.5 ; python_version <= '3.9' numpy>=1.21.6 ; 
python_version == '3.10' numpy>=1.23.5 ; python_version == '3.11'