TEST: using get_dataframes_and_queues instead of get_queues in onedal4py testing #1909

Open. Wants to merge 19 commits into base: main.

Commits (19):
ef7a284  TEST: using get_dataframes_and_queues instead of get_queues (samir-nasibli, Jul 2, 2024)
614581d  added docstrings to tests utilities (samir-nasibli, Jun 28, 2024)
1074829  minor update for onedal/tests/utils/_dataframes_support.py (samir-nasibli, Jul 2, 2024)
d0e1a89  update tests for BasicStatistics and covarience (samir-nasibli, Jul 4, 2024)
2249ce1  fix test_on_gold_data_unbiased (samir-nasibli, Jul 4, 2024)
7d9e517  Merge branch 'intel:main' into maint/onedal4py_testing (samir-nasibli, Jul 4, 2024)
b9847af  Merge branch 'intel:main' into maint/onedal4py_testing (samir-nasibli, Jul 4, 2024)
af1e5ac  update tests for onedal4py inc PCA (samir-nasibli, Jul 5, 2024)
e6e8f62  Merge branch 'intel:main' into maint/onedal4py_testing (samir-nasibli, Jul 8, 2024)
8925390  Merge branch 'main' into maint/onedal4py_testing (samir-nasibli, Jul 12, 2024)
d6dbaa4  Merge branch 'intel:main' into maint/onedal4py_testing (samir-nasibli, Jul 26, 2024)
acd1b0f  Merge branch 'intel:main' into maint/onedal4py_testing (samir-nasibli, Aug 5, 2024)
6f5aff6  Merge branch 'intel:main' into maint/onedal4py_testing (samir-nasibli, Aug 8, 2024)
1b1927f  Merge branch 'intel:main' into maint/onedal4py_testing (samir-nasibli, Aug 11, 2024)
be36a3f  Merge branch 'main' into maint/onedal4py_testing (samir-nasibli, Sep 24, 2024)
690f58c  Merge branch 'main' into maint/onedal4py_testing (samir-nasibli, Dec 10, 2024)
fb3c906  added comments for developers (samir-nasibli, Dec 10, 2024)
c1da9fc  codestyle (samir-nasibli, Dec 11, 2024)
ce55d35  fix test_incremental_pca (samir-nasibli, Dec 11, 2024)
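
Before the per-file diffs, here is the recurring pattern this PR applies, condensed from the new version of test_onedal_import_covariance shown in the first diff below: tests are parametrized over (dataframe, queue) pairs instead of queues alone, inputs are converted with _convert_to_dataframe, and estimator attributes are mapped back to numpy with _as_numpy before comparison. This is a restatement of the diff, not additional API.

import numpy as np
import pytest
from numpy.testing import assert_allclose

from onedal.tests.utils._dataframes_support import (
    _as_numpy,
    _convert_to_dataframe,
    get_dataframes_and_queues,
)


# Parametrize over (dataframe, queue) pairs; "np_sycl" pairs plain numpy
# inputs with a SYCL queue for device offloading.
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues("numpy,np_sycl"))
def test_onedal_import_covariance(dataframe, queue):
    from onedal.covariance import EmpiricalCovariance

    X = np.array([[0, 1], [0, 1]])
    # Convert the input to the target dataframe, optionally tied to the queue.
    X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)
    result = EmpiricalCovariance().fit(X, queue=queue)

    # Convert results back to numpy before comparing against the expectations.
    assert_allclose(np.array([[0, 0], [0, 0]]), _as_numpy(result.covariance_))
    assert_allclose(np.array([0, 1]), _as_numpy(result.location_))
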
25 changes: 16 additions & 9 deletions onedal/covariance/tests/test_covariance.py
@@ -18,33 +18,40 @@
import pytest
from numpy.testing import assert_allclose

from onedal.tests.utils._device_selection import get_queues
from onedal.tests.utils._dataframes_support import (
_as_numpy,
_convert_to_dataframe,
get_dataframes_and_queues,
)


@pytest.mark.parametrize("queue", get_queues())
def test_onedal_import_covariance(queue):
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues("numpy,np_sycl"))
def test_onedal_import_covariance(dataframe, queue):
from onedal.covariance import EmpiricalCovariance

X = np.array([[0, 1], [0, 1]])
X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)
result = EmpiricalCovariance().fit(X, queue=queue)
expected_covariance = np.array([[0, 0], [0, 0]])
expected_means = np.array([0, 1])

assert_allclose(expected_covariance, result.covariance_)
assert_allclose(expected_means, result.location_)
assert_allclose(expected_covariance, _as_numpy(result.covariance_))
assert_allclose(expected_means, _as_numpy(result.location_))

X = np.array([[1, 2], [3, 6]])
X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)
result = EmpiricalCovariance().fit(X, queue=queue)
expected_covariance = np.array([[2, 4], [4, 8]])
expected_means = np.array([2, 4])

assert_allclose(expected_covariance, result.covariance_)
assert_allclose(expected_means, result.location_)
assert_allclose(expected_covariance, _as_numpy(result.covariance_))
assert_allclose(expected_means, _as_numpy(result.location_))

X = np.array([[1, 2], [3, 6]])
X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)
result = EmpiricalCovariance(bias=True).fit(X, queue=queue)
expected_covariance = np.array([[1, 2], [2, 4]])
expected_means = np.array([2, 4])

assert_allclose(expected_covariance, result.covariance_)
assert_allclose(expected_means, result.location_)
assert_allclose(expected_covariance, _as_numpy(result.covariance_))
assert_allclose(expected_means, _as_numpy(result.location_))
62 changes: 41 additions & 21 deletions onedal/covariance/tests/test_incremental_covariance.py
@@ -19,12 +19,17 @@
from numpy.testing import assert_allclose

from onedal.datatypes import from_table
from onedal.tests.utils._dataframes_support import (
_as_numpy,
_convert_to_dataframe,
get_dataframes_and_queues,
)
from onedal.tests.utils._device_selection import get_queues


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues("numpy,np_sycl"))
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_on_gold_data_unbiased(queue, dtype):
def test_on_gold_data_unbiased(dataframe, queue, dtype):
from onedal.covariance import IncrementalEmpiricalCovariance

X = np.array([[0, 1], [0, 1]])
@@ -33,34 +38,40 @@ def test_on_gold_data_unbiased(queue, dtype):
inccov = IncrementalEmpiricalCovariance()

for i in range(2):
inccov.partial_fit(X_split[i], queue=queue)
X_split_i = _convert_to_dataframe(
X_split[i], sycl_queue=queue, target_df=dataframe
)
inccov.partial_fit(X_split_i, queue=queue)
result = inccov.finalize_fit()

expected_covariance = np.array([[0, 0], [0, 0]])
expected_means = np.array([0, 1])

assert_allclose(expected_covariance, result.covariance_)
assert_allclose(expected_means, result.location_)
assert_allclose(expected_covariance, _as_numpy(result.covariance_))
assert_allclose(expected_means, _as_numpy(result.location_))

X = np.array([[1, 2], [3, 6]])
X = X.astype(dtype)
X_split = np.array_split(X, 2)
inccov = IncrementalEmpiricalCovariance()

for i in range(2):
inccov.partial_fit(X_split[i], queue=queue)
X_split_i = _convert_to_dataframe(
X_split[i], sycl_queue=queue, target_df=dataframe
)
inccov.partial_fit(X_split_i, queue=queue)
result = inccov.finalize_fit()

expected_covariance = np.array([[2, 4], [4, 8]])
expected_means = np.array([2, 4])

assert_allclose(expected_covariance, result.covariance_)
assert_allclose(expected_means, result.location_)
assert_allclose(expected_covariance, _as_numpy(result.covariance_))
assert_allclose(expected_means, _as_numpy(result.location_))


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues("numpy,np_sycl"))
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_on_gold_data_biased(queue, dtype):
def test_on_gold_data_biased(dataframe, queue, dtype):
from onedal.covariance import IncrementalEmpiricalCovariance

X = np.array([[0, 1], [0, 1]])
@@ -69,39 +80,45 @@ def test_on_gold_data_biased(queue, dtype):
inccov = IncrementalEmpiricalCovariance(bias=True)

for i in range(2):
inccov.partial_fit(X_split[i], queue=queue)
X_split_i = _convert_to_dataframe(
X_split[i], sycl_queue=queue, target_df=dataframe
)
inccov.partial_fit(X_split_i, queue=queue)
result = inccov.finalize_fit()

expected_covariance = np.array([[0, 0], [0, 0]])
expected_means = np.array([0, 1])

assert_allclose(expected_covariance, result.covariance_)
assert_allclose(expected_means, result.location_)
assert_allclose(expected_covariance, _as_numpy(result.covariance_))
assert_allclose(expected_means, _as_numpy(result.location_))

X = np.array([[1, 2], [3, 6]])
X = X.astype(dtype)
X_split = np.array_split(X, 2)
inccov = IncrementalEmpiricalCovariance(bias=True)

for i in range(2):
inccov.partial_fit(X_split[i], queue=queue)
X_split_i = _convert_to_dataframe(
X_split[i], sycl_queue=queue, target_df=dataframe
)
inccov.partial_fit(X_split_i, queue=queue)
result = inccov.finalize_fit()

expected_covariance = np.array([[1, 2], [2, 4]])
expected_means = np.array([2, 4])

assert_allclose(expected_covariance, result.covariance_)
assert_allclose(expected_means, result.location_)
assert_allclose(expected_covariance, _as_numpy(result.covariance_))
assert_allclose(expected_means, _as_numpy(result.location_))


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues("numpy,np_sycl"))
@pytest.mark.parametrize("num_batches", [2, 4, 6, 8, 10])
@pytest.mark.parametrize("row_count", [100, 1000, 2000])
@pytest.mark.parametrize("column_count", [10, 100, 200])
@pytest.mark.parametrize("bias", [True, False])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_partial_fit_on_random_data(
queue, num_batches, row_count, column_count, bias, dtype
dataframe, queue, num_batches, row_count, column_count, bias, dtype
):
from onedal.covariance import IncrementalEmpiricalCovariance

@@ -113,14 +130,17 @@ def test_partial_fit_on_random_data(
inccov = IncrementalEmpiricalCovariance(bias=bias)

for i in range(num_batches):
inccov.partial_fit(X_split[i], queue=queue)
X_split_i = _convert_to_dataframe(
X_split[i], sycl_queue=queue, target_df=dataframe
)
inccov.partial_fit(X_split_i, queue=queue)
result = inccov.finalize_fit()

expected_covariance = np.cov(X.T, bias=bias)
expected_means = np.mean(X, axis=0)

assert_allclose(expected_covariance, result.covariance_, atol=1e-6)
assert_allclose(expected_means, result.location_, atol=1e-6)
assert_allclose(expected_covariance, _as_numpy(result.covariance_), atol=1e-6)
assert_allclose(expected_means, _as_numpy(result.location_), atol=1e-6)


@pytest.mark.parametrize("queue", get_queues())
76 changes: 52 additions & 24 deletions onedal/decomposition/tests/test_incremental_pca.py
@@ -21,25 +21,34 @@
from daal4py.sklearn._utils import daal_check_version
from onedal.datatypes import from_table
from onedal.decomposition import IncrementalPCA
from onedal.tests.utils._dataframes_support import (
_as_numpy,
_convert_to_dataframe,
get_dataframes_and_queues,
)
from onedal.tests.utils._device_selection import get_queues


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues("numpy,np_sycl"))
@pytest.mark.parametrize("is_deterministic", [True, False])
@pytest.mark.parametrize("whiten", [True, False])
@pytest.mark.parametrize("num_blocks", [1, 2, 3])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_on_gold_data(queue, is_deterministic, whiten, num_blocks, dtype):
def test_on_gold_data(dataframe, queue, is_deterministic, whiten, num_blocks, dtype):
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
X = X.astype(dtype=dtype)
X_split = np.array_split(X, num_blocks)
incpca = IncrementalPCA(is_deterministic=is_deterministic, whiten=whiten)

for i in range(num_blocks):
incpca.partial_fit(X_split[i], queue=queue)
X_split_i = _convert_to_dataframe(
X_split[i], sycl_queue=queue, target_df=dataframe
)
incpca.partial_fit(X_split_i, queue=queue)

result = incpca.finalize_fit()

X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)
transformed_data = incpca.predict(X, queue=queue)

expected_n_components_ = 2
@@ -73,21 +82,28 @@ def test_on_gold_data(queue, is_deterministic, whiten, num_blocks, dtype):
)
)

transformed_data = _as_numpy(transformed_data)
tol = 1e-7
if transformed_data.dtype == np.float32:
tol = 7e-6 if whiten else 1e-6

assert result.n_components_ == expected_n_components_

assert_allclose(result.singular_values_, expected_singular_values_, atol=tol)
assert_allclose(result.mean_, expected_mean_, atol=tol)
assert_allclose(result.var_, expected_var_, atol=tol)
assert_allclose(result.explained_variance_, expected_explained_variance_, atol=tol)
assert_allclose(
result.explained_variance_ratio_, expected_explained_variance_ratio_, atol=tol
_as_numpy(result.singular_values_), expected_singular_values_, atol=tol
)
assert_allclose(_as_numpy(result.mean_), expected_mean_, atol=tol)
assert_allclose(_as_numpy(result.var_), expected_var_, atol=tol)
assert_allclose(
_as_numpy(result.explained_variance_), expected_explained_variance_, atol=tol
)
assert_allclose(
_as_numpy(result.explained_variance_ratio_),
expected_explained_variance_ratio_,
atol=tol,
)
if is_deterministic and daal_check_version((2024, "P", 500)):
assert_allclose(result.components_, expected_components_, atol=tol)
assert_allclose(_as_numpy(result.components_), expected_components_, atol=tol)
assert_allclose(transformed_data, expected_transformed_data, atol=tol)
else:
for i in range(result.n_components_):
@@ -106,41 +122,49 @@ def test_on_gold_data(queue, is_deterministic, whiten, num_blocks, dtype):
)


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues("numpy,np_sycl"))
@pytest.mark.parametrize("n_components", [None, 1, 5])
@pytest.mark.parametrize("whiten", [True, False])
@pytest.mark.parametrize("num_blocks", [1, 10])
@pytest.mark.parametrize("row_count", [100, 1000])
@pytest.mark.parametrize("column_count", [10, 100])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_on_random_data(
queue, n_components, whiten, num_blocks, row_count, column_count, dtype
dataframe, queue, n_components, whiten, num_blocks, row_count, column_count, dtype
):
seed = 78
gen = np.random.default_rng(seed)
X = gen.uniform(low=-0.3, high=+0.7, size=(row_count, column_count))
X = X.astype(dtype=dtype)
X_split = np.array_split(X, num_blocks)

expected_n_samples_seen = X.shape[0]
expected_n_features_in = X.shape[1]

incpca = IncrementalPCA(n_components=n_components, whiten=whiten)

for i in range(num_blocks):
incpca.partial_fit(X_split[i], queue=queue)
X_split_i = _convert_to_dataframe(
X_split[i], sycl_queue=queue, target_df=dataframe
)
incpca.partial_fit(X_split_i, queue=queue)

incpca.finalize_fit()

X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)
transformed_data = incpca.predict(X, queue=queue)

transformed_data = _as_numpy(transformed_data)
tol = 3e-3 if transformed_data.dtype == np.float32 else 2e-6

n_components = incpca.n_components_
expected_n_samples_seen = X.shape[0]
expected_n_features_in = X.shape[1]
n_samples_seen = incpca.n_samples_seen_
n_features_in = incpca.n_features_in_
n_components = _as_numpy(incpca.n_components_)
n_samples_seen = _as_numpy(incpca.n_samples_seen_)
n_features_in = _as_numpy(incpca.n_features_in_)
assert n_samples_seen == expected_n_samples_seen
assert n_features_in == expected_n_features_in

components = incpca.components_
singular_values = incpca.singular_values_
components = _as_numpy(incpca.components_)
singular_values = _as_numpy(incpca.singular_values_)
centered_data = X - np.mean(X, axis=0)
cov_eigenvalues, cov_eigenvectors = np.linalg.eig(
centered_data.T @ centered_data / (n_samples_seen - 1)
@@ -163,19 +187,23 @@ def test_on_random_data(
assert np.abs(abs_dot_product - 1.0) < tol

expected_mean = np.mean(X, axis=0)
assert_allclose(incpca.mean_, expected_mean, atol=tol)
assert_allclose(_as_numpy(incpca.mean_), expected_mean, atol=tol)

expected_var_ = np.var(X, ddof=1, axis=0)
assert_allclose(incpca.var_, expected_var_, atol=tol)
assert_allclose(_as_numpy(incpca.var_), expected_var_, atol=tol)

expected_explained_variance = sorted_eigenvalues[:n_components]
assert_allclose(incpca.explained_variance_, expected_explained_variance, atol=tol)
assert_allclose(
_as_numpy(incpca.explained_variance_), expected_explained_variance, atol=tol
)

expected_explained_variance_ratio = expected_explained_variance / np.sum(
sorted_eigenvalues
)
assert_allclose(
incpca.explained_variance_ratio_, expected_explained_variance_ratio, atol=tol
_as_numpy(incpca.explained_variance_ratio_),
expected_explained_variance_ratio,
atol=tol,
)

expected_noise_variance = (
@@ -188,7 +216,7 @@ def test_on_random_data(

expected_transformed_data = centered_data @ components.T
if whiten:
scale = np.sqrt(incpca.explained_variance_)
scale = np.sqrt(_as_numpy(incpca.explained_variance_))
min_scale = np.finfo(scale.dtype).eps
scale[scale < min_scale] = np.inf
expected_transformed_data /= scale
8 changes: 7 additions & 1 deletion onedal/tests/utils/_dataframes_support.py
@@ -84,6 +84,7 @@ def get_dataframes_and_queues(
dataframes_and_queues = []

if "numpy" in dataframe_filter_:
# sycl queue param is None.
dataframes_and_queues.append(pytest.param("numpy", None, id="numpy"))
if "pandas" in dataframe_filter_:
dataframes_and_queues.append(pytest.param("pandas", None, id="pandas"))
@@ -96,6 +97,11 @@ def get_df_and_q(dataframe: str):
df_and_q.append(pytest.param(dataframe, queue.values[0], id=id))
return df_and_q

if "np_sycl" in dataframe_filter_:
# sycl queue param is not None.
# Designed for interfaces that utilize NumPy inputs with a DPCTL queue,
# enabling offloading to specific SYCL devices.
dataframes_and_queues.extend(get_df_and_q("numpy_and_queue"))
Comment on lines +100 to +104
samir-nasibli (Contributor, Author):
@icfaust

1. What do you think about the naming and this approach specifically? Does it make sense to leave only numpy_and_queue instead of np_sycl?

2. Maybe it is better to use a param: numpy_with_queue=True

I mean:

get_dataframes_and_queues("numpy", numpy_and_queue=True)

instead of:

get_dataframes_and_queues("numpy,np_sycl")

samir-nasibli (Contributor, Author):

The second option could be generalized to all dataframes that don't support sycl_queue themselves but for which a sycl_queue is provided.
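
For illustration, a rough sketch of the keyword-argument variant being discussed; the numpy_and_queue flag and its wiring here are assumptions, not code from this PR (whose diff resumes below):

import pytest

from onedal.tests.utils._device_selection import get_queues


def get_dataframes_and_queues(dataframe_filter_="numpy", numpy_and_queue=False):
    dataframes_and_queues = []
    if "numpy" in dataframe_filter_:
        if numpy_and_queue:
            # Pair plain numpy inputs with each available SYCL queue so a test
            # can offload to a specific device while still passing ndarrays.
            for queue_param in get_queues():
                dataframes_and_queues.append(
                    pytest.param(
                        "numpy", queue_param.values[0], id=f"numpy-{queue_param.id}"
                    )
                )
        else:
            # Plain numpy with no queue: host execution.
            dataframes_and_queues.append(pytest.param("numpy", None, id="numpy"))
    return dataframes_and_queues

A test would then request the pairing explicitly, e.g. get_dataframes_and_queues("numpy", numpy_and_queue=True), rather than passing "np_sycl" in the filter string.
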

if dpctl_available and "dpctl" in dataframe_filter_:
dataframes_and_queues.extend(get_df_and_q("dpctl"))
if dpnp_available and "dpnp" in dataframe_filter_:
@@ -127,7 +133,7 @@ def _convert_to_dataframe(obj, sycl_queue=None, target_df=None, *args, **kwargs)
"""Converted input object to certain dataframe format."""
if target_df is None:
return obj
elif target_df == "numpy":
elif target_df in "numpy,numpy_and_queue":
# Numpy ndarray.
# `sycl_queue` arg is ignored.
return np.asarray(obj, *args, **kwargs)