diff --git a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp index a5585202c31..33f690d2110 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp @@ -51,9 +51,10 @@ TEMPLATE_LIST_TEST_M(lr_batch_test, "LR common flow", "[lr][batch]", lr_types) { } TEMPLATE_LIST_TEST_M(lr_batch_test, "LR with non-PSD matrix", "[lr][batch-nonpsd]", lr_types) { - SKIP_IF(this->non_psd_system_not_supported_on_device()); + SKIP_IF(this->not_float64_friendly()); this->generate(777); + this->run_and_check_linear_indefinite(); this->run_and_check_linear_indefinite_multioutput(); } diff --git a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp index e1e54092552..bfba2abf9e5 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp @@ -65,10 +65,6 @@ class lr_test : public te::crtp_algo_fixture { return static_cast(this); } - bool non_psd_system_not_supported_on_device() { - return this->get_policy().is_gpu(); - } - table compute_responses(const table& beta, const table& bias, const table& data) const { const auto s_count = data.get_row_count(); @@ -299,18 +295,20 @@ class lr_test : public te::crtp_algo_fixture { here is not positive-definite, thus it has an infinite number of possible solutions. The solution here is the one with minimum norm, which is typically more desirable. */ void run_and_check_linear_indefinite(double tol = 1e-3) { - const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, - 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, - 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; - const double y[] = { 0.13632112, 1.53203308, -0.65996941 }; + const auto dtype = + std::is_same::value ? data_type::float64 : data_type::float32; + const float_t X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, + 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, + 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; + const float_t y[] = { 0.13632112, 1.53203308, -0.65996941 }; auto X_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 5) .copy_data(X, 3, 5) .build(); auto y_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 1) .copy_data(y, 3, 1) @@ -321,20 +319,20 @@ class lr_test : public te::crtp_algo_fixture { const auto coefs = train_res.get_coefficients(); if (desc.get_result_options().test(result_options::intercept)) { - const double expected_beta[] = { 0.27785494, - 0.53011669, - 0.34352259, - 0.40506216, - -1.26026447 }; - const double expected_intercept[] = { 1.24485441 }; + const float_t expected_beta[] = { 0.27785494, + 0.53011669, + 0.34352259, + 0.40506216, + -1.26026447 }; + const float_t expected_intercept[] = { 1.24485441 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 5) .copy_data(expected_beta, 1, 5) .build(); const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 1) .copy_data(expected_intercept, 1, 1) @@ -351,13 +349,13 @@ class lr_test : public te::crtp_algo_fixture { } else { - const double expected_beta[] = { 0.38217445, - 0.2732197, - 1.87135517, - 0.63458468, - -2.08473134 }; + const float_t expected_beta[] = { 0.38217445, + 0.2732197, + 1.87135517, + 0.63458468, + -2.08473134 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 5) .copy_data(expected_beta, 1, 5) @@ -369,19 +367,21 @@ class lr_test : public te::crtp_algo_fixture { } void run_and_check_linear_indefinite_multioutput(double tol = 1e-3) { - const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, - 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, - 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; - const double y[] = { 0.13632112, 1.53203308, -0.65996941, - -0.31179486, 0.33776913, -2.2074711 }; + const auto dtype = + std::is_same::value ? data_type::float64 : data_type::float32; + const float_t X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, + 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, + 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; + const float_t y[] = { 0.13632112, 1.53203308, -0.65996941, + -0.31179486, 0.33776913, -2.2074711 }; auto X_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 5) .copy_data(X, 3, 5) .build(); auto y_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 2) .copy_data(y, 3, 2) @@ -392,19 +392,19 @@ class lr_test : public te::crtp_algo_fixture { const auto coefs = train_res.get_coefficients(); if (desc.get_result_options().test(result_options::intercept)) { - const double expected_beta[] = { + const float_t expected_beta[] = { -0.18692112, -0.20034801, -0.09590892, -0.13672683, 0.56229012, -0.97006008, 1.39413595, 0.49238012, 1.11041239, -0.79213452, }; - const double expected_intercept[] = { -0.48964358, 0.96467681 }; + const float_t expected_intercept[] = { -0.48964358, 0.96467681 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(2, 5) .copy_data(expected_beta, 2, 5) .build(); const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 2) .copy_data(expected_intercept, 1, 2) @@ -421,11 +421,11 @@ class lr_test : public te::crtp_algo_fixture { } else { - const double expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585, - 0.88658098, -0.88921961, 1.19505839, 1.67634561, - 1.2882766, -1.43103981 }; + const float_t expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585, + 0.88658098, -0.88921961, 1.19505839, 1.67634561, + 1.2882766, -1.43103981 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(2, 5) .copy_data(expected_beta, 2, 5) diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp index 223f6753f69..12eb654bbe4 100644 --- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp @@ -14,9 +14,14 @@ * limitations under the License. *******************************************************************************/ +#include + +#include "oneapi/dal/backend/primitives/blas/gemm.hpp" +#include "oneapi/dal/backend/primitives/blas/gemv.hpp" #include "oneapi/dal/backend/primitives/lapack.hpp" #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/backend/primitives/ndindexer.hpp" +#include "oneapi/dal/detail/error_messages.hpp" namespace oneapi::dal::backend::primitives { @@ -59,6 +64,175 @@ inline sycl::event beta_copy_transform(sycl::queue& queue, }); } +/// This is an adaptation of the CPU version, which can be found in this file: +/// cpp/daal/src/algorithms/service_kernel_math.h +/// +/// It solves a linear system A*x = b +/// where 'b' might be a matrix (multiple RHS) +/// +/// It is intended as a fallback for solving linear regression, where these +/// matrices are obtained as follows: +/// A = t(X)*X +/// b = t(X)*y +/// +/// It can handle systems that are not positive semi-definite, so it's used +/// as a fallback when Cholesky fails or when it involves too small numbers +/// (which tends to happen when floating point error results in a slightly +/// positive matrix when it should have zero determinant in theory). +template +sycl::event solve_spectral_decomposition( + sycl::queue& queue, + ndview& A, /// t(X)*X, LHS, gets overwritten, dim=[dim_A, dim_A] (symmetric) + sycl::event& event_A, + ndview& + b, /// t(X)*y, RHS, solution is outputted here, dim=[dim_A, nrhs] in colmajor order + sycl::event& event_b, + const std::int64_t dim_A, + const std::int64_t nrhs) { + constexpr auto alloc = sycl::usm::alloc::device; + + /// Decompose: A = Q * diag(l) * t(Q) + /// Note: for NRHS>1, this will overallocate in order to reuse the memory as buffer later on + auto eigenvalues = ndarray::empty(queue, dim_A * nrhs, alloc); + sycl::event syevd_event = + syevd(queue, dim_A, A, dim_A, eigenvalues, { event_A }); + const Float eps = std::numeric_limits::epsilon(); + + /// Discard too small singular values + std::int64_t num_discarded; + { + /// This is placed inside a block because the array created here is not used any further + /// Note: the array for 'eigenvalues' is allocated with size [dimA, nrhs], + /// but by this point, only 'dimA' elements will have been written to it. + auto eigenvalues_cpu = + ndview::wrap(eigenvalues.get_data(), dim_A).to_host(queue, { syevd_event }); + const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data(); + const Float largest_ev = eigenvalues_cpu_ptr[dim_A - 1]; + if (largest_ev <= eps) { + throw internal_error{ dal::detail::error_messages::too_small_singular_values() }; + } + const Float component_threshold = eps * largest_ev; + for (num_discarded = 0; num_discarded < dim_A - 1; num_discarded++) { + if (eigenvalues_cpu_ptr[num_discarded] > component_threshold) + break; + } + } + + /// Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) + std::int64_t num_taken = dim_A - num_discarded; + auto ev_mutable = eigenvalues.get_mutable_data(); + sycl::event inv_sqrt_eigenvalues_event = queue.submit([&](sycl::handler& h) { + h.depends_on(syevd_event); + h.parallel_for(num_taken, [=](const auto& i) { + const std::size_t ix = i + num_discarded; + ev_mutable[ix] = sycl::rsqrt(ev_mutable[ix]); + }); + }); + + auto Q_mutable = A.get_mutable_data(); + sycl::event inv_sqrt_eigenvectors_event = queue.submit([&](sycl::handler& h) { + const auto range = oneapi::dal::backend::make_range_2d(num_taken, dim_A); + h.depends_on(inv_sqrt_eigenvalues_event); + h.parallel_for(range, [=](sycl::id<2> id) { + const std::size_t col = id[0] + num_discarded; + const std::size_t row = id[1]; + Q_mutable[row + col * dim_A] *= ev_mutable[col]; + }); + }); + + /// Now calculate the actual solution: Qis * Qis' * B + const std::int64_t eigenvectors_offset = num_discarded * dim_A; + if (nrhs == 1) { + auto ev_mutable_vec_view = ndview::wrap(ev_mutable, num_taken); + auto b_mutable_vec_view = ndview::wrap(b.get_mutable_data(), dim_A); + sycl::event gemv_right_event = + gemv(queue, + ndview::wrap(Q_mutable + eigenvectors_offset, + ndshape<2>(num_taken, dim_A)), + b_mutable_vec_view, + ev_mutable_vec_view, + Float(1), + Float(0), + { inv_sqrt_eigenvectors_event, event_b }); + return gemv(queue, + ndview::wrap(Q_mutable + eigenvectors_offset, + ndshape<2>(dim_A, num_taken)), + ev_mutable_vec_view, + b_mutable_vec_view, + Float(1), + Float(0), + { gemv_right_event }); + } + + else { + auto ev_mutable_mat_view = + ndview::wrap(ev_mutable, ndshape<2>(nrhs, num_taken)); + auto b_mutable_mat_view = + ndview::wrap(b.get_mutable_data(), ndshape<2>(nrhs, dim_A)); + sycl::event gemm_right_event = + gemm(queue, + b_mutable_mat_view, + ndview::wrap(Q_mutable + eigenvectors_offset, + ndshape<2>(dim_A, num_taken)), + ev_mutable_mat_view, + Float(1), + Float(0), + { inv_sqrt_eigenvectors_event, event_b }); + return gemm(queue, + ev_mutable_mat_view, + ndview::wrap(Q_mutable + eigenvectors_offset, + ndshape<2>(num_taken, dim_A)), + b_mutable_mat_view, + Float(1), + Float(0), + { gemm_right_event }); + } +} + +/// Returns the minimum value among entries in the diagonal of a square matrix +template +Float diagonal_minimum(sycl::queue& queue, + const Float* Matrix, + const std::int64_t dim_matrix, + sycl::event& event_Matrix) { + constexpr auto alloc = sycl::usm::alloc::device; + auto diag_min_holder = array::empty(queue, 1, alloc); + sycl::event diag_min_event = queue.submit([&](sycl::handler& h) { + auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), + sycl::minimum<>(), + sycl::property::reduction::initialize_to_identity()); + h.depends_on({ event_Matrix }); + h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) { + min_obj.combine(Matrix[i * (dim_matrix + 1)]); + }); + }); + return ndview::wrap(diag_min_holder).at_device(queue, 0, { diag_min_event }); +} + +template +sycl::event solve_with_fallback( + sycl::queue& queue, + const ndview& xtx, + const ndview& xty, + ndview& nxtx, + ndview& nxty, /// solution will be written here + const event_vector& dependencies) { + const std::int64_t dim_xtx = xtx.get_dimension(0); + const std::int64_t nrhs = nxty.get_dimension(0); + /// Note: this templated version of 'copy' reuses a layout that should have been + /// specified in a previous copy before the fallback. + sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies); + sycl::event xty_event_new = copy(queue, nxty, xty, dependencies); + + return solve_spectral_decomposition(queue, + nxtx, + xtx_event_new, + nxty, + xty_event_new, + dim_xtx, + nrhs); +} + template sycl::event solve_system(sycl::queue& queue, const ndview& xtx, @@ -70,12 +244,38 @@ sycl::event solve_system(sycl::queue& queue, auto [nxty, xty_event] = copy(queue, xty, dependencies); auto [nxtx, xtx_event] = copy(queue, xtx, dependencies); + const std::int64_t dim_xtx = xtx.get_dimension(0); + + sycl::event solution_event; opt_array dummy{}; - auto potrf_event = potrf_factorization(queue, nxtx, dummy, { xtx_event }); - auto potrs_event = potrs_solution(queue, nxtx, nxty, dummy, { potrf_event, xty_event }); + /// Note: this is wrapped in a try-catch block in order to catch a potential exception + /// thrown by oneMKL when the Cholesky factorization ('potrs') fails due to the matrix + /// not being positive-definite. In such case, it will then fall back to spectral + /// decomposition-based inversion, which is able to handle such problems. + try { + sycl::event potrf_event = potrf_factorization(queue, nxtx, dummy, { xtx_event }); + queue.wait_and_throw(); + const Float diag_min = diagonal_minimum(queue, nxtx.get_data(), dim_xtx, potrf_event); + /// Note: this threshold was chosen as a guess for when Cholesky factorization might + /// succeed despite the matrix in theory not being positive-definite, or succeed but + /// having too large numerical inaccuracies. In such cases, there should be too small + /// singular values that the fallback will discard, but there's no guaranteed match + /// between singular values and entries in the Cholesky diagonal. This is just a guess. + const Float threshold_diagonal_min = 1e-6; + if (diag_min > threshold_diagonal_min) { + solution_event = + potrs_solution(queue, nxtx, nxty, dummy, { potrf_event, xty_event }); + } + else { + solution_event = solve_with_fallback(queue, xtx, xty, nxtx, nxty, dependencies); + } + } + catch (mkl::lapack::computation_error& ex) { + solution_event = solve_with_fallback(queue, xtx, xty, nxtx, nxty, dependencies); + } - return beta_copy_transform(queue, nxty, final_xty, { potrs_event }); + return beta_copy_transform(queue, nxty, final_xty, { solution_event }); } #define INSTANTIATE(U, B, F, XL, YL) \ diff --git a/cpp/oneapi/dal/detail/error_messages.cpp b/cpp/oneapi/dal/detail/error_messages.cpp index 20ce68e0ef2..85cbb969099 100644 --- a/cpp/oneapi/dal/detail/error_messages.cpp +++ b/cpp/oneapi/dal/detail/error_messages.cpp @@ -303,6 +303,8 @@ MSG(input_y_is_empty, "Input y is empty") /* Linear Regression */ MSG(intercept_result_option_requires_intercept_flag, "Intercept result option requires intercept flag") +MSG(too_small_singular_values, + "Linear system has too small singular values, cannot calculate a solution") /* Logistic Regression */ MSG(class_count_neq_two, diff --git a/cpp/oneapi/dal/detail/error_messages.hpp b/cpp/oneapi/dal/detail/error_messages.hpp index dbb9c8ba6ec..926e89321b0 100644 --- a/cpp/oneapi/dal/detail/error_messages.hpp +++ b/cpp/oneapi/dal/detail/error_messages.hpp @@ -242,6 +242,7 @@ class ONEDAL_EXPORT error_messages { /* Linear Regression */ MSG(intercept_result_option_requires_intercept_flag); + MSG(too_small_singular_values); /* Logistic Regression */ MSG(class_count_neq_two);