From 4a558770405381d96d806abb860e2b976239b6dc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 6 Nov 2024 17:06:03 +0100 Subject: [PATCH 001/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - introduce new function __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 151 +++++++++++++++--- 1 file changed, 127 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 753e32816a0..ea0c03d3365 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -45,39 +45,142 @@ namespace __par_backend_hetero // | ----> // 3 | 0 0 0 0 0 | template -auto -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) +std::pair<_Index, _Index> +__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, + const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); + assert(__rng1_from <= __rng1_to); + assert(__rng2_from <= __rng2_to); + + assert(__rng1_to > 0 || __rng2_to > 0); - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed + if constexpr (!std::is_pointer_v<_Rng1>) + assert(__rng1_to <= __rng1.size()); + if constexpr (!std::is_pointer_v<_Rng2>) + assert(__rng2_to <= __rng2.size()); + + assert(__i_elem >= 0); + + // ----------------------- EXAMPLE ------------------------ + // Let's consider the following input data: + // rng1.size() = 10 + // rng2.size() = 6 + // i_diag = 9 + // Let's define the following ranges for processing: + // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 + // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 + // + // The goal: required to process only X' items of the merge matrix + // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) + // + // -------------------------------------------------------- + // + // __diag_it_begin(rng1) __diag_it_end(rng1) + // (init state) (dest state) (init state, dest state) + // | | | + // V V V + // + + + + + + + // \ rng1 0 1 2 3 4 5 6 7 8 9 + // rng2 +--------------------------------------+ + // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) + // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) + // + 2 | <----------------- + X'1 | | + // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) + // 4 | X ^ | | + // 5 | X | | | <--- __diag_it_begin(rng2) (init state) + // +-------AX-----------+-----------+-----+ + // AX | | + // AX | | + // Run lower_bound:[from = 5, to = 8) + // + // AX - absent items in rng2 + // + // We have three points on diagonal for call comparison: + // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 + // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 + // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 + // - where for every 
comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 + + //////////////////////////////////////////////////////////////////////////////////// + // Process the corner case: for the first diagonal with the index 0 split point + // is equal to (0, 0) regardless of the size and content of the data. + if (__i_elem > 0) { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(*__res, __q - *__res); + //////////////////////////////////////////////////////////////////////////////////// + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; + + using _IndexSigned = std::make_signed_t<_Index>; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + assert(idx1_from <= idx1_to); + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + assert(idx2_from <= idx2_to); + + const _IndexSigned idx2_from_diff = idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? 
idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + assert(idx1_from <= idx1_to); + assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); + + assert(idx2_from <= idx2_to); + assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + constexpr int kValue = 1; + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + const auto __rng1_idx = __idx; + const auto __rng2_idx = __index_sum - __idx; + + assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); + assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); + assert(__rng1_idx + __rng2_idx == __index_sum); + + const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); + return __zero_or_one < kValue; + }); + + const std::pair<_Index, _Index> __result = std::make_pair(*__res, __index_sum - *__res + 1); + assert(__result.first + __result.second == __i_elem); + + assert(__rng1_from <= __result.first && __result.first <= __rng1_to); + assert(__rng2_from <= __result.second && __result.second <= __rng2_to); + + return __result; } else { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(__q + *__res, __n2 - *__res); + 
assert(__rng1_from == 0); + assert(__rng2_from == 0); + return std::make_pair(__rng1_from, __rng2_from); } } +template +std::pair<_Index, _Index> +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, + const _Index __n2, _Compare __comp) +{ + return __find_start_point_in(__rng1, (_Index)0, __n1, __rng2, (_Index)0, __n2, __i_elem, __comp); +} + // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing // to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) template From 6553c46ad9302b87915e932643e037816ef7c8e8 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 6 Nov 2024 17:07:11 +0100 Subject: [PATCH 002/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - introduce __parallel_merge_submitter_large for merge of biggest data sizes Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ea0c03d3365..10ddf37fc50 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -232,10 +232,16 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } +template +class _find_split_points_kernel_on_mid_diagonal; + // Please see the comment for __parallel_for_submitter for optional kernel name explanation template struct __parallel_merge_submitter; +template +struct __parallel_merge_submitter_large; + template struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> { @@ -269,6 +275,107 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N } }; +template +struct 
__parallel_merge_submitter_large<_IdType, __internal::__optional_kernel_name<_Name...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + using _FindSplitPointsKernelOnMidDiagonal = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _find_split_points_kernel_on_mid_diagonal, _CustomName, _Range1, _Range2, _IdType, _Compare>; + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + const _IdType __base_diag_count = 1'024 * 32; + const _IdType __base_diag_part = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + using _split_point_t = std::pair<_IdType, _IdType>; + + using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t>; + __result_and_scratch_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; + + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __scratch_acc = __result_and_scratch.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_FindSplitPointsKernelOnMidDiagonal>( + sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) + { + auto __global_idx = __item_id.get_linear_id(); + auto __scratch_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); + + if (__global_idx == 0) + { + __scratch_ptr[0] = std::make_pair((_IdType)0, (_IdType)0); + } + else if 
(__global_idx == __base_diag_count) + { + __scratch_ptr[__base_diag_count] = std::make_pair(__n1, __n2); + } + else + { + const _IdType __i_elem = __global_idx * __base_diag_part * __chunk; + __scratch_ptr[__global_idx] = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + } + }); + }); + + __event = __exec.queue().submit([&](sycl::handler& __cgh) { + + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + auto __scratch_acc = __result_and_scratch.template __get_scratch_acc(__cgh); + + __cgh.depends_on(__event); + + __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __chunk; + + auto __scratch_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); + auto __scratch_idx = __global_idx / __base_diag_part; + + _split_point_t __start; + if (__global_idx % __base_diag_part != 0) + { + // Check that we fit into size of scratch + assert(__scratch_idx + 1 < __base_diag_count + 1); + + const _split_point_t __sp_left = __scratch_ptr[__scratch_idx]; + const _split_point_t __sp_right = __scratch_ptr[__scratch_idx + 1]; + + __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, + __rng2, __sp_left.second, __sp_right.second, + __i_elem, __comp); + } + else + { + __start = __scratch_ptr[__scratch_idx]; + } + + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); + }); + return __future(__event); + } +}; + template class __merge_kernel_name; From 6443f2e903d5e640b3f81b93137003e0fc101e75 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 6 Nov 2024 17:08:24 +0100 Subject: [PATCH 003/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using __parallel_merge_submitter_large for merge data equal or greater then 4M items Signed-off-by: Sergey Kopienko --- 
.../dpcpp/parallel_backend_sycl_merge.h | 77 +++++++++++++------ 1 file changed, 54 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 10ddf37fc50..56fa56aeaec 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -232,14 +232,11 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } -template -class _find_split_points_kernel_on_mid_diagonal; - // Please see the comment for __parallel_for_submitter for optional kernel name explanation template struct __parallel_merge_submitter; -template +template struct __parallel_merge_submitter_large; template @@ -275,8 +272,14 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N } }; -template -struct __parallel_merge_submitter_large<_IdType, __internal::__optional_kernel_name<_Name...>> +template +class _find_split_points_kernel_on_mid_diagonal_uint32_t; + +template +class _find_split_points_kernel_on_mid_diagonal_uint64_t; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_Name...>> { template auto @@ -290,11 +293,12 @@ struct __parallel_merge_submitter_large<_IdType, __internal::__optional_kernel_n _PRINT_INFO_IN_DEBUG_MODE(__exec); - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - using _FindSplitPointsKernelOnMidDiagonal = + using _FindSplitPointsKernelOnMidDiagonal = std::conditional_t< + std::is_same_v<_IdType, std::uint32_t>, + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _find_split_points_kernel_on_mid_diagonal_uint32_t, _CustomName, _Range1, _Range2, _IdType, _Compare>, oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _find_split_points_kernel_on_mid_diagonal, 
_CustomName, _Range1, _Range2, _IdType, _Compare>; + _find_split_points_kernel_on_mid_diagonal_uint64_t, _CustomName, _Range1, _Range2, _IdType, _Compare>>; // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; @@ -379,6 +383,9 @@ struct __parallel_merge_submitter_large<_IdType, __internal::__optional_kernel_n template class __merge_kernel_name; +template +class __merge_kernel_name_large; + template auto __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, @@ -387,23 +394,47 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; const auto __n = __rng1.size() + __rng2.size(); - if (__n <= std::numeric_limits::max()) + if (__n < 4 * 1'048'576) { - using _WiIndex = std::uint32_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + 
std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } else { - using _WiIndex = std::uint64_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _MergeKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _MergeKernelLarge>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _MergeKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _MergeKernelLarge>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } } From 4c3422b99c60b896211292a8899ea913801fec51 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 7 Nov 2024 15:07:12 +0100 Subject: [PATCH 004/144] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 56fa56aeaec..bbbdeb2a6c7 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -120,7 +120,8 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn _IndexSigned idx2_to = __index_sum - __rng1_from + 1; assert(idx2_from <= idx2_to); - const _IndexSigned idx2_from_diff = idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; idx1_to -= idx2_from_diff; @@ -313,16 +314,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti __result_and_scratch_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __scratch_acc = __result_and_scratch.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_FindSplitPointsKernelOnMidDiagonal>( - sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) - { + sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); - auto __scratch_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); + auto __scratch_ptr = + __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); if (__global_idx == 0) { @@ -341,7 +341,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti }); __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __scratch_acc = __result_and_scratch.template __get_scratch_acc(__cgh); @@ -360,12 +359,11 @@ struct __parallel_merge_submitter_large<_IdType, 
_CustomName, __internal::__opti // Check that we fit into size of scratch assert(__scratch_idx + 1 < __base_diag_count + 1); - const _split_point_t __sp_left = __scratch_ptr[__scratch_idx]; + const _split_point_t __sp_left = __scratch_ptr[__scratch_idx]; const _split_point_t __sp_right = __scratch_ptr[__scratch_idx + 1]; - __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, - __rng2, __sp_left.second, __sp_right.second, - __i_elem, __comp); + __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, __sp_left.second, + __sp_right.second, __i_elem, __comp); } else { From afca75a82aa2208aebee673ee987b3f615c3eb78 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 8 Nov 2024 09:39:30 +0100 Subject: [PATCH 005/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix compile error Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index bbbdeb2a6c7..34c8e962765 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -294,12 +294,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti _PRINT_INFO_IN_DEBUG_MODE(__exec); - using _FindSplitPointsKernelOnMidDiagonal = std::conditional_t< - std::is_same_v<_IdType, std::uint32_t>, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _find_split_points_kernel_on_mid_diagonal_uint32_t, _CustomName, _Range1, _Range2, _IdType, _Compare>, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _find_split_points_kernel_on_mid_diagonal_uint64_t, _CustomName, _Range1, _Range2, _IdType, _Compare>>; + using _FindSplitPointsOnMidDiagonalKernel = + 
std::conditional_t, + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _find_split_points_kernel_on_mid_diagonal_uint32_t, _CustomName, _ExecutionPolicy, + _Range1, _Range2, _Range3, _Compare>, + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _find_split_points_kernel_on_mid_diagonal_uint64_t, _CustomName, _ExecutionPolicy, + _Range1, _Range2, _Range3, _Compare>>; // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; @@ -318,7 +320,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti auto __scratch_acc = __result_and_scratch.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); - __cgh.parallel_for<_FindSplitPointsKernelOnMidDiagonal>( + __cgh.parallel_for<_FindSplitPointsOnMidDiagonalKernel>( sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); auto __scratch_ptr = From 3d3fb7d9781234b442579d675bc49bd7a4a447ff Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 8 Nov 2024 20:17:31 +0100 Subject: [PATCH 006/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix Kernel names Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 34c8e962765..9aa8c79e011 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -237,9 +237,6 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter_large; - template struct __parallel_merge_submitter<_IdType, 
__internal::__optional_kernel_name<_Name...>> { @@ -273,14 +270,13 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N } }; -template -class _find_split_points_kernel_on_mid_diagonal_uint32_t; - -template -class _find_split_points_kernel_on_mid_diagonal_uint64_t; +template +struct __parallel_merge_submitter_large; -template -struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_Name...>> +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> { template auto @@ -294,15 +290,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti _PRINT_INFO_IN_DEBUG_MODE(__exec); - using _FindSplitPointsOnMidDiagonalKernel = - std::conditional_t, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _find_split_points_kernel_on_mid_diagonal_uint32_t, _CustomName, _ExecutionPolicy, - _Range1, _Range2, _Range3, _Compare>, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _find_split_points_kernel_on_mid_diagonal_uint64_t, _CustomName, _ExecutionPolicy, - _Range1, _Range2, _Range3, _Compare>>; - // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; @@ -320,7 +307,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti auto __scratch_acc = __result_and_scratch.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); - __cgh.parallel_for<_FindSplitPointsOnMidDiagonalKernel>( + __cgh.parallel_for<_DiagonalsKernelName...>( sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); auto __scratch_ptr = @@ -348,7 +335,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti __cgh.depends_on(__event); - __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __chunk; @@ -383,6 +371,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti template class __merge_kernel_name; +template +class __diagonals_kernel_name; + template class __merge_kernel_name_large; @@ -420,18 +411,22 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy if (__n <= std::numeric_limits::max()) { using _WiIndex = std::uint32_t; - using _MergeKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _MergeKernelLarge>()( + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { 
using _WiIndex = std::uint64_t; - using _MergeKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _MergeKernelLarge>()( + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } From 80cfc42f6401937418d3c878cc8bdb4c8259398a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 8 Nov 2024 20:20:16 +0100 Subject: [PATCH 007/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename template parameter names in __parallel_merge_submitter Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9aa8c79e011..758c09de9ad 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -234,11 +234,11 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } // Please see the comment for __parallel_for_submitter for optional kernel name explanation -template +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> +template +struct __parallel_merge_submitter<_IdType, 
__internal::__optional_kernel_name<_MergeKernelName...>> { template auto @@ -259,12 +259,13 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); }); return __future(__event); } @@ -390,18 +391,18 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy if (__n <= std::numeric_limits::max()) { using _WiIndex = std::uint32_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { using _WiIndex = std::uint64_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< 
__merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } From d9377f3c0a69d3f45c57d5ff3da4f3faa093b88a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 11 Nov 2024 12:28:28 +0100 Subject: [PATCH 008/144] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 758c09de9ad..0c341f31de6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -338,32 +338,33 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __cgh.parallel_for<_MergeKernelName...>( sycl::range(__steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __chunk; + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __chunk; - auto __scratch_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); - auto __scratch_idx = __global_idx / __base_diag_part; + auto __scratch_ptr = + __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); + auto __scratch_idx = __global_idx / __base_diag_part; - _split_point_t __start; - if (__global_idx % __base_diag_part != 0) - { - // Check that we fit into size of scratch - assert(__scratch_idx + 1 < __base_diag_count + 1); + _split_point_t __start; + if (__global_idx % __base_diag_part != 0) + { + // Check that we fit into size of scratch + assert(__scratch_idx + 1 < 
__base_diag_count + 1); - const _split_point_t __sp_left = __scratch_ptr[__scratch_idx]; - const _split_point_t __sp_right = __scratch_ptr[__scratch_idx + 1]; + const _split_point_t __sp_left = __scratch_ptr[__scratch_idx]; + const _split_point_t __sp_right = __scratch_ptr[__scratch_idx + 1]; - __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, __sp_left.second, - __sp_right.second, __i_elem, __comp); - } - else - { - __start = __scratch_ptr[__scratch_idx]; - } + __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, + __sp_left.second, __sp_right.second, __i_elem, __comp); + } + else + { + __start = __scratch_ptr[__scratch_idx]; + } - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); }); return __future(__event); } From c5923eb1d1c7adcd1c5de556b290504d2c45fbde Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 16:23:14 +0100 Subject: [PATCH 009/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0c341f31de6..8ae3a4b148b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -389,9 +389,9 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy const auto __n = __rng1.size() + __rng2.size(); if (__n < 4 * 1'048'576) { - if (__n <= std::numeric_limits::max()) + if (__n <= std::numeric_limits::max()) { - using _WiIndex = std::uint32_t; + 
using _WiIndex = std::uint16_t; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( @@ -400,7 +400,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy } else { - using _WiIndex = std::uint64_t; + using _WiIndex = std::uint32_t; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( From 400f695a77553d2090350e5e77bcc2f5e90fdf3b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 09:36:53 +0100 Subject: [PATCH 010/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8ae3a4b148b..114be82a5ce 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -389,24 +389,12 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy const auto __n = __rng1.size() + __rng2.size(); if (__n < 4 * 1'048'576) { - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint16_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = 
std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } + using _WiIndex = std::uint32_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); } else { From 8994a675653cebd27bbdecc6ee99607207f7728e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 15:07:00 +0100 Subject: [PATCH 011/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - introduce __starting_size_limit_for_large_submitter into __parallel_merge Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 114be82a5ce..693a49fde64 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -386,8 +386,10 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - const auto __n = __rng1.size() + __rng2.size(); - if (__n < 4 * 1'048'576) + constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB + + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < 
__starting_size_limit_for_large_submitter) { using _WiIndex = std::uint32_t; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< From d29f5c9eca376b54dcb42f429e9d5555f685170a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 16:31:17 +0100 Subject: [PATCH 012/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - renames Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 693a49fde64..0627624ea5c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -300,39 +300,39 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _split_point_t = std::pair<_IdType, _IdType>; - using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t>; - __result_and_scratch_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t>; + __base_diagonals_sp_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto __scratch_acc = __result_and_scratch.template __get_scratch_acc( + auto __base_diagonals_sp_global_acc = __result_and_scratch.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_DiagonalsKernelName...>( sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); - auto __scratch_ptr = - 
__result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); + auto __base_diagonals_sp_global_ptr = + __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); if (__global_idx == 0) { - __scratch_ptr[0] = std::make_pair((_IdType)0, (_IdType)0); + __base_diagonals_sp_global_ptr[0] = std::make_pair((_IdType)0, (_IdType)0); } else if (__global_idx == __base_diag_count) { - __scratch_ptr[__base_diag_count] = std::make_pair(__n1, __n2); + __base_diagonals_sp_global_ptr[__base_diag_count] = std::make_pair(__n1, __n2); } else { const _IdType __i_elem = __global_idx * __base_diag_part * __chunk; - __scratch_ptr[__global_idx] = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __base_diagonals_sp_global_ptr[__global_idx] = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } }); }); __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - auto __scratch_acc = __result_and_scratch.template __get_scratch_acc(__cgh); + auto __base_diagonals_sp_global_acc = __result_and_scratch.template __get_scratch_acc(__cgh); __cgh.depends_on(__event); @@ -341,25 +341,25 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __chunk; - auto __scratch_ptr = - __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); - auto __scratch_idx = __global_idx / __base_diag_part; + auto __base_diagonals_sp_global_ptr = + __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __base_diag_part; _split_point_t __start; if (__global_idx % __base_diag_part != 0) { // Check that we fit into size of scratch - assert(__scratch_idx + 1 < __base_diag_count + 1); + assert(__diagonal_idx + 1 < __base_diag_count + 1); - const _split_point_t 
__sp_left = __scratch_ptr[__scratch_idx]; - const _split_point_t __sp_right = __scratch_ptr[__scratch_idx + 1]; + const _split_point_t __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, __sp_left.second, __sp_right.second, __i_elem, __comp); } else { - __start = __scratch_ptr[__scratch_idx]; + __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; } __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, From 6f3e3e1ed6b92d95e0276c2c23709dab0d3c59c7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 16:34:13 +0100 Subject: [PATCH 013/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - introduce _split_point_t type Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0627624ea5c..696719b87ec 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -31,6 +31,8 @@ namespace dpl { namespace __par_backend_hetero { +template +using _split_point_t = std::pair<_Index, _Index>; //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: @@ -45,7 +47,7 @@ namespace __par_backend_hetero // | ----> // 3 | 0 0 0 0 0 | template -std::pair<_Index, _Index> +_split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { @@ -158,7 +160,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn return __zero_or_one < kValue; }); - const std::pair<_Index, _Index> __result = std::make_pair(*__res, __index_sum - *__res + 1); + const _split_point_t<_Index> __result{ *__res, __index_sum - *__res + 1 }; assert(__result.first + __result.second == __i_elem); assert(__rng1_from <= __result.first && __result.first <= __rng1_to); @@ -175,7 +177,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn } template -std::pair<_Index, _Index> +_split_point_t<_Index> __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, const _Index __n2, _Compare __comp) { @@ -298,9 +300,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __base_diag_count = 1'024 * 32; const _IdType __base_diag_part = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - using _split_point_t = std::pair<_IdType, _IdType>; - - using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t>; + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; __base_diagonals_sp_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { @@ -345,14 +345,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); auto 
__diagonal_idx = __global_idx / __base_diag_part; - _split_point_t __start; + _split_point_t<_IdType> __start; if (__global_idx % __base_diag_part != 0) { // Check that we fit into size of scratch assert(__diagonal_idx + 1 < __base_diag_count + 1); - const _split_point_t __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const _split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; + const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, __sp_left.second, __sp_right.second, __i_elem, __comp); From 908b61e2bc1e9d80974c84f40063a1cd83566f8d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 16:38:23 +0100 Subject: [PATCH 014/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove usages of std::make_pair Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 696719b87ec..9e9591493fc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -19,7 +19,7 @@ #include // std::numeric_limits #include // assert #include // std::uint8_t, ... 
-#include // std::make_pair, std::forward +#include // std::forward #include // std::min, std::lower_bound #include "sycl_defs.h" @@ -172,7 +172,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn { assert(__rng1_from == 0); assert(__rng2_from == 0); - return std::make_pair(__rng1_from, __rng2_from); + return { __rng1_from, __rng2_from }; } } @@ -316,11 +316,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (__global_idx == 0) { - __base_diagonals_sp_global_ptr[0] = std::make_pair((_IdType)0, (_IdType)0); + __base_diagonals_sp_global_ptr[0] = _split_point_t<_IdType>{ 0, 0 }; } else if (__global_idx == __base_diag_count) { - __base_diagonals_sp_global_ptr[__base_diag_count] = std::make_pair(__n1, __n2); + __base_diagonals_sp_global_ptr[__base_diag_count] = _split_point_t<_IdType>{ __n1, __n2 }; } else { From 262d65b108c5329d69de8662aa4ae3d1e147853d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:09:02 +0100 Subject: [PATCH 015/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - optimize evaluation of split-points on base diagonals Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9e9591493fc..c9c52a5c163 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -314,19 +314,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - if (__global_idx == 0) - { - __base_diagonals_sp_global_ptr[0] = _split_point_t<_IdType>{ 0, 0 }; - } - else if (__global_idx == 
__base_diag_count) - { - __base_diagonals_sp_global_ptr[__base_diag_count] = _split_point_t<_IdType>{ __n1, __n2 }; - } - else + _split_point_t<_IdType> __sp = __global_idx == 0 ? _split_point_t<_IdType>{ 0, 0 } : _split_point_t<_IdType>{ __n1, __n2 }; + + if (0 < __global_idx && __global_idx < __base_diag_count) { const _IdType __i_elem = __global_idx * __base_diag_part * __chunk; - __base_diagonals_sp_global_ptr[__global_idx] = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } + + __base_diagonals_sp_global_ptr[__global_idx] = __sp; }); }); From 02671e35185a3e5942666c1d576a7eccad808414 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:04:25 +0100 Subject: [PATCH 016/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - renames Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index c9c52a5c163..70590c5b69e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -301,11 +301,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __base_diag_part = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; - __base_diagonals_sp_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; + __base_diagonals_sp_storage_t __base_diagonals_sp_global_storage{__exec, 0, __base_diag_count + 1}; sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto 
__base_diagonals_sp_global_acc = __result_and_scratch.template __get_scratch_acc( + auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_DiagonalsKernelName...>( @@ -328,7 +328,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - auto __base_diagonals_sp_global_acc = __result_and_scratch.template __get_scratch_acc(__cgh); + auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); __cgh.depends_on(__event); From 1825df2dad7e3127aa5bc61646cf49971982c6e9 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:10:41 +0100 Subject: [PATCH 017/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - extract eval_split_points_for_groups function Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 90 ++++++++++++++----- 1 file changed, 66 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 70590c5b69e..7ba814a7355 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -281,17 +281,25 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; +protected: - assert(__n1 > 0 || __n2 > 0); - - 
_PRINT_INFO_IN_DEBUG_MODE(__exec); + struct nd_range_params + { + std::size_t base_diag_count = 0; + std::size_t base_diag_part = 0; + std::uint8_t chunk = 0; + _IdType steps = 0; + }; + + template + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; + + // Calculate nd-range params + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const + { + const std::size_t __n = __rng1.size() + __rng2.size(); // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; @@ -300,8 +308,18 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __base_diag_count = 1'024 * 32; const _IdType __base_diag_part = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; - __base_diagonals_sp_storage_t __base_diagonals_sp_global_storage{__exec, 0, __base_diag_count + 1}; + return { __base_diag_count, __base_diag_part, __chunk, __steps }; + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); @@ -309,16 +327,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_DiagonalsKernelName...>( - sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) { + sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item 
__item_id) { auto __global_idx = __item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = - __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); _split_point_t<_IdType> __sp = __global_idx == 0 ? _split_point_t<_IdType>{ 0, 0 } : _split_point_t<_IdType>{ __n1, __n2 }; - if (0 < __global_idx && __global_idx < __base_diag_count) + if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) { - const _IdType __i_elem = __global_idx * __base_diag_part * __chunk; + const _IdType __i_elem = __global_idx * __nd_range_params.base_diag_part * __nd_range_params.chunk; __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } @@ -326,6 +343,31 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, }); }); + return __event; + } + +public: + + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + + __base_diagonals_sp_storage_t<_ExecutionPolicy> __base_diagonals_sp_global_storage{__exec, 0, __nd_range_params.base_diag_count + 1}; + + // Calculation of split points on each base diagonal + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage); + __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); @@ -333,19 +375,19 @@ 
struct __parallel_merge_submitter_large<_IdType, _CustomName, __cgh.depends_on(__event); __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __chunk; + const _IdType __i_elem = __global_idx * __nd_range_params.chunk; auto __base_diagonals_sp_global_ptr = - __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - auto __diagonal_idx = __global_idx / __base_diag_part; + __base_diagonals_sp_storage_t<_ExecutionPolicy>::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __nd_range_params.base_diag_part; _split_point_t<_IdType> __start; - if (__global_idx % __base_diag_part != 0) + if (__global_idx % __nd_range_params.base_diag_part != 0) { // Check that we fit into size of scratch - assert(__diagonal_idx + 1 < __base_diag_count + 1); + assert(__diagonal_idx + 1 < __nd_range_params.base_diag_count + 1); const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; @@ -358,7 +400,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; } - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __nd_range_params.chunk, __n1, __n2, __comp); }); }); From 6456fda3d83e6ea1993b585656a67d711adf0dc5 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:22:15 +0100 Subject: [PATCH 018/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - extract run_parallel_merge function Signed-off-by: Sergey Kopienko --- 
.../dpcpp/parallel_backend_sycl_merge.h | 59 +++++++++++-------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 7ba814a7355..96d3651e33e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -291,9 +291,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _IdType steps = 0; }; - template - using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; - // Calculate nd-range params template nd_range_params @@ -346,27 +343,17 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __event; } -public: - - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + // Process parallel merge + template + sycl::event + run_parallel_merge(sycl::event __event, + _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, + const nd_range_params& __nd_range_params, + const _Storage& __base_diagonals_sp_global_storage) const { const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Calculate nd-range params - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - - __base_diagonals_sp_storage_t<_ExecutionPolicy> __base_diagonals_sp_global_storage{__exec, 0, __nd_range_params.base_diag_count + 1}; - - // Calculation of split points on each base diagonal - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage); __event = __exec.queue().submit([&](sycl::handler& __cgh) { 
oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); @@ -379,8 +366,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __nd_range_params.chunk; - auto __base_diagonals_sp_global_ptr = - __base_diagonals_sp_storage_t<_ExecutionPolicy>::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); auto __diagonal_idx = __global_idx / __nd_range_params.base_diag_part; _split_point_t<_IdType> __start; @@ -404,6 +390,33 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __comp); }); }); + + return __event; + } + +public: + + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + assert(__rng1.size() > 0 || __rng2.size() > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; + __base_diagonals_sp_storage_t __base_diagonals_sp_global_storage{__exec, 0, __nd_range_params.base_diag_count + 1}; + + // Calculation of split points on each base diagonal + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage); + + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, __base_diagonals_sp_global_storage); + return __future(__event); } }; From 1b0ecd9c46f576e87d9a3f1cf75351b39ac2f7d5 Mon Sep 17 00:00:00 2001 From: Sergey 
Kopienko Date: Thu, 28 Nov 2024 17:25:01 +0100 Subject: [PATCH 019/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using SLM bank size to define chunk in the eval_nd_range_params function Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 96d3651e33e..dcec4586c93 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -296,10 +296,19 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, nd_range_params eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const { + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), _Range1ValueType, _Range2ValueType>; + const std::size_t __n = __rng1.size() + __rng2.size(); + constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? + + // Calculate how many data items we can read into one SLM bank + constexpr std::size_t __data_items_in_slm_bank = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); + // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : __data_items_in_slm_bank; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); const _IdType __base_diag_count = 1'024 * 32; From 3a8891f5011b340d00fb5dcaa0b828a5d47cfb82 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:43:34 +0100 Subject: [PATCH 020/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using SLM bank size to define chunk in the eval_nd_range_params function (16) Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index dcec4586c93..87a80239199 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -302,7 +302,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __n = __rng1.size() + __rng2.size(); - constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? + constexpr std::size_t __slm_bank_size = 16; // TODO is it correct value? How to get it from hardware? 
// Calculate how many data items we can read into one SLM bank constexpr std::size_t __data_items_in_slm_bank = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); From de7ab0ba2ecd3753acdffec0314bd56eceee5bb7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 18:02:33 +0100 Subject: [PATCH 021/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - restore old implementation of __find_start_point Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 54 ++++++++++++++++--- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 87a80239199..26eede8fc70 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -34,6 +34,52 @@ namespace __par_backend_hetero template using _split_point_t = std::pair<_Index, _Index>; +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +auto +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, + const _Index __n2, _Compare __comp) +{ + //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] + oneapi::dpl::counting_iterator<_Index> __diag_it(0); + + if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed + { + const _Index __q = __i_elem; //diagonal index + const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(*__res, __q - *__res); + } + else + { + const _Index __q = __i_elem - __n2; //diagonal index + const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(__q + *__res, __n2 - *__res); + } +} + //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: // 0 1 1 2 3 @@ -176,14 +222,6 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn } } -template -_split_point_t<_Index> -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) -{ - return __find_start_point_in(__rng1, (_Index)0, __n1, __rng2, (_Index)0, __n2, __i_elem, __comp); -} - // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing // to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) template From e9c39fe245f3b50226b237d32ff9ec61d311b7ae Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 3 Dec 2024 17:22:31 +0100 Subject: [PATCH 022/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename: base_diag_part -> steps_between_two_base_diags Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 26eede8fc70..230d7ff5ace 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -324,7 +324,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, struct nd_range_params { std::size_t base_diag_count = 0; - std::size_t base_diag_part = 0; + std::size_t steps_between_two_base_diags = 0; std::uint8_t chunk = 0; _IdType steps = 0; }; @@ -350,9 +350,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); const _IdType __base_diag_count = 1'024 * 32; - const _IdType __base_diag_part = 
oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + const _IdType __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - return { __base_diag_count, __base_diag_part, __chunk, __steps }; + return { __base_diag_count, __steps_between_two_base_diags, __chunk, __steps }; } // Calculation of split points on each base diagonal @@ -379,7 +379,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) { - const _IdType __i_elem = __global_idx * __nd_range_params.base_diag_part * __nd_range_params.chunk; + const _IdType __i_elem = __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } @@ -414,10 +414,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __i_elem = __global_idx * __nd_range_params.chunk; auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - auto __diagonal_idx = __global_idx / __nd_range_params.base_diag_part; + auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; _split_point_t<_IdType> __start; - if (__global_idx % __nd_range_params.base_diag_part != 0) + if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) { // Check that we fit into size of scratch assert(__diagonal_idx + 1 < __nd_range_params.base_diag_count + 1); From 6b4d2cb881b0b230d94822ac1040c00fa55a3ccc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 08:15:36 +0100 Subject: [PATCH 023/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 230d7ff5ace..b011f6c44ad 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -19,7 +19,7 @@ #include // std::numeric_limits #include // assert #include // std::uint8_t, ... -#include // std::forward +#include // std::make_pair, std::forward #include // std::min, std::lower_bound #include "sycl_defs.h" From b29c080d356efbf1d36e69bacdf1d8a01dad154f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 10:59:06 +0100 Subject: [PATCH 024/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix an error in __parallel_merge_submitter_large::eval_split_points_for_groups Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index b011f6c44ad..50248c2cf16 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -364,6 +364,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, { const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); @@ -380,7 +381,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) { const _IdType __i_elem = __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; - __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, 
__comp); + if (__i_elem < __n) + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } __base_diagonals_sp_global_ptr[__global_idx] = __sp; From 6f54078b7c9a6f899e07736a095d3710ea13c4d8 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 11:08:03 +0100 Subject: [PATCH 025/144] Fix an error: the life time of storage with split points on base diagonals is too short Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 13 ++++++++----- .../pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 8 +++++++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 50248c2cf16..d1cd047cdf2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -307,7 +307,9 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M __comp); }); }); - return __future(__event); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); } }; @@ -458,15 +460,16 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; - __base_diagonals_sp_storage_t __base_diagonals_sp_global_storage{__exec, 0, __nd_range_params.base_diag_count + 1}; + auto __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params.base_diag_count + 1); + __result_and_scratch_storage_base_ptr 
__p_result_and_scratch_storage_base(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); // Calculation of split points on each base diagonal - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage); + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, __base_diagonals_sp_global_storage); + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); - return __future(__event); + return __future(__event, std::move(__p_result_and_scratch_storage_base)); } }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index f4eb557170e..e66e8c28089 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -518,8 +518,14 @@ struct __usm_or_buffer_accessor } }; +struct __result_and_scratch_storage_base +{ + virtual ~__result_and_scratch_storage_base() = default; +}; +using __result_and_scratch_storage_base_ptr = std::shared_ptr<__result_and_scratch_storage_base>; + template -struct __result_and_scratch_storage +struct __result_and_scratch_storage : __result_and_scratch_storage_base { private: using __sycl_buffer_t = sycl::buffer<_T, 1>; From 4292c6c89478e71532395a576c546fdc8d54c278 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 17:20:43 +0100 Subject: [PATCH 026/144] Combine two submitters `__parallel_merge_submitter` and `__parallel_merge_submitter_large` into one `__parallel_merge_submitter` (#1956) --- 
.../dpcpp/parallel_backend_sycl_merge.h | 174 +++++++++--------- 1 file changed, 87 insertions(+), 87 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index d1cd047cdf2..d0379b07c99 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -273,53 +273,14 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } -// Please see the comment for __parallel_for_submitter for optional kernel name explanation -template +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - - auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); - }); - // We should return the same thing in the second param of __future for compatibility - // with the returning value in __parallel_merge_submitter_large::operator() - return __future(__event, __result_and_scratch_storage_base_ptr{}); - } -}; - -template -struct __parallel_merge_submitter_large; - -template -struct __parallel_merge_submitter_large<_IdType, _CustomName, +template +struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName...>> + __internal::__optional_kernel_name<_MergeKernelName1...>, + __internal::__optional_kernel_name<_MergeKernelName2...>> { protected: @@ -351,8 +312,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - const _IdType __base_diag_count = 1'024 * 32; - const _IdType __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + const _IdType __base_diag_count = __use_base_diags ? 32 * 1'024 : 0; + const _IdType __steps_between_two_base_diags = __use_base_diags ? 
oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count) : 0; return { __base_diag_count, __steps_between_two_base_diags, __chunk, __steps }; } @@ -394,6 +355,33 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __event; } + // Process parallel merge + template + sycl::event + run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, + const nd_range_params& __nd_range_params) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + const auto __chunk = __nd_range_params.chunk; + + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + + __cgh.parallel_for<_MergeKernelName1...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __chunk; + + _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); + }); + }); + + return __event; + } + // Process parallel merge template @@ -412,7 +400,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __cgh.depends_on(__event); - __cgh.parallel_for<_MergeKernelName...>( + __cgh.parallel_for<_MergeKernelName2...>( sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __nd_range_params.chunk; @@ -437,8 +425,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; } - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __nd_range_params.chunk, __n1, __n2, - __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, 
__nd_range_params.chunk, __n1, __n2, __comp); }); }); @@ -447,6 +434,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, public: + __parallel_merge_submitter(bool __use_base_diags) + : __use_base_diags(__use_base_diags) + { + } + template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -458,29 +450,43 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Calculate nd-range params const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; - auto __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params.base_diag_count + 1); - __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; // Calculation of split points on each base diagonal - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); + sycl::event __event; + if (__use_base_diags) + { + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>(__exec, 0, __nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, 
__nd_range_params, *__p_base_diagonals_sp_global_storage); + __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); - return __future(__event, std::move(__p_result_and_scratch_storage_base)); + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); + } + else + { + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__exec, __rng1, __rng2, __rng3, __comp, __nd_range_params); + } + + return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } + +private: + + const bool __use_base_diags = false; }; template -class __merge_kernel_name; +class __merge_kernel_name1; template -class __diagonals_kernel_name; +class __merge_kernel_name2; template -class __merge_kernel_name_large; +class __diagonals_kernel_name; template auto @@ -489,42 +495,36 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + const std::size_t __n = __rng1.size() + __rng2.size(); + constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB + const bool __use_base_diags = __n >= __starting_size_limit_for_large_submitter; - const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __starting_size_limit_for_large_submitter) + if (__n <= std::numeric_limits::max()) { using _WiIndex = std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName1 
= oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name1<_CustomName, _WiIndex>>; + using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name2<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, _MergeKernelName2>(__use_base_diags)( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint32_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint64_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using 
_MergeKernelName1 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name1<_CustomName, _WiIndex>>; + using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name2<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, _MergeKernelName2>(__use_base_diags)( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); } } From 6ad8170b7fd4e48e9af20fabf9317937d7a84099 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 18:32:58 +0100 Subject: [PATCH 027/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: remove extra condition check from __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 98 +++++++++---------- 1 file changed, 44 insertions(+), 54 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index d0379b07c99..fb19b08a609 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -107,7 +107,9 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn if constexpr (!std::is_pointer_v<_Rng2>) assert(__rng2_to <= __rng2.size()); - assert(__i_elem >= 0); + // We shouldn't call this function with __i_elem == 0 because we a priory know that + // split point for this case is {0, 0} + assert(__i_elem > 0); // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: @@ -150,76 +152,64 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 
//////////////////////////////////////////////////////////////////////////////////// - // Process the corner case: for the first diagonal with the index 0 split point - // is equal to (0, 0) regardless of the size and content of the data. - if (__i_elem > 0) - { - //////////////////////////////////////////////////////////////////////////////////// - // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; - using _IndexSigned = std::make_signed_t<_Index>; + using _IndexSigned = std::make_signed_t<_Index>; - _IndexSigned idx1_from = __rng1_from; - _IndexSigned idx1_to = __rng1_to; - assert(idx1_from <= idx1_to); + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + assert(idx1_from <= idx1_to); - _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); - _IndexSigned idx2_to = __index_sum - __rng1_from + 1; - assert(idx2_from <= idx2_to); + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + assert(idx2_from <= idx2_to); - const _IndexSigned idx2_from_diff = - idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; - const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? 
idx2_to - (_IndexSigned)__rng2_to : 0; - idx1_to -= idx2_from_diff; - idx1_from += idx2_to_diff; + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; - idx2_from = __index_sum - (idx1_to - 1); - idx2_to = __index_sum - idx1_from + 1; + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; - assert(idx1_from <= idx1_to); - assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); + assert(idx1_from <= idx1_to); + assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); - assert(idx2_from <= idx2_to); - assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); + assert(idx2_from <= idx2_to); + assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); - //////////////////////////////////////////////////////////////////////////////////// - // Run search of split point on diagonal + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal - using __it_t = oneapi::dpl::counting_iterator<_Index>; + using __it_t = oneapi::dpl::counting_iterator<_Index>; - __it_t __diag_it_begin(idx1_from); - __it_t __diag_it_end(idx1_to); + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); - constexpr int kValue = 1; - const __it_t __res = - std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { - const auto __rng1_idx = __idx; - const auto __rng2_idx = __index_sum - __idx; + constexpr int kValue = 1; + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + const auto __rng1_idx = __idx; + const auto __rng2_idx = __index_sum - __idx; - assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); - assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); - assert(__rng1_idx + __rng2_idx == __index_sum); + assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); + assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); + assert(__rng1_idx + 
__rng2_idx == __index_sum); - const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); - return __zero_or_one < kValue; - }); + const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); + return __zero_or_one < kValue; + }); - const _split_point_t<_Index> __result{ *__res, __index_sum - *__res + 1 }; - assert(__result.first + __result.second == __i_elem); + const _split_point_t<_Index> __result{ *__res, __index_sum - *__res + 1 }; + assert(__result.first + __result.second == __i_elem); - assert(__rng1_from <= __result.first && __result.first <= __rng1_to); - assert(__rng2_from <= __result.second && __result.second <= __rng2_to); + assert(__rng1_from <= __result.first && __result.first <= __rng1_to); + assert(__rng2_from <= __result.second && __result.second <= __rng2_to); - return __result; - } - else - { - assert(__rng1_from == 0); - assert(__rng2_from == 0); - return { __rng1_from, __rng2_from }; - } + return __result; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing From 6dd39e7f43907d460340d5175d06b0af52116661 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 18:39:19 +0100 Subject: [PATCH 028/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: fix condition check in __find_start_point_in Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index fb19b08a609..296db3927f4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,15 +97,8 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& 
__rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - assert(__rng1_from <= __rng1_to); - assert(__rng2_from <= __rng2_to); - - assert(__rng1_to > 0 || __rng2_to > 0); - - if constexpr (!std::is_pointer_v<_Rng1>) - assert(__rng1_to <= __rng1.size()); - if constexpr (!std::is_pointer_v<_Rng2>) - assert(__rng2_to <= __rng2.size()); + assert(0 <= __rng1_from && __rng1_from < __rng1_to && __rng1_to < __rng1.size()); + assert(0 <= __rng2_from && __rng2_from < __rng2_to && __rng2_to < __rng2.size()); // We shouldn't call this function with __i_elem == 0 because we a priory know that // split point for this case is {0, 0} From 1b7de915049ab236888b1f8f38e08ef6426b272e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 18:43:56 +0100 Subject: [PATCH 029/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 98 +++++++++++-------- 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 296db3927f4..5ce69d3f342 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -196,7 +196,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn return __zero_or_one < kValue; }); - const _split_point_t<_Index> __result{ *__res, __index_sum - *__res + 1 }; + const _split_point_t<_Index> __result{*__res, __index_sum - *__res + 1}; assert(__result.first + __result.second == __i_elem); assert(__rng1_from <= __result.first && __result.first <= __rng1_to); @@ -256,23 +256,23 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } -template +template struct __parallel_merge_submitter; -template -struct 
__parallel_merge_submitter<_IdType, _CustomName, - __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName1...>, - __internal::__optional_kernel_name<_MergeKernelName2...>> +template +struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName1...>, + __internal::__optional_kernel_name<_MergeKernelName2...>> { -protected: - + protected: struct nd_range_params { - std::size_t base_diag_count = 0; - std::size_t steps_between_two_base_diags = 0; + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; std::uint8_t chunk = 0; - _IdType steps = 0; + _IdType steps = 0; }; // Calculate nd-range params @@ -282,23 +282,26 @@ struct __parallel_merge_submitter<_IdType, _CustomName, { using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; - using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), _Range1ValueType, _Range2ValueType>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), + _Range1ValueType, _Range2ValueType>; const std::size_t __n = __rng1.size() + __rng2.size(); - constexpr std::size_t __slm_bank_size = 16; // TODO is it correct value? How to get it from hardware? + constexpr std::size_t __slm_bank_size = 16; // TODO is it correct value? How to get it from hardware? 
// Calculate how many data items we can read into one SLM bank - constexpr std::size_t __data_items_in_slm_bank = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); + constexpr std::size_t __data_items_in_slm_bank = + oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); const _IdType __base_diag_count = __use_base_diags ? 32 * 1'024 : 0; - const _IdType __steps_between_two_base_diags = __use_base_diags ? oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count) : 0; + const _IdType __steps_between_two_base_diags = + __use_base_diags ? oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count) : 0; - return { __base_diag_count, __steps_between_two_base_diags, __chunk, __steps }; + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; } // Calculation of split points on each base diagonal @@ -314,19 +317,23 @@ struct __parallel_merge_submitter<_IdType, _CustomName, sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( - __cgh, __dpl_sycl::__no_init{}); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_DiagonalsKernelName...>( sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __base_diagonals_sp_global_ptr = + 
_Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - _split_point_t<_IdType> __sp = __global_idx == 0 ? _split_point_t<_IdType>{ 0, 0 } : _split_point_t<_IdType>{ __n1, __n2 }; + _split_point_t<_IdType> __sp = + __global_idx == 0 ? _split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) { - const _IdType __i_elem = __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + const _IdType __i_elem = + __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; if (__i_elem < __n) __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } @@ -358,7 +365,8 @@ struct __parallel_merge_submitter<_IdType, _CustomName, const _IdType __i_elem = __global_idx * __chunk; _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); }); }); @@ -369,9 +377,8 @@ struct __parallel_merge_submitter<_IdType, _CustomName, template sycl::event - run_parallel_merge(sycl::event __event, - _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, - const nd_range_params& __nd_range_params, + run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, const _Storage& __base_diagonals_sp_global_storage) const { const _IdType __n1 = __rng1.size(); @@ -379,7 +386,8 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - auto __base_diagonals_sp_global_acc 
= __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); __cgh.depends_on(__event); @@ -388,7 +396,8 @@ struct __parallel_merge_submitter<_IdType, _CustomName, auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __nd_range_params.chunk; - auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; _split_point_t<_IdType> __start; @@ -408,19 +417,16 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; } - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __nd_range_params.chunk, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + __nd_range_params.chunk, __n1, __n2, __comp); }); }); return __event; } -public: - - __parallel_merge_submitter(bool __use_base_diags) - : __use_base_diags(__use_base_diags) - { - } + public: + __parallel_merge_submitter(bool __use_base_diags) : __use_base_diags(__use_base_diags) {} template auto @@ -440,13 +446,18 @@ struct __parallel_merge_submitter<_IdType, _CustomName, if (__use_base_diags) { // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>(__exec, 0, __nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + auto __p_base_diagonals_sp_global_storage = + new 
__result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, __nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); + __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); } else { @@ -457,8 +468,7 @@ struct __parallel_merge_submitter<_IdType, _CustomName, return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } -private: - + private: const bool __use_base_diags = false; }; @@ -492,7 +502,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy __merge_kernel_name1<_CustomName, _WiIndex>>; using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name2<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, _MergeKernelName2>(__use_base_diags)( + return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, + _MergeKernelName2>(__use_base_diags)( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } @@ -505,7 +516,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy __merge_kernel_name1<_CustomName, _WiIndex>>; using _MergeKernelName2 = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name2<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, _MergeKernelName2>(__use_base_diags)( + return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, + _MergeKernelName2>(__use_base_diags)( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } From 8e50bbfa10da9810efa06255dc154e0c4f4e6fb2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 19:44:38 +0100 Subject: [PATCH 030/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix compile error in sort.pass.cpp Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 5ce69d3f342..a583863e65a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,8 +97,16 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - assert(0 <= __rng1_from && __rng1_from < __rng1_to && __rng1_to < __rng1.size()); - assert(0 <= __rng2_from && __rng2_from < __rng2_to && __rng2_to < __rng2.size()); + if constexpr (!std::is_pointer_v<_Rng1> && !std::is_pointer_v<_Rng2>) + { + assert(0 <= __rng1_from && __rng1_from < __rng1_to && __rng1_to < __rng1.size()); + assert(0 <= __rng2_from && __rng2_from < __rng2_to && __rng2_to < __rng2.size()); + } + else + { + assert(0 <= __rng1_from && __rng1_from < 
__rng1_to); + assert(0 <= __rng2_from && __rng2_from < __rng2_to); + } // We shouldn't call this function with __i_elem == 0 because we a priory know that // split point for this case is {0, 0} From f0ea19df47f4b2ae1402fad9d7a0d38f47560f3e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 6 Dec 2024 18:24:46 +0100 Subject: [PATCH 031/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - processing additional corner cases in __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a583863e65a..491bc467ee5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,16 +97,16 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - if constexpr (!std::is_pointer_v<_Rng1> && !std::is_pointer_v<_Rng2>) - { - assert(0 <= __rng1_from && __rng1_from < __rng1_to && __rng1_to < __rng1.size()); - assert(0 <= __rng2_from && __rng2_from < __rng2_to && __rng2_to < __rng2.size()); - } - else - { - assert(0 <= __rng1_from && __rng1_from < __rng1_to); - assert(0 <= __rng2_from && __rng2_from < __rng2_to); - } + assert(__rng1_from + __rng2_from <= __i_elem && __i_elem <= __rng1_to + __rng2_to); + + if (__i_elem == 0) + return _split_point_t<_Index>{ 0, 0 }; + + if (__rng1_from == __rng1_to) + return _split_point_t<_Index>{ __rng1_from, __rng2_from + __i_elem }; + + if (__rng2_from == __rng2_to) + return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_to }; // We shouldn't call this function with __i_elem == 0 because we a 
priory know that // split point for this case is {0, 0} From f327800a342818dcf5ac5e287b836f775b064ed0 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 6 Dec 2024 18:25:18 +0100 Subject: [PATCH 032/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix an error in run_parallel_merge Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 491bc467ee5..9d352625411 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -372,9 +372,12 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __chunk; - _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); + if (__i_elem < __n1 + __n2) + { + _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + } }); }); From 53def33cae31bbe7e372405bb1b7f365708c83e3 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 6 Dec 2024 19:33:43 +0100 Subject: [PATCH 033/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: remove assert calls from Kernel code Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9d352625411..dc9da9cc0b3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,8 +97,6 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - assert(__rng1_from + __rng2_from <= __i_elem && __i_elem <= __rng1_to + __rng2_to); - if (__i_elem == 0) return _split_point_t<_Index>{ 0, 0 }; @@ -108,10 +106,6 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn if (__rng2_from == __rng2_to) return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_to }; - // We shouldn't call this function with __i_elem == 0 because we a priory know that - // split point for this case is {0, 0} - assert(__i_elem > 0); - // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: // rng1.size() = 10 @@ -160,11 +154,9 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn _IndexSigned idx1_from = __rng1_from; _IndexSigned idx1_to = __rng1_to; - assert(idx1_from <= idx1_to); _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); _IndexSigned idx2_to = __index_sum - __rng1_from + 1; - assert(idx2_from <= idx2_to); const _IndexSigned idx2_from_diff = idx2_from < (_IndexSigned)__rng2_from ? 
(_IndexSigned)__rng2_from - idx2_from : 0; @@ -176,12 +168,6 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn idx2_from = __index_sum - (idx1_to - 1); idx2_to = __index_sum - idx1_from + 1; - assert(idx1_from <= idx1_to); - assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); - - assert(idx2_from <= idx2_to); - assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); - //////////////////////////////////////////////////////////////////////////////////// // Run search of split point on diagonal @@ -196,20 +182,11 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn const auto __rng1_idx = __idx; const auto __rng2_idx = __index_sum - __idx; - assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); - assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); - assert(__rng1_idx + __rng2_idx == __index_sum); - const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); return __zero_or_one < kValue; }); const _split_point_t<_Index> __result{*__res, __index_sum - *__res + 1}; - assert(__result.first + __result.second == __i_elem); - - assert(__rng1_from <= __result.first && __result.first <= __rng1_to); - assert(__rng2_from <= __result.second && __result.second <= __rng2_to); - return __result; } @@ -414,9 +391,6 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k _split_point_t<_IdType> __start; if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) { - // Check that we fit into size of scratch - assert(__diagonal_idx + 1 < __nd_range_params.base_diag_count + 1); - const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; From d8d6e7419b19f084b734ebd6d7dd9417d60019df Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 6 Dec 2024 19:34:57 +0100 Subject: [PATCH 034/144] 
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove extra local variables in __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index dc9da9cc0b3..8cd262a4be2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -179,15 +179,11 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn constexpr int kValue = 1; const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { - const auto __rng1_idx = __idx; - const auto __rng2_idx = __index_sum - __idx; - - const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); + const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); return __zero_or_one < kValue; }); - const _split_point_t<_Index> __result{*__res, __index_sum - *__res + 1}; - return __result; + return {*__res, __index_sum - *__res + 1}; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing From f67503f555557357c3357334783adf91bdabeb82 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 8 Dec 2024 15:03:38 +0100 Subject: [PATCH 035/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - processing additional corner cases in __find_start_point_in Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8cd262a4be2..be1048d81b5 
100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -104,7 +104,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn return _split_point_t<_Index>{ __rng1_from, __rng2_from + __i_elem }; if (__rng2_from == __rng2_to) - return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_to }; + return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_from }; // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: From 3089f711d20e084006a35d5e4adbe9d8233d2fdc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 10 Dec 2024 09:28:22 +0100 Subject: [PATCH 036/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove extra condition checks from __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index be1048d81b5..ed4c7df1ded 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,15 +97,6 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - if (__i_elem == 0) - return _split_point_t<_Index>{ 0, 0 }; - - if (__rng1_from == __rng1_to) - return _split_point_t<_Index>{ __rng1_from, __rng2_from + __i_elem }; - - if (__rng2_from == __rng2_to) - return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_from }; - // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: // rng1.size() = 10 From 
263a09d16f150441a439a1c7cf3ef45862646f17 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 9 Dec 2024 19:57:23 +0100 Subject: [PATCH 037/144] new implementation of __merge_sort_global_submitter - V1 --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 3 + .../dpcpp/parallel_backend_sycl_merge.h | 34 +- .../dpcpp/parallel_backend_sycl_merge_sort.h | 457 ++++++++++++++++-- 3 files changed, 442 insertions(+), 52 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 68dd00188dd..ff8a5367703 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2047,6 +2047,9 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo } }; +template +class __sort_global_kernel; + template auto __parallel_partial_sort_impl(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range&& __rng, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8cd262a4be2..9d352625411 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,6 +97,8 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { + assert(__rng1_from + __rng2_from <= __i_elem && __i_elem <= __rng1_to + __rng2_to); + if (__i_elem == 0) return _split_point_t<_Index>{ 0, 0 }; @@ -106,6 +108,10 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn if (__rng2_from == __rng2_to) return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_to }; + // We shouldn't call this function with __i_elem == 0 
because we a priory know that + // split point for this case is {0, 0} + assert(__i_elem > 0); + // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: // rng1.size() = 10 @@ -154,9 +160,11 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn _IndexSigned idx1_from = __rng1_from; _IndexSigned idx1_to = __rng1_to; + assert(idx1_from <= idx1_to); _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + assert(idx2_from <= idx2_to); const _IndexSigned idx2_from_diff = idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; @@ -168,6 +176,12 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn idx2_from = __index_sum - (idx1_to - 1); idx2_to = __index_sum - idx1_from + 1; + assert(idx1_from <= idx1_to); + assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); + + assert(idx2_from <= idx2_to); + assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); + //////////////////////////////////////////////////////////////////////////////////// // Run search of split point on diagonal @@ -179,11 +193,24 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn constexpr int kValue = 1; const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { - const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + const auto __rng1_idx = __idx; + const auto __rng2_idx = __index_sum - __idx; + + assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); + assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); + assert(__rng1_idx + __rng2_idx == __index_sum); + + const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); return __zero_or_one < kValue; }); - return {*__res, __index_sum - *__res + 1}; + const _split_point_t<_Index> __result{*__res, __index_sum - *__res 
+ 1}; + assert(__result.first + __result.second == __i_elem); + + assert(__rng1_from <= __result.first && __result.first <= __rng1_to); + assert(__rng2_from <= __result.second && __result.second <= __rng2_to); + + return __result; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing @@ -387,6 +414,9 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k _split_point_t<_IdType> __start; if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) { + // Check that we fit into size of scratch + assert(__diagonal_idx + 1 < __nd_range_params.base_diag_count + 1); + const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 0765f8ef7bc..8cf077b2a8f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -22,6 +22,7 @@ #include // std::uint32_t, ... #include // std::min, std::max_element #include // std::decay_t, std::integral_constant +#include #include "sycl_defs.h" // __dpl_sycl::__local_accessor, __dpl_sycl::__group_barrier #include "sycl_traits.h" // SYCL traits specialization for some oneDPL types. @@ -178,7 +179,7 @@ struct __leaf_sorter // 3. Sort on work-group level bool __data_in_temp = __group_sorter.sort(__item, __storage_acc, __comp, static_cast(0), __adjusted_process_size, - /*sorted per sub-group*/ __data_per_workitem, __data_per_workitem, __workgroup_size); + /*sorted per sub-group*/ __data_per_workitem, __data_per_workitem, __workgroup_size); // barrier is not needed here because of the barrier inside the sort method // 4. 
Store @@ -227,69 +228,416 @@ struct __merge_sort_leaf_submitter<__internal::__optional_kernel_name<_LeafSortN } }; -template +template struct __merge_sort_global_submitter; -template -struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name<_GlobalSortName...>> +template +struct __merge_sort_global_submitter<_IndexT, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_GlobalSortName1...>, + __internal::__optional_kernel_name<_GlobalSortName2...>> { - template - std::pair - operator()(sycl::queue& __q, _Range& __rng, _Compare __comp, _LeafSizeT __leaf_size, _TempBuf& __temp_buf, - sycl::event __event_chain) const +protected: + + using _merge_split_point_t = _split_point_t<_IndexT>; + + static constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB + + struct nd_range_params { - const _IndexT __n = __rng.size(); - _IndexT __n_sorted = __leaf_size; - const bool __is_cpu = __q.get_device().is_cpu(); + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; + std::uint32_t chunk = 0; + std::size_t steps = 0; + }; + + struct WorkDataArea + { + // How WorkDataArea is implemented : + // + // i_elem_local + // | + // offset | i_elem + // | | | + // V V V + // +------+-------+------+-----+ + // | | | / | + // | | | / | + // | | | / | + // | | | / | + // | | | / | + // | | | / | + // offset -> +------+---n1--+ <----+---- whole data area : size == __n + // | | /| | + // | | <-/-+------------+---- working data area : sizeof(rng1) <= __n_sorted, sizeof(rng2) <= __n_sorted + // | | / | | + // | n2 / | | + // | | / | | + // | | / | | + // | |/ | | + // i_elem_local -> +------+-------+ | + // | / | + // | / | + // | / | + // | / | + // | / | + // i_elem -> +/ | + // | | + // | | + // | | + // | | + // | | + // +---------------------------+ + + _IndexT i_elem = 0; // Global diagonal index + _IndexT i_elem_local = 0; // Local diagonal index + _IndexT 
offset = 0; // Offset to the first element in the subrange (i.e. the first element of the first subrange for merge) + _IndexT n1 = 0; // Size of the first subrange + _IndexT n2 = 0; // Size of the second subrange + + WorkDataArea(const std::size_t __n, const std::size_t __n_sorted, + const std::size_t __linear_id, + const std::size_t __chunk) + { + // Calculate global diagonal index + i_elem = __linear_id * __chunk; + + // Calculate local diagonal index + i_elem_local = i_elem % (__n_sorted * 2); + + // Calculate offset to the first element in the subrange (i.e. the first element of the first subrange for merge) + offset = std::min<_IndexT>(i_elem - i_elem_local, __n); + + // Calculate size of the first and the second subranges + n1 = std::min<_IndexT>(offset + __n_sorted, __n) - offset; + n2 = std::min<_IndexT>(offset + __n_sorted + n1, __n) - (offset + n1); + } + + inline bool + is_i_elem_local_inside_merge_matrix() const + { + return i_elem_local < n1 + n2; + } + }; + + template + struct DropViews + { + using __drop_view_simple_t = oneapi::dpl::__ranges::drop_view_simple; + + __drop_view_simple_t rng1; + __drop_view_simple_t rng2; + + DropViews(Rng& __rng, const WorkDataArea& __data_area) + : rng1(__rng, __data_area.offset) + , rng2(__rng, __data_area.offset + __data_area.n1) + {} + }; + + // Calculate nd-range params + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size, _IndexT __n_sorted) const + { + const bool __is_cpu = __exec.queue().get_device().is_cpu(); const std::uint32_t __chunk = __is_cpu ? 
32 : 4; - const std::size_t __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - bool __data_in_temp = false; + const std::size_t __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); - const std::size_t __n_power2 = oneapi::dpl::__internal::__dpl_bit_ceil(__n); - // ctz precisely calculates log2 of an integral value which is a power of 2, while - // std::log2 may be prone to rounding errors on some architectures - const std::int64_t __n_iter = sycl::ctz(__n_power2) - sycl::ctz(__leaf_size); - for (std::int64_t __i = 0; __i < __n_iter; ++__i) + _IndexT __base_diag_count = 32 * 1'024; // 32 Kb + + while (__n_sorted <= __base_diag_count) + __n_sorted = __n_sorted * 2; + __base_diag_count = __n_sorted / 2; + + _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + return { __base_diag_count, __steps_between_two_base_diags, __chunk, __steps }; + } + + template + inline + static _merge_split_point_t __find_start_point_w(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) + { + return __find_start_point(__views.rng1, __views.rng2, __data_area.i_elem_local, __data_area.n1, __data_area.n2, __comp); + } + + template + inline + static void __serial_merge_w(const nd_range_params& __nd_range_params, + const WorkDataArea& __data_area, + const DropViews& __views, _Rng& __rng, + const _merge_split_point_t& __sp, + _Compare __comp) + { + __serial_merge(__views.rng1, __views.rng2, __rng /* rng3 */, + __sp.first /* start1 */, __sp.second /* start2 */, __data_area.i_elem /* start3 */, + __nd_range_params.chunk, + __data_area.n1, __data_area.n2, + __comp); + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(const sycl::event& __event_chain, + const _IndexT __n_sorted, const bool __data_in_temp, + _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, + const 
nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IndexT __n = __rng.size(); + + return __exec.queue().submit([&, __event_chain](sycl::handler& __cgh) { + + __cgh.depends_on(__event_chain); + + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); + + __cgh.parallel_for<_DiagonalsKernelName...>( + // +1 doesn't required here, because we need to calculate split points for each base diagonal + // and for the right base diagonal in the last work-group but we can keep it one position to the left + // because we know that for 0-diagonal the split point is { 0, 0 }. + sycl::range(__nd_range_params.base_diag_count /*+ 1*/), [=](sycl::item __item_id) { + + const std::size_t __linear_id = __item_id.get_linear_id(); + + auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + // We should add `1` to __linear_id here to avoid calculation of split-point for 0-diagonal + const WorkDataArea __data_area(__n, __n_sorted, __linear_id + 1, __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags); + + _merge_split_point_t __sp{ 0, 0}; + + if (__data_area.is_i_elem_local_inside_merge_matrix()) + { + if (__data_in_temp) + { + DropViews __views(__dst, __data_area); + __sp = __find_start_point_w(__data_area, __views, __comp); + } + else + { + DropViews __views(__rng, __data_area); + __sp = __find_start_point_w(__data_area, __views, __comp); + } + } + + __base_diagonals_sp_global_ptr[__linear_id] = __sp; + }); + }); + } + + template + static _merge_split_point_t + __find_or_eval_sp(const std::size_t __global_idx, + const nd_range_params& __nd_range_params, + const WorkDataArea& __data_area, + const DropViews& __views, + _Compare __comp, + 
_BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr) + { + _merge_split_point_t __result(0, 0); + + std::size_t __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; + + assert(__diagonal_idx < __nd_range_params.base_diag_count); + + const _merge_split_point_t __sp_left = __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{ 0, 0 }; + const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; + + if (__sp_right.first + __sp_right.second > 0) + { + if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) + { + __result = __find_start_point_in(__views.rng1, __sp_left.first, __sp_right.first, + __views.rng2, __sp_left.second, __sp_right.second, + __data_area.i_elem_local, __comp); + } + else + { + __result = __sp_left; + } + } + else { - __event_chain = __q.submit([&, __event_chain, __n_sorted, __data_in_temp](sycl::handler& __cgh) { - __cgh.depends_on(__event_chain); + __result = __find_start_point_w(__data_area, __views, __comp); + } + + return __result; + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(const sycl::event& __event_chain, + const _IndexT __n_sorted, const bool __data_in_temp, + _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, + const nd_range_params& __nd_range_params) const + { + const _IndexT __n = __rng.size(); - oneapi::dpl::__ranges::__require_access(__cgh, __rng); - sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); + return __exec.queue().submit([&, __event_chain](sycl::handler& __cgh) { - __cgh.parallel_for<_GlobalSortName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IndexT __i_elem = __item_id.get_linear_id() * __chunk; - const _IndexT __i_elem_local = __i_elem % (__n_sorted * 2); + __cgh.depends_on(__event_chain); + + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + sycl::accessor __dst(__temp_buf, __cgh, 
sycl::read_write, sycl::no_init); - const _IndexT __offset = std::min<_IndexT>(__i_elem - __i_elem_local, __n); - const _IndexT __n1 = std::min<_IndexT>(__offset + __n_sorted, __n) - __offset; - const _IndexT __n2 = std::min<_IndexT>(__offset + __n1 + __n_sorted, __n) - (__offset + __n1); + __cgh.parallel_for<_GlobalSortName1...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + const std::size_t __linear_id = __item_id.get_linear_id(); + + const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __nd_range_params.chunk); + if (__data_area.is_i_elem_local_inside_merge_matrix()) + { if (__data_in_temp) { - const oneapi::dpl::__ranges::drop_view_simple __rng1(__dst, __offset); - const oneapi::dpl::__ranges::drop_view_simple __rng2(__dst, __offset + __n1); + DropViews __views(__dst, __data_area); - const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng /*__rng3*/, start.first, start.second, __i_elem, - __chunk, __n1, __n2, __comp); + const auto __sp = __find_start_point_w(__data_area, __views, __comp); + __serial_merge_w(__nd_range_params, __data_area, __views, __rng, __sp, __comp); } else { - const oneapi::dpl::__ranges::drop_view_simple __rng1(__rng, __offset); - const oneapi::dpl::__ranges::drop_view_simple __rng2(__rng, __offset + __n1); + DropViews __views(__rng, __data_area); + + const auto __sp = __find_start_point_w(__data_area, __views, __comp); + __serial_merge_w(__nd_range_params, __data_area, __views, __dst, __sp, __comp); + } + } + }); + }); + } + + // Process parallel merge with usage of split-points on base diagonals + template + sycl::event + run_parallel_merge(const sycl::event& __event_chain, + const _IndexT __n_sorted, const bool __data_in_temp, + _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IndexT __n = 
__rng.size(); + + return __exec.queue().submit([&,__event_chain](sycl::handler& __cgh) { + + __cgh.depends_on(__event_chain); + + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); + + auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + __cgh.parallel_for<_GlobalSortName2...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + + const std::size_t __linear_id = __item_id.get_linear_id(); + + auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __dst /*__rng3*/, start.first, start.second, __i_elem, - __chunk, __n1, __n2, __comp); + const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __nd_range_params.chunk); + + if (__data_area.is_i_elem_local_inside_merge_matrix()) + { + if (__data_in_temp) + { + DropViews __views(__dst, __data_area); + + const auto __sp = __find_or_eval_sp(__linear_id /* __global_idx */, + __nd_range_params, + __data_area, __views, + __comp, + __base_diagonals_sp_global_ptr); + __serial_merge_w(__nd_range_params, __data_area, __views, __rng, __sp, __comp); } - }); - }); + else + { + DropViews __views(__rng, __data_area); + + const auto __sp = __find_or_eval_sp(__linear_id /* __global_idx */, + __nd_range_params, + __data_area, __views, + __comp, + __base_diagonals_sp_global_ptr); + __serial_merge_w(__nd_range_params, __data_area, __views, __dst, __sp, __comp); + } + } + }); + }); + } + +public: + + using __container_of_temp_storages_t = std::vector<__result_and_scratch_storage_base_ptr>; + + template + std::tuple + operator()(_ExecutionPolicy&& __exec, _Range& __rng, _Compare __comp, _LeafSizeT __leaf_size, _TempBuf& __temp_buf, + sycl::event __event_chain) const + { + const _IndexT __n = 
__rng.size(); + _IndexT __n_sorted = __leaf_size; + + bool __data_in_temp = false; + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n, __n_sorted); + + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _merge_split_point_t>; + + const std::size_t __n_power2 = oneapi::dpl::__internal::__dpl_bit_ceil(__n); + // ctz precisely calculates log2 of an integral value which is a power of 2, while + // std::log2 may be prone to rounding errors on some architectures + const std::int64_t __n_iter = sycl::ctz(__n_power2) - sycl::ctz(__leaf_size); + + // Create container for storages with split-points on base diagonal + // - each iteration should have their own container + __container_of_temp_storages_t __temp_sp_storages(std::max(__n_iter, (std::int64_t)0)); + + for (std::int64_t __i = 0; __i < __n_iter; ++__i) + { + if (2 * __n_sorted >= __starting_size_limit_for_large_submitter) + { + // Create storage for save split-points on each base diagonal + // - for current iteration + auto __p_base_diagonals_sp_storage = new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params.base_diag_count); + __temp_sp_storages[__i].reset(__p_base_diagonals_sp_storage); + + // Calculation of split-points on each base diagonal + __event_chain = eval_split_points_for_groups(__event_chain, + __n_sorted, __data_in_temp, + __exec, __rng, __temp_buf, __comp, + __nd_range_params, + *__p_base_diagonals_sp_storage); + + // Process parallel merge with usage of split-points on base diagonals + __event_chain = run_parallel_merge(__event_chain, + __n_sorted, __data_in_temp, + __exec, __rng, __temp_buf, __comp, + __nd_range_params, + *__p_base_diagonals_sp_storage); + } + else + { + // Process parallel merge + __event_chain = run_parallel_merge(__event_chain, + __n_sorted, __data_in_temp, + __exec, __rng, __temp_buf, __comp, + __nd_range_params); + } + __n_sorted *= 2; __data_in_temp = !__data_in_temp; } - return 
{__event_chain, __data_in_temp}; + + return {__event_chain, __data_in_temp, std::move(__temp_sp_storages)}; } }; @@ -303,7 +651,7 @@ struct __merge_sort_copy_back_submitter<__internal::__optional_kernel_name<_Copy sycl::event operator()(sycl::queue& __q, _Range& __rng, _TempBuf& __temp_buf, sycl::event __event_chain) const { - __event_chain = __q.submit([&, __event_chain](sycl::handler& __cgh) { + return __q.submit([&](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __temp_acc = __temp_buf.template get_access(__cgh); @@ -314,7 +662,6 @@ struct __merge_sort_copy_back_submitter<__internal::__optional_kernel_name<_Copy __rng[__idx] = __temp_acc[__idx]; }); }); - return __event_chain; } }; @@ -322,7 +669,13 @@ template class __sort_leaf_kernel; template -class __sort_global_kernel; +class __diagonals_kernel_name_for_merge_sort; + +template +class __sort_global_kernel1; + +template +class __sort_global_kernel2; template class __sort_copy_back_kernel; @@ -336,8 +689,12 @@ __merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSo using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _LeafSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_leaf_kernel<_CustomName>>; - using _GlobalSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __sort_global_kernel<_CustomName, _IndexT>>; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name_for_merge_sort<_CustomName, _IndexT>>; + using _GlobalSortKernel1 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __sort_global_kernel1<_CustomName, _IndexT>>; + using _GlobalSortKernel2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __sort_global_kernel2<_CustomName, _IndexT>>; using _CopyBackKernel = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_copy_back_kernel<_CustomName>>; @@ -353,15 +710,15 @@ __merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSo // 2. Merge sorting oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _Tp> __temp(__exec, __rng.size()); auto __temp_buf = __temp.get_buffer(); - auto [__event_sort, __data_in_temp] = __merge_sort_global_submitter<_IndexT, _GlobalSortKernel>()( - __q, __rng, __comp, __leaf_sorter.__process_size, __temp_buf, __event_leaf_sort); + auto [__event_sort, __data_in_temp, __temp_sp_storages] = __merge_sort_global_submitter<_IndexT, _DiagonalsKernelName, _GlobalSortKernel1, _GlobalSortKernel2>()( + __exec, __rng, __comp, __leaf_sorter.__process_size, __temp_buf, __event_leaf_sort); // 3. If the data remained in the temporary buffer then copy it back if (__data_in_temp) { __event_sort = __merge_sort_copy_back_submitter<_CopyBackKernel>()(__q, __rng, __temp_buf, __event_sort); } - return __future(__event_sort); + return __future(__event_sort, std::move(__temp_sp_storages)); } template From c033585f16f41a8466c62fb62e2565b168eed3d1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 10:02:16 +0100 Subject: [PATCH 038/144] Revert: Combine two submitters `__parallel_merge_submitter` and `__parallel_merge_submitter_large` into one `__parallel_merge_submitter` (#1956) Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 155 ++++++++++-------- 1 file changed, 91 insertions(+), 64 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ed4c7df1ded..08ba09098d5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -228,15 +228,53 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } 
} -template +// Please see the comment for __parallel_for_submitter for optional kernel name explanation +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName1...>, - __internal::__optional_kernel_name<_MergeKernelName2...>> +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); + }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); + } +}; + +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + 
__internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> { protected: struct nd_range_params @@ -269,9 +307,9 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - const _IdType __base_diag_count = __use_base_diags ? 32 * 1'024 : 0; + const _IdType __base_diag_count = 32 * 1'024; const _IdType __steps_between_two_base_diags = - __use_base_diags ? oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count) : 0; + oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; } @@ -331,7 +369,7 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName1...>( + __cgh.parallel_for<_MergeKernelName...>( sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __chunk; @@ -366,7 +404,7 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k __cgh.depends_on(__event); - __cgh.parallel_for<_MergeKernelName2...>( + __cgh.parallel_for<_MergeKernelName...>( sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __nd_range_params.chunk; @@ -398,8 +436,6 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k } public: - __parallel_merge_submitter(bool __use_base_diags) : __use_base_diags(__use_base_diags) {} - template auto operator()(_ExecutionPolicy&& __exec, 
_Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -413,42 +449,29 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; - // Calculation of split points on each base diagonal - sycl::event __event; - if (__use_base_diags) - { - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = - new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( - __exec, 0, __nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset( - static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - - __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - } - else - { - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__exec, __rng1, __rng2, __rng3, __comp, __nd_range_params); - } + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = + new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, __nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, 
__rng2, __rng3, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } - - private: - const bool __use_base_diags = false; }; template -class __merge_kernel_name1; +class __merge_kernel_name; template -class __merge_kernel_name2; +class __merge_kernel_name_large; template class __diagonals_kernel_name; @@ -460,38 +483,42 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - const std::size_t __n = __rng1.size() + __rng2.size(); - constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB - const bool __use_base_diags = __n >= __starting_size_limit_for_large_submitter; - if (__n <= std::numeric_limits::max()) + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < __starting_size_limit_for_large_submitter) { using _WiIndex = std::uint32_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName1 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name1<_CustomName, _WiIndex>>; - using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name2<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, - _MergeKernelName2>(__use_base_diags)( + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { - using _WiIndex = std::uint64_t; - 
using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName1 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name1<_CustomName, _WiIndex>>; - using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name2<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, - _MergeKernelName2>(__use_base_diags)( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } } From 
a06ac54f4a97195a477e8694ee9e834618d4119b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 10:05:15 +0100 Subject: [PATCH 039/144] Call __find_start_point_in instead of __find_start_point in the __parallel_merge_submitter_large::run_parallel_merge Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 08ba09098d5..56b60cb6bd2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -376,7 +376,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (__i_elem < __n1 + __n2) { - _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); } From c96cccfbe47c3196ce4e0d79651cbce24bb6b763 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 10:25:45 +0100 Subject: [PATCH 040/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: I would use std::pair<_Index> directly here. 
Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 56b60cb6bd2..8d0c4527d6e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -174,7 +174,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn return __zero_or_one < kValue; }); - return {*__res, __index_sum - *__res + 1}; + return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing From 2d8f480c0ea5681f9a5a36775ab0b722be3346ad Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 11:15:38 +0100 Subject: [PATCH 041/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix performance degradation for 8 Mb int type Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 1066 +++++++++-------- 1 file changed, 537 insertions(+), 529 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8d0c4527d6e..895253a151e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -1,529 +1,537 @@ -// -*- C++ -*- -//===-- parallel_backend_sycl_merge.h --------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// -//===----------------------------------------------------------------------===// - -#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H -#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H - -#include // std::numeric_limits -#include // assert -#include // std::uint8_t, ... -#include // std::make_pair, std::forward -#include // std::min, std::lower_bound - -#include "sycl_defs.h" -#include "parallel_backend_sycl_utils.h" - -namespace oneapi -{ -namespace dpl -{ -namespace __par_backend_hetero -{ -template -using _split_point_t = std::pair<_Index, _Index>; - -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -auto -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) -{ - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); - - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed - { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(*__res, __q - *__res); - } - else - { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(__q + *__res, __n2 - *__res); - } -} - -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -_split_point_t<_Index> -__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, - const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) -{ - // ----------------------- EXAMPLE ------------------------ - // Let's consider the following input data: - // rng1.size() = 10 - // rng2.size() = 6 - // i_diag = 9 - // Let's define the following ranges for processing: - // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 - // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 - // - // The goal: required to process only X' items of the merge matrix - // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) - // - // -------------------------------------------------------- - // - // __diag_it_begin(rng1) __diag_it_end(rng1) - // (init state) (dest state) (init state, dest state) - // | | | - // V V V - // + + + + + + - // \ rng1 0 1 2 3 4 5 6 7 8 9 - // rng2 +--------------------------------------+ - // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) - // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) - // + 2 | <----------------- + X'1 | | - // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) - // 4 | X ^ | | - // 5 | X | | | <--- __diag_it_begin(rng2) (init state) - // +-------AX-----------+-----------+-----+ - // AX | | - // AX | | - // Run lower_bound:[from = 5, to = 8) - // - // AX - absent items in rng2 - // - // We have three points on diagonal for call comparison: - // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 - // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 - // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 - // - where for every 
comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 - - //////////////////////////////////////////////////////////////////////////////////// - // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; - - using _IndexSigned = std::make_signed_t<_Index>; - - _IndexSigned idx1_from = __rng1_from; - _IndexSigned idx1_to = __rng1_to; - - _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); - _IndexSigned idx2_to = __index_sum - __rng1_from + 1; - - const _IndexSigned idx2_from_diff = - idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; - const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; - - idx1_to -= idx2_from_diff; - idx1_from += idx2_to_diff; - - idx2_from = __index_sum - (idx1_to - 1); - idx2_to = __index_sum - idx1_from + 1; - - //////////////////////////////////////////////////////////////////////////////////// - // Run search of split point on diagonal - - using __it_t = oneapi::dpl::counting_iterator<_Index>; - - __it_t __diag_it_begin(idx1_from); - __it_t __diag_it_end(idx1_to); - - constexpr int kValue = 1; - const __it_t __res = - std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { - const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); - return __zero_or_one < kValue; - }); - - return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; -} - -// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing -// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) -template -void -__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2, - const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) -{ - if (__start1 >= __n1) - { - 
//copying a residual of the second seq - const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng2[__start2 + __i]; - } - else if (__start2 >= __n2) - { - //copying a residual of the first seq - const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng1[__start1 + __i]; - } - else - { - for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) - { - const auto& __val1 = __rng1[__start1]; - const auto& __val2 = __rng2[__start2]; - if (__comp(__val2, __val1)) - { - __rng3[__start3 + __i] = __val2; - if (++__start2 == __n2) - { - //copying a residual of the first seq - for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) - __rng3[__start3 + __i] = __rng1[__start1]; - } - } - else - { - __rng3[__start3 + __i] = __val1; - if (++__start1 == __n1) - { - //copying a residual of the second seq - for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) - __rng3[__start3 + __i] = __rng2[__start2]; - } - } - } - } -} - -// Please see the comment for __parallel_for_submitter for optional kernel name explanation -template -struct __parallel_merge_submitter; - -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - - auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); - }); - // We should return the same thing in the second param of __future for compatibility - // with the returning value in __parallel_merge_submitter_large::operator() - return __future(__event, __result_and_scratch_storage_base_ptr{}); - } -}; - -template -struct __parallel_merge_submitter_large; - -template -struct __parallel_merge_submitter_large<_IdType, _CustomName, - __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName...>> -{ - protected: - struct nd_range_params - { - std::size_t base_diag_count = 0; - std::size_t steps_between_two_base_diags = 0; - std::uint8_t chunk = 0; - _IdType steps = 0; - }; - - // Calculate nd-range params - template - nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const - { - using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; - using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; - using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), - _Range1ValueType, _Range2ValueType>; - - const std::size_t __n = __rng1.size() + __rng2.size(); - - constexpr std::size_t __slm_bank_size = 16; // TODO is it correct value? How to get it from hardware? 
- - // Calculate how many data items we can read into one SLM bank - constexpr std::size_t __data_items_in_slm_bank = - oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - - // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - const _IdType __base_diag_count = 32 * 1'024; - const _IdType __steps_between_two_base_diags = - oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - - return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; - } - - // Calculation of split points on each base diagonal - template - sycl::event - eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, - const nd_range_params& __nd_range_params, - _Storage& __base_diagonals_sp_global_storage) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc( - __cgh, __dpl_sycl::__no_init{}); - - __cgh.parallel_for<_DiagonalsKernelName...>( - sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = - _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - - _split_point_t<_IdType> __sp = - __global_idx == 0 ? 
_split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; - - if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) - { - const _IdType __i_elem = - __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; - if (__i_elem < __n) - __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - } - - __base_diagonals_sp_global_ptr[__global_idx] = __sp; - }); - }); - - return __event; - } - - // Process parallel merge - template - sycl::event - run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, - const nd_range_params& __nd_range_params) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - const auto __chunk = __nd_range_params.chunk; - - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __chunk; - - if (__i_elem < __n1 + __n2) - { - _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - } - }); - }); - - return __event; - } - - // Process parallel merge - template - sycl::event - run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, - const _Storage& __base_diagonals_sp_global_storage) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, 
__rng2, __rng3); - auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); - - __cgh.depends_on(__event); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __nd_range_params.chunk; - - auto __base_diagonals_sp_global_ptr = - _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; - - _split_point_t<_IdType> __start; - if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) - { - const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; - - __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, - __sp_left.second, __sp_right.second, __i_elem, __comp); - } - else - { - __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; - } - - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, - __nd_range_params.chunk, __n1, __n2, __comp); - }); - }); - - return __event; - } - - public: - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - assert(__rng1.size() > 0 || __rng2.size() > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Calculate nd-range params - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - - __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; - - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = - new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( - __exec, 0, 
__nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset( - static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); - } -}; - -template -class __merge_kernel_name; - -template -class __merge_kernel_name_large; - -template -class __diagonals_kernel_name; - -template -auto -__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, - _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB - - const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __starting_size_limit_for_large_submitter) - { - using _WiIndex = std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint32_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - 
__merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint64_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - } -} - -} // namespace __par_backend_hetero -} // namespace dpl -} // namespace oneapi - -#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +// -*- C++ -*- +//===-- parallel_backend_sycl_merge.h --------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H + +#include // std::numeric_limits +#include // assert +#include // std::uint8_t, ... 
+#include // std::make_pair, std::forward +#include // std::min, std::lower_bound + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ +template +using _split_point_t = std::pair<_Index, _Index>; + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +auto +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, + const _Index __n2, _Compare __comp) +{ + //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] + oneapi::dpl::counting_iterator<_Index> __diag_it(0); + + if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed + { + const _Index __q = __i_elem; //diagonal index + const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(*__res, __q - *__res); + } + else + { + const _Index __q = __i_elem - __n2; //diagonal index + const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); + return __zero_or_one < __value; + }); 
+ return std::make_pair(__q + *__res, __n2 - *__res); + } +} + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +_split_point_t<_Index> +__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, + const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) +{ + // ----------------------- EXAMPLE ------------------------ + // Let's consider the following input data: + // rng1.size() = 10 + // rng2.size() = 6 + // i_diag = 9 + // Let's define the following ranges for processing: + // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 + // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 + // + // The goal: required to process only X' items of the merge matrix + // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) + // + // -------------------------------------------------------- + // + // __diag_it_begin(rng1) __diag_it_end(rng1) + // (init state) (dest state) (init state, dest state) + // | | | + // V V V + // + + + + + + + // \ rng1 0 1 2 3 4 5 6 7 8 9 + // rng2 +--------------------------------------+ + // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) + // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) + // + 2 | <----------------- + X'1 | | + // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) + // 4 | X ^ | | + // 5 | X | | | <--- __diag_it_begin(rng2) (init state) + // +-------AX-----------+-----------+-----+ + // AX | | + // AX | | + // Run lower_bound:[from = 5, to = 8) + // + // AX - absent items in rng2 + // + // We have three points on diagonal for call comparison: + // X'0 : call 
__comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 + // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 + // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 + // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 + + //////////////////////////////////////////////////////////////////////////////////// + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; + + using _IndexSigned = std::make_signed_t<_Index>; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + constexpr int kValue = 1; + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + return __zero_or_one < kValue; + }); + + return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; +} + +// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing +// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) +template +void +__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, 
_Rng3& __rng3, _Index __start1, _Index __start2, + const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) +{ + if (__start1 >= __n1) + { + //copying a residual of the second seq + const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng2[__start2 + __i]; + } + else if (__start2 >= __n2) + { + //copying a residual of the first seq + const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng1[__start1 + __i]; + } + else + { + for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) + { + const auto& __val1 = __rng1[__start1]; + const auto& __val2 = __rng2[__start2]; + if (__comp(__val2, __val1)) + { + __rng3[__start3 + __i] = __val2; + if (++__start2 == __n2) + { + //copying a residual of the first seq + for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) + __rng3[__start3 + __i] = __rng1[__start1]; + } + } + else + { + __rng3[__start3 + __i] = __val1; + if (++__start1 == __n1) + { + //copying a residual of the second seq + for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) + __rng3[__start3 + __i] = __rng2[__start2]; + } + } + } + } +} + +// Please see the comment for __parallel_for_submitter for optional kernel name explanation +template +struct __parallel_merge_submitter; + +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = 
__exec.queue().get_device().is_cpu() ? 128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); + }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); + } +}; + +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> +{ + protected: + struct nd_range_params + { + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; + std::uint8_t chunk = 0; + _IdType steps = 0; + }; + + // Calculate nd-range params + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const + { + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), + _Range1ValueType, _Range2ValueType>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + const _IdType __base_diag_count = 32 * 1'024; + const _IdType __steps_between_two_base_diags = + oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_DiagonalsKernelName...>( + sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + _split_point_t<_IdType> __sp = + __global_idx == 0 ? 
_split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; + + if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) + { + const _IdType __i_elem = + __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + if (__i_elem < __n) + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + } + + __base_diagonals_sp_global_ptr[__global_idx] = __sp; + }); + }); + + return __event; + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, + const nd_range_params& __nd_range_params) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + const auto __chunk = __nd_range_params.chunk; + + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __chunk; + + if (__i_elem < __n1 + __n2) + { + _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + } + }); + }); + + return __event; + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, + const _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, 
__rng2, __rng3); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + __cgh.depends_on(__event); + + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __nd_range_params.chunk; + + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; + + _split_point_t<_IdType> __start; + if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) + { + const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; + + __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, + __sp_left.second, __sp_right.second, __i_elem, __comp); + } + else + { + __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; + } + + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + __nd_range_params.chunk, __n1, __n2, __comp); + }); + }); + + return __event; + } + + public: + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + assert(__rng1.size() > 0 || __rng2.size() > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + + __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; + + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = + new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, 
__nd_range_params.base_diag_count + 1);
+        __p_result_and_scratch_storage_base.reset(
+            static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage));
+
+        sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params,
+                                                           *__p_base_diagonals_sp_global_storage);
+
+        // Merge data using split points on each base diagonal
+        __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params,
+                                     *__p_base_diagonals_sp_global_storage);
+
+        return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base));
+    }
+};
+
+template
+class __merge_kernel_name;
+
+template
+class __merge_kernel_name_large;
+
+template
+class __diagonals_kernel_name;
+
+template
+std::size_t
+starting_size_limit_for_large_submitter()
+{
+    return 4 * 1'048'576; // 4 MB
+}
+
+template <>
+std::size_t
+starting_size_limit_for_large_submitter()
+{
+    return 16 * 1'048'576; // 16 MB
+}
+
+template
+auto
+__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1,
+                 _Range2&& __rng2, _Range3&& __rng3, _Compare __comp)
+{
+    using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
+
+    using __value_type = oneapi::dpl::__internal::__value_t<_Range3>;
+
+    const std::size_t __n = __rng1.size() + __rng2.size();
+    if (__n < starting_size_limit_for_large_submitter<__value_type>())
+    {
+        using _WiIndex = std::uint32_t;
+        using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+            __merge_kernel_name<_CustomName, _WiIndex>>;
+        return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()(
+            std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
+            std::forward<_Range3>(__rng3), __comp);
+    }
+    else
+    {
+        if (__n <= std::numeric_limits::max())
+        {
+            using _WiIndex = std::uint32_t;
+            using _DiagonalsKernelName = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + } +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H From 05ff60fe2fb23638af569c52f8bccd60512af1a6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 11:28:43 +0100 Subject: [PATCH 042/144] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 895253a151e..f6c7f7a3d5c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -370,9 +370,10 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, if (__i_elem < __n1 + __n2) { - _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); + _split_point_t<_IdType> __start = + __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, + __n2, __comp); } }); }); @@ -451,11 +452,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); + *__p_base_diagonals_sp_global_storage); // Merge data using split points on each base diagonal __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); + *__p_base_diagonals_sp_global_storage); return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } From ea47019fd4af943b33ced96ea0976e6474ede0fa Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 15:13:41 +0100 Subject: [PATCH 043/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove run_parallel_merge with old implementation Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index f6c7f7a3d5c..658598af4db 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -349,38 +349,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __event; } - // Process parallel merge - template - sycl::event - run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, - const nd_range_params& __nd_range_params) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - const auto __chunk = __nd_range_params.chunk; - - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __chunk; - - if (__i_elem < __n1 + __n2) - { - _split_point_t<_IdType> __start = - __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, - __n2, __comp); - } - }); - }); - - return __event; - } - // Process parallel merge template From 73bbc141cf0ccaf69e7bd949b532708dbef2748f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 15:32:12 +0100 Subject: [PATCH 044/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix self-review comment Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 658598af4db..6b19d748400 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -440,15 +440,15 @@ 
template class __diagonals_kernel_name; template -std::size_t -starting_size_limit_for_large_submitter() +constexpr std::size_t +__get_starting_size_limit_for_large_submitter() { return 4 * 1'048'576; // 4 MB } template <> -std::size_t -starting_size_limit_for_large_submitter() +constexpr std::size_t +__get_starting_size_limit_for_large_submitter() { return 16 * 1'048'576; // 16 MB } @@ -463,9 +463,10 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < starting_size_limit_for_large_submitter<__value_type>()) + if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) { using _WiIndex = std::uint32_t; + static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= std::numeric_limits<_WiIndex>::max()); using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( From e0c1628adc625adfa59a5bcc3cb8bb1f79615700 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 16:24:42 +0100 Subject: [PATCH 045/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix self-review comments Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 658598af4db..6b19d748400 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -169,7 +169,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn constexpr int kValue = 1; const __it_t __res = 
std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); return __zero_or_one < kValue; }); @@ -252,7 +252,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { + auto __event = __exec.queue().submit([&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); __cgh.parallel_for<_MergeKernelName...>( sycl::range(__steps), [=](sycl::item __item_id) { @@ -319,7 +319,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + return __exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, __nd_range_params, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( @@ -345,22 +345,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __base_diagonals_sp_global_ptr[__global_idx] = __sp; }); }); - - return __event; } // Process parallel merge template sycl::event - run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + run_parallel_merge(const sycl::event& __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, const _Storage& __base_diagonals_sp_global_storage) const 
{ const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - __event = __exec.queue().submit([&](sycl::handler& __cgh) { + return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); @@ -394,8 +392,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __nd_range_params.chunk, __n1, __n2, __comp); }); }); - - return __event; } public: From 38166c712a1551593b0015f064e3ef9417edcd42 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 17 Dec 2024 09:44:08 +0100 Subject: [PATCH 046/144] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 2a9221595e4..6158b32dcf9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -168,11 +168,12 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn __it_t __diag_it_end(idx1_to); constexpr int kValue = 1; - const __it_t __res = - std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { - const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); - return __zero_or_one < kValue; - }); + const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, + [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { + const auto __zero_or_one = + __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + 
return __zero_or_one < kValue; + }); return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; } @@ -252,16 +253,17 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - auto __event = __exec.queue().submit([&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); - }); + auto __event = __exec.queue().submit( + [&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, + __n2, __comp); + }); + }); // We should return the same thing in the second param of __future for compatibility // with the returning value in __parallel_merge_submitter_large::operator() return __future(__event, __result_and_scratch_storage_base_ptr{}); @@ -319,7 +321,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; - return __exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, __nd_range_params, __comp](sycl::handler& __cgh) { + return 
__exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, + __nd_range_params, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( @@ -358,7 +361,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) { + return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, + __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); @@ -462,7 +466,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) { using _WiIndex = std::uint32_t; - static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= std::numeric_limits<_WiIndex>::max()); + static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= + std::numeric_limits<_WiIndex>::max()); using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( From 7b5dc422a98a87d067066cf326ff5e0b5115494b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 17 Dec 2024 12:20:49 +0100 Subject: [PATCH 047/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix ordering of captured variables in submit calls Signed-off-by: Sergey Kopienko --- 
.../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 6158b32dcf9..9ead310b63c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -254,7 +254,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); auto __event = __exec.queue().submit( - [&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) { + [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); __cgh.parallel_for<_MergeKernelName...>( sycl::range(__steps), [=](sycl::item __item_id) { @@ -321,8 +321,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; - return __exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, - __nd_range_params, __comp](sycl::handler& __cgh) { + return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, + __n1, __n2, __n](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( @@ -361,8 +361,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, - __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) { + return 
__exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __comp, __nd_range_params, + __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); From 93c731ac6564a1a93d54eda5807a0649c69d31f6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 18 Dec 2024 16:50:26 +0100 Subject: [PATCH 048/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix EOL chars Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 1016 ++++++++--------- 1 file changed, 508 insertions(+), 508 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9ead310b63c..10d4c5e7489 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -1,508 +1,508 @@ -// -*- C++ -*- -//===-- parallel_backend_sycl_merge.h --------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// -//===----------------------------------------------------------------------===// - -#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H -#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H - -#include // std::numeric_limits -#include // assert -#include // std::uint8_t, ... 
-#include // std::make_pair, std::forward -#include // std::min, std::lower_bound - -#include "sycl_defs.h" -#include "parallel_backend_sycl_utils.h" - -namespace oneapi -{ -namespace dpl -{ -namespace __par_backend_hetero -{ -template -using _split_point_t = std::pair<_Index, _Index>; - -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -auto -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) -{ - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); - - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed - { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(*__res, __q - *__res); - } - else - { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); 
- return std::make_pair(__q + *__res, __n2 - *__res); - } -} - -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -_split_point_t<_Index> -__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, - const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) -{ - // ----------------------- EXAMPLE ------------------------ - // Let's consider the following input data: - // rng1.size() = 10 - // rng2.size() = 6 - // i_diag = 9 - // Let's define the following ranges for processing: - // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 - // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 - // - // The goal: required to process only X' items of the merge matrix - // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) - // - // -------------------------------------------------------- - // - // __diag_it_begin(rng1) __diag_it_end(rng1) - // (init state) (dest state) (init state, dest state) - // | | | - // V V V - // + + + + + + - // \ rng1 0 1 2 3 4 5 6 7 8 9 - // rng2 +--------------------------------------+ - // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) - // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) - // + 2 | <----------------- + X'1 | | - // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) - // 4 | X ^ | | - // 5 | X | | | <--- __diag_it_begin(rng2) (init state) - // +-------AX-----------+-----------+-----+ - // AX | | - // AX | | - // Run lower_bound:[from = 5, to = 8) - // - // AX - absent items in rng2 - // - // We have three points on diagonal for call comparison: - // X'0 : call 
__comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 - // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 - // X'2 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 - // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 - - //////////////////////////////////////////////////////////////////////////////////// - // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; - - using _IndexSigned = std::make_signed_t<_Index>; - - _IndexSigned idx1_from = __rng1_from; - _IndexSigned idx1_to = __rng1_to; - - _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); - _IndexSigned idx2_to = __index_sum - __rng1_from + 1; - - const _IndexSigned idx2_from_diff = - idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; - const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; - - idx1_to -= idx2_from_diff; - idx1_from += idx2_to_diff; - - idx2_from = __index_sum - (idx1_to - 1); - idx2_to = __index_sum - idx1_from + 1; - - //////////////////////////////////////////////////////////////////////////////////// - // Run search of split point on diagonal - - using __it_t = oneapi::dpl::counting_iterator<_Index>; - - __it_t __diag_it_begin(idx1_from); - __it_t __diag_it_end(idx1_to); - - constexpr int kValue = 1; - const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, - [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { - const auto __zero_or_one = - __comp(__rng2[__index_sum - __idx], __rng1[__idx]); - return __zero_or_one < kValue; - }); - - return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; -} - -// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing -// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) -template -void
-__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2, - const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) -{ - if (__start1 >= __n1) - { - //copying a residual of the second seq - const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng2[__start2 + __i]; - } - else if (__start2 >= __n2) - { - //copying a residual of the first seq - const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng1[__start1 + __i]; - } - else - { - for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) - { - const auto& __val1 = __rng1[__start1]; - const auto& __val2 = __rng2[__start2]; - if (__comp(__val2, __val1)) - { - __rng3[__start3 + __i] = __val2; - if (++__start2 == __n2) - { - //copying a residual of the first seq - for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) - __rng3[__start3 + __i] = __rng1[__start1]; - } - } - else - { - __rng3[__start3 + __i] = __val1; - if (++__start1 == __n1) - { - //copying a residual of the second seq - for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) - __rng3[__start3 + __i] = __rng2[__start2]; - } - } - } - } -} - -// Please see the comment for __parallel_for_submitter for optional kernel name explanation -template -struct __parallel_merge_submitter; - -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Empirical number of 
values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - - auto __event = __exec.queue().submit( - [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, - __n2, __comp); - }); - }); - // We should return the same thing in the second param of __future for compatibility - // with the returning value in __parallel_merge_submitter_large::operator() - return __future(__event, __result_and_scratch_storage_base_ptr{}); - } -}; - -template -struct __parallel_merge_submitter_large; - -template -struct __parallel_merge_submitter_large<_IdType, _CustomName, - __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName...>> -{ - protected: - struct nd_range_params - { - std::size_t base_diag_count = 0; - std::size_t steps_between_two_base_diags = 0; - std::uint8_t chunk = 0; - _IdType steps = 0; - }; - - // Calculate nd-range params - template - nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const - { - using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; - using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; - using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), - _Range1ValueType, _Range2ValueType>; - - const std::size_t __n = __rng1.size() + __rng2.size(); - - // Empirical number of values to 
process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - const _IdType __base_diag_count = 32 * 1'024; - const _IdType __steps_between_two_base_diags = - oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - - return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; - } - - // Calculation of split points on each base diagonal - template - sycl::event - eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, - const nd_range_params& __nd_range_params, - _Storage& __base_diagonals_sp_global_storage) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, - __n1, __n2, __n](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc( - __cgh, __dpl_sycl::__no_init{}); - - __cgh.parallel_for<_DiagonalsKernelName...>( - sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = - _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - - _split_point_t<_IdType> __sp = - __global_idx == 0 ? 
_split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; - - if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) - { - const _IdType __i_elem = - __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; - if (__i_elem < __n) - __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - } - - __base_diagonals_sp_global_ptr[__global_idx] = __sp; - }); - }); - } - - // Process parallel merge - template - sycl::event - run_parallel_merge(const sycl::event& __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, - const _Storage& __base_diagonals_sp_global_storage) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __comp, __nd_range_params, - __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); - - __cgh.depends_on(__event); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __nd_range_params.chunk; - - auto __base_diagonals_sp_global_ptr = - _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; - - _split_point_t<_IdType> __start; - if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) - { - const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; - - __start = 
__find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, - __sp_left.second, __sp_right.second, __i_elem, __comp); - } - else - { - __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; - } - - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, - __nd_range_params.chunk, __n1, __n2, __comp); - }); - }); - } - - public: - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - assert(__rng1.size() > 0 || __rng2.size() > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Calculate nd-range params - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - - __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; - - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = - new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( - __exec, 0, __nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset( - static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); - } -}; - -template -class __merge_kernel_name; - -template -class __merge_kernel_name_large; - -template -class __diagonals_kernel_name; - -template -constexpr std::size_t -__get_starting_size_limit_for_large_submitter() -{ - return 4 * 1'048'576; // 4 MB -} - -template <> -constexpr std::size_t 
-__get_starting_size_limit_for_large_submitter() -{ - return 16 * 1'048'576; // 8 MB -} - -template -auto -__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, - _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; - - const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) - { - using _WiIndex = std::uint32_t; - static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= - std::numeric_limits<_WiIndex>::max()); - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint32_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint64_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - } -} - -} // namespace __par_backend_hetero -} // namespace dpl -} // namespace oneapi - -#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +// -*- C++ -*- +//===-- parallel_backend_sycl_merge.h --------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H + +#include // std::numeric_limits +#include // assert +#include // std::uint8_t, ... +#include // std::make_pair, std::forward +#include // std::min, std::lower_bound + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ +template +using _split_point_t = std::pair<_Index, _Index>; + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +auto +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, + const _Index __n2, _Compare __comp) +{ + //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] + oneapi::dpl::counting_iterator<_Index> __diag_it(0); + + if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed + { + const _Index __q = __i_elem; //diagonal index + const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(*__res, __q - *__res); + } + else + { + const _Index __q = __i_elem - __n2; //diagonal index + const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(__q + *__res, __n2 - *__res); + } +} + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +_split_point_t<_Index> +__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, + const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) +{ + // ----------------------- EXAMPLE ------------------------ + // Let's consider the following input data: + // rng1.size() = 10 + // rng2.size() = 6 + // i_diag = 9 + // Let's define the following ranges for processing: + // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 + // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 + // + // The goal: required to process only X' items of the merge matrix + // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) + // + // -------------------------------------------------------- + // + // __diag_it_begin(rng1) __diag_it_end(rng1) + // (init state) (dest state) (init state, dest state) + // | | | + // V V V + // + + + + + + + // \ rng1 0 1 2 3 4 5 6 7 8 9 + // rng2 +--------------------------------------+ + // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) + // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) + // + 2 | <----------------- + X'1 | | + // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) + // 4 | X ^ | | + // 5 | X | | | <--- __diag_it_begin(rng2) (init state) + // +-------AX-----------+-----------+-----+ + // AX | | + // AX | | + // Run lower_bound:[from = 5, to = 8) + // + // AX - absent items in rng2 + // + // We have three points on diagonal for call comparison: + // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 + // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 + // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 + // - where for every 
comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 + + //////////////////////////////////////////////////////////////////////////////////// + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; + + using _IndexSigned = std::make_signed_t<_Index>; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + constexpr int kValue = 1; + const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, + [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { + const auto __zero_or_one = + __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + return __zero_or_one < kValue; + }); + + return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; +} + +// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing +// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) +template +void +__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2, + const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) +{ 
+ if (__start1 >= __n1) + { + //copying a residual of the second seq + const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng2[__start2 + __i]; + } + else if (__start2 >= __n2) + { + //copying a residual of the first seq + const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng1[__start1 + __i]; + } + else + { + for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) + { + const auto& __val1 = __rng1[__start1]; + const auto& __val2 = __rng2[__start2]; + if (__comp(__val2, __val1)) + { + __rng3[__start3 + __i] = __val2; + if (++__start2 == __n2) + { + //copying a residual of the first seq + for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) + __rng3[__start3 + __i] = __rng1[__start1]; + } + } + else + { + __rng3[__start3 + __i] = __val1; + if (++__start1 == __n1) + { + //copying a residual of the second seq + for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) + __rng3[__start3 + __i] = __rng2[__start2]; + } + } + } + } +} + +// Please see the comment for __parallel_for_submitter for optional kernel name explanation +template +struct __parallel_merge_submitter; + +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + auto __event = __exec.queue().submit( + [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, + __n2, __comp); + }); + }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); + } +}; + +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> +{ + protected: + struct nd_range_params + { + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; + std::uint8_t chunk = 0; + _IdType steps = 0; + }; + + // Calculate nd-range params + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const + { + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), + _Range1ValueType, _Range2ValueType>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + const _IdType __base_diag_count = 32 * 1'024; + const _IdType __steps_between_two_base_diags = + oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, + __n1, __n2, __n](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_DiagonalsKernelName...>( + sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + _split_point_t<_IdType> __sp = + __global_idx == 0 ? 
_split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; + + if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) + { + const _IdType __i_elem = + __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + if (__i_elem < __n) + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + } + + __base_diagonals_sp_global_ptr[__global_idx] = __sp; + }); + }); + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(const sycl::event& __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, + const _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __comp, __nd_range_params, + __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + __cgh.depends_on(__event); + + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __nd_range_params.chunk; + + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; + + _split_point_t<_IdType> __start; + if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) + { + const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; + + __start = 
__find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, + __sp_left.second, __sp_right.second, __i_elem, __comp); + } + else + { + __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; + } + + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + __nd_range_params.chunk, __n1, __n2, __comp); + }); + }); + } + + public: + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + assert(__rng1.size() > 0 || __rng2.size() > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + + __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; + + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = + new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, __nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); + } +}; + +template +class __merge_kernel_name; + +template +class __merge_kernel_name_large; + +template +class __diagonals_kernel_name; + +template +constexpr std::size_t +__get_starting_size_limit_for_large_submitter() +{ + return 4 * 1'048'576; // 4 MB +} + +template <> +constexpr std::size_t 
+__get_starting_size_limit_for_large_submitter() +{ + return 16 * 1'048'576; // 8 MB +} + +template +auto +__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, + _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) + { + using _WiIndex = std::uint32_t; + static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= + std::numeric_limits<_WiIndex>::max()); + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + } +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H From ab004c56702037b4a5409f5b7c18588fee009470 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 18 Dec 2024 21:23:53 +0100 Subject: [PATCH 049/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix compile error after merge changes from main branch Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 68d86296fb5..5c71754edc5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -266,7 +266,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, { std::size_t base_diag_count = 0; std::size_t steps_between_two_base_diags = 0; - std::uint8_t chunk = 0; + _IdType chunk = 0; _IdType steps = 0; }; From c11e177789937c398d706c7084df985185726908 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 09:37:47 +0100 Subject: [PATCH 050/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: declare all internal staff in __parallel_merge_submitter_large as private Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 5c71754edc5..3efe8888bf1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -261,7 +261,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { - protected: + private: struct nd_range_params { std::size_t base_diag_count = 0; From 79af1a80bc3020f15e3df20d38b401cf2e9b8133 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 09:59:38 +0100 Subject: [PATCH 051/144] test/parallel_api/algorithm/alg.merge/merge.pass.cpp - expant test for long data sizes Signed-off-by: Sergey Kopienko --- .../algorithm/alg.merge/merge.pass.cpp | 43 +++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp index 34cba9f672a..2715256f3a1 100644 --- a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp +++ b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp @@ -97,24 +97,18 @@ struct test_merge_compare } }; -template +template void -test_merge_by_type(Generator1 generator1, Generator2 generator2) +test_merge_by_type(Generator1 generator1, Generator2 generator2, size_t start_size, size_t max_size, FStep fstep) { using namespace std; - size_t max_size = 100000; Sequence in1(max_size, generator1); Sequence in2(max_size / 2, generator2); Sequence out(in1.size() + in2.size()); ::std::sort(in1.begin(), in1.end()); ::std::sort(in2.begin(), in2.end()); - size_t start_size = 0; -#if TEST_DPCPP_BACKEND_PRESENT - start_size = 2; -#endif - - for (size_t size = start_size; size <= max_size; size = size <= 16 ? 
size + 1 : size_t(3.1415 * size)) { + for (size_t size = start_size; size <= max_size; size = fstep(size)) { #if !TEST_DPCPP_BACKEND_PRESENT invoke_on_all_policies<0>()(test_merge(), in1.cbegin(), in1.cbegin() + size, in2.data(), in2.data() + size / 2, out.begin(), out.begin() + 1.5 * size); @@ -139,6 +133,16 @@ test_merge_by_type(Generator1 generator1, Generator2 generator2) } } +template +void +test_merge_by_type(size_t start_size, size_t max_size, FStep fstep) +{ + test_merge_by_type([](size_t v) { return (v % 2 == 0 ? v : -v) * 3; }, [](size_t v) { return v * 2; }, start_size, max_size, fstep); +#if !ONEDPL_FPGA_DEVICE + test_merge_by_type([](size_t v) { return float64_t(v); }, [](size_t v) { return float64_t(v - 100); }, start_size, max_size, fstep); +#endif +} + template struct test_non_const { @@ -166,9 +170,24 @@ struct test_merge_tuple int main() { - test_merge_by_type([](size_t v) { return (v % 2 == 0 ? v : -v) * 3; }, [](size_t v) { return v * 2; }); -#if !ONEDPL_FPGA_DEVICE - test_merge_by_type([](size_t v) { return float64_t(v); }, [](size_t v) { return float64_t(v - 100); }); +#if TEST_DPCPP_BACKEND_PRESENT + const size_t start_size_small = 2; +#else + const size_t start_size_small = 0; +#endif + const size_t max_size_small = 100000; + auto fstep_small = [](std::size_t size){ return size <= 16 ? 
size + 1 : size_t(3.1415 * size);}; + test_merge_by_type(start_size_small, max_size_small, fstep_small); + + // Large data sizes (on GPU only) +#if TEST_DPCPP_BACKEND_PRESENT + if (!TestUtils::get_test_queue().get_device().is_cpu()) + { + const size_t start_size_large = 4'000'000; + const size_t max_size_large = 8'000'000; + auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; + test_merge_by_type(start_size_large, max_size_large, fstep_large); + } #endif #if !TEST_DPCPP_BACKEND_PRESENT From 9421906df13e63f744ead7602457cb63c76c11c9 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 17:07:05 +0100 Subject: [PATCH 052/144] Fix mistakes of main branch merge --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 32d61578d85..ee2c31bb99b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -263,11 +263,11 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I } // Please see the comment for __parallel_for_submitter for optional kernel name explanation -template +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> { template auto diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index d0b19bbe77a..9c541b6c051 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -359,7 +359,8 @@ struct __merge_sort_global_submitter<_IndexT, inline static _merge_split_point_t __find_start_point_w(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) { - return __find_start_point(__views.rng1, __views.rng2, __data_area.i_elem_local, __data_area.n1, __data_area.n2, __comp); + return __find_start_point(__views.rng1, decltype(__data_area.n1){0}, __data_area.n1, __views.rng2, + decltype(__data_area.n2){0}, __data_area.n2, __data_area.i_elem_local, __comp); } template @@ -576,7 +577,7 @@ struct __merge_sort_global_submitter<_IndexT, public: - using __container_of_temp_storages_t = std::vector<__result_and_scratch_storage_base_ptr>; + using __container_of_temp_storages_t = std::vector>; template std::tuple From c4f804dedb624c0664d57acf33950c755777f998 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 17:06:05 +0100 Subject: [PATCH 053/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - let's move __event_chain into returns __future Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 9c541b6c051..ccde206981b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -639,7 +639,7 @@ struct __merge_sort_global_submitter<_IndexT, __data_in_temp = !__data_in_temp; } - return {__event_chain, __data_in_temp, std::move(__temp_sp_storages)}; + return {std::move(__event_chain), __data_in_temp, std::move(__temp_sp_storages)}; } }; From a6654e36d89571076872000a646be81214fbd968 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 
17:14:45 +0100 Subject: [PATCH 054/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - avoid if - if - else - else in __find_or_eval_sp Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index ccde206981b..ccdf9303336 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -451,25 +451,12 @@ struct __merge_sort_global_submitter<_IndexT, const _merge_split_point_t __sp_left = __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{ 0, 0 }; const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; - if (__sp_right.first + __sp_right.second > 0) - { - if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) - { - __result = __find_start_point_in(__views.rng1, __sp_left.first, __sp_right.first, - __views.rng2, __sp_left.second, __sp_right.second, - __data_area.i_elem_local, __comp); - } - else - { - __result = __sp_left; - } - } - else - { - __result = __find_start_point_w(__data_area, __views, __comp); - } - - return __result; + return __sp_right.first + __sp_right.second > 0 + ? (__global_idx % __nd_range_params.steps_between_two_base_diags != 0 + ? 
__find_start_point_in(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, + __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp) + : __sp_left) + : __find_start_point_w(__data_area, __views, __comp); } // Process parallel merge From 766099791891c08e487d895ddb5a0fc9b4950f78 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 17:15:40 +0100 Subject: [PATCH 055/144] declare __find_or_eval_sp as inline Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index ccdf9303336..c1c47342e88 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -434,12 +434,9 @@ struct __merge_sort_global_submitter<_IndexT, } template - static _merge_split_point_t - __find_or_eval_sp(const std::size_t __global_idx, - const nd_range_params& __nd_range_params, - const WorkDataArea& __data_area, - const DropViews& __views, - _Compare __comp, + inline static _merge_split_point_t + __find_or_eval_sp(const std::size_t __global_idx, const nd_range_params& __nd_range_params, + const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp, _BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr) { _merge_split_point_t __result(0, 0); From e0388c48a723c4316465627cf4cd01f279f39dc1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 17:18:07 +0100 Subject: [PATCH 056/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - remove local variables from __subgroup_bubble_sorter::sort Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index c1c47342e88..d5fbb872926 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -47,12 +47,10 @@ struct __subgroup_bubble_sorter { for (std::uint32_t j = __start + 1; j < __start + __end - i; ++j) { - auto& __first_item = __storage_acc[j - 1]; - auto& __second_item = __storage_acc[j]; - if (__comp(__second_item, __first_item)) + if (__comp(__storage_acc[j], __storage_acc[j - 1])) { using std::swap; - swap(__first_item, __second_item); + swap(__storage_acc[j - 1], __storage_acc[j]); } } } From 247dda955315ffde7c645d50902c8f84a7dfae82 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 17:20:04 +0100 Subject: [PATCH 057/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - remove extra logic from __merge_sort_global_submitter::eval_nd_range_params Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index d5fbb872926..2e05f380522 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -342,12 +342,8 @@ struct __merge_sort_global_submitter<_IndexT, const std::uint32_t __chunk = __is_cpu ? 32 : 4; const std::size_t __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); + // TODO required to evaluate this value based on available SLM size for each work-group. 
_IndexT __base_diag_count = 32 * 1'024; // 32 Kb - - while (__n_sorted <= __base_diag_count) - __n_sorted = __n_sorted * 2; - __base_diag_count = __n_sorted / 2; - _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return { __base_diag_count, __steps_between_two_base_diags, __chunk, __steps }; From d52418ab40127f0f8fe5f0a7887be8c9e172f4b4 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 17:25:35 +0100 Subject: [PATCH 058/144] Applying __get_starting_size_limit_for_large_submitter in __merge_sort_global_submitter::operator() Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 2e05f380522..a8f9deb527c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -240,8 +240,6 @@ struct __merge_sort_global_submitter<_IndexT, using _merge_split_point_t = _split_point_t<_IndexT>; - static constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB - struct nd_range_params { std::size_t base_diag_count = 0; @@ -567,6 +565,8 @@ struct __merge_sort_global_submitter<_IndexT, bool __data_in_temp = false; + using __value_type = oneapi::dpl::__internal::__value_t<_Range>; + // Calculate nd-range params const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n, __n_sorted); @@ -583,7 +583,15 @@ struct __merge_sort_global_submitter<_IndexT, for (std::int64_t __i = 0; __i < __n_iter; ++__i) { - if (2 * __n_sorted >= __starting_size_limit_for_large_submitter) + if (2 * __n_sorted < __get_starting_size_limit_for_large_submitter<__value_type>()) + { + // Process parallel merge + 
__event_chain = run_parallel_merge(__event_chain, + __n_sorted, __data_in_temp, + __exec, __rng, __temp_buf, __comp, + __nd_range_params); + } + else { // Create storage for save split-points on each base diagonal // - for current iteration @@ -604,14 +612,6 @@ struct __merge_sort_global_submitter<_IndexT, __nd_range_params, *__p_base_diagonals_sp_storage); } - else - { - // Process parallel merge - __event_chain = run_parallel_merge(__event_chain, - __n_sorted, __data_in_temp, - __exec, __rng, __temp_buf, __comp, - __nd_range_params); - } __n_sorted *= 2; __data_in_temp = !__data_in_temp; From 6d532793a3011219510e80a1dcadba2df14efa75 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 17:33:19 +0100 Subject: [PATCH 059/144] Fix mistakes of main branch merge Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index a8f9deb527c..d31ccd9601d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -337,8 +337,8 @@ struct __merge_sort_global_submitter<_IndexT, eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size, _IndexT __n_sorted) const { const bool __is_cpu = __exec.queue().get_device().is_cpu(); - const std::uint32_t __chunk = __is_cpu ? 32 : 4; - const std::size_t __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); + const _IndexT __chunk = __is_cpu ? 32 : 4; + const _IndexT __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); // TODO required to evaluate this value based on available SLM size for each work-group. 
_IndexT __base_diag_count = 32 * 1'024; // 32 Kb From 8723c98eed002f8aa45a879c9fc74eda7b5f2adc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 17:58:23 +0100 Subject: [PATCH 060/144] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 152 ++++++++---------- 1 file changed, 66 insertions(+), 86 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index d31ccd9601d..f71413a92e7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -178,7 +178,7 @@ struct __leaf_sorter // 3. Sort on work-group level bool __data_in_temp = __group_sorter.sort(__item, __storage_acc, __comp, static_cast(0), __adjusted_process_size, - /*sorted per sub-group*/ __data_per_workitem, __data_per_workitem, __workgroup_size); + /*sorted per sub-group*/ __data_per_workitem, __data_per_workitem, __workgroup_size); // barrier is not needed here because of the barrier inside the sort method // 4. 
Store @@ -230,14 +230,13 @@ struct __merge_sort_leaf_submitter<__internal::__optional_kernel_name<_LeafSortN template struct __merge_sort_global_submitter; -template -struct __merge_sort_global_submitter<_IndexT, - __internal::__optional_kernel_name<_DiagonalsKernelName...>, +template +struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_GlobalSortName1...>, __internal::__optional_kernel_name<_GlobalSortName2...>> { -protected: - + protected: using _merge_split_point_t = _split_point_t<_IndexT>; struct nd_range_params @@ -251,7 +250,7 @@ struct __merge_sort_global_submitter<_IndexT, struct WorkDataArea { // How WorkDataArea is implemented : - // + // // i_elem_local // | // offset | i_elem @@ -286,14 +285,13 @@ struct __merge_sort_global_submitter<_IndexT, // | | // +---------------------------+ - _IndexT i_elem = 0; // Global diagonal index - _IndexT i_elem_local = 0; // Local diagonal index - _IndexT offset = 0; // Offset to the first element in the subrange (i.e. the first element of the first subrange for merge) - _IndexT n1 = 0; // Size of the first subrange - _IndexT n2 = 0; // Size of the second subrange + _IndexT i_elem = 0; // Global diagonal index + _IndexT i_elem_local = 0; // Local diagonal index + _IndexT offset = 0; // Offset to the first element in the subrange (i.e. 
the first element of the first subrange for merge) + _IndexT n1 = 0; // Size of the first subrange + _IndexT n2 = 0; // Size of the second subrange - WorkDataArea(const std::size_t __n, const std::size_t __n_sorted, - const std::size_t __linear_id, + WorkDataArea(const std::size_t __n, const std::size_t __n_sorted, const std::size_t __linear_id, const std::size_t __chunk) { // Calculate global diagonal index @@ -326,9 +324,9 @@ struct __merge_sort_global_submitter<_IndexT, __drop_view_simple_t rng2; DropViews(Rng& __rng, const WorkDataArea& __data_area) - : rng1(__rng, __data_area.offset) - , rng2(__rng, __data_area.offset + __data_area.n1) - {} + : rng1(__rng, __data_area.offset), rng2(__rng, __data_area.offset + __data_area.n1) + { + } }; // Calculate nd-range params @@ -341,40 +339,34 @@ struct __merge_sort_global_submitter<_IndexT, const _IndexT __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); // TODO required to evaluate this value based on available SLM size for each work-group. 
- _IndexT __base_diag_count = 32 * 1'024; // 32 Kb + _IndexT __base_diag_count = 32 * 1'024; // 32 Kb _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - return { __base_diag_count, __steps_between_two_base_diags, __chunk, __steps }; + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; } template - inline - static _merge_split_point_t __find_start_point_w(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) + inline static _merge_split_point_t + __find_start_point_w(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) { return __find_start_point(__views.rng1, decltype(__data_area.n1){0}, __data_area.n1, __views.rng2, decltype(__data_area.n2){0}, __data_area.n2, __data_area.i_elem_local, __comp); } template - inline - static void __serial_merge_w(const nd_range_params& __nd_range_params, - const WorkDataArea& __data_area, - const DropViews& __views, _Rng& __rng, - const _merge_split_point_t& __sp, - _Compare __comp) + inline static void + __serial_merge_w(const nd_range_params& __nd_range_params, const WorkDataArea& __data_area, + const DropViews& __views, _Rng& __rng, const _merge_split_point_t& __sp, _Compare __comp) { - __serial_merge(__views.rng1, __views.rng2, __rng /* rng3 */, - __sp.first /* start1 */, __sp.second /* start2 */, __data_area.i_elem /* start3 */, - __nd_range_params.chunk, - __data_area.n1, __data_area.n2, + __serial_merge(__views.rng1, __views.rng2, __rng /* rng3 */, __sp.first /* start1 */, __sp.second /* start2 */, + __data_area.i_elem /* start3 */, __nd_range_params.chunk, __data_area.n1, __data_area.n2, __comp); } // Calculation of split points on each base diagonal template sycl::event - eval_split_points_for_groups(const sycl::event& __event_chain, - const _IndexT __n_sorted, const bool __data_in_temp, + eval_split_points_for_groups(const sycl::event& __event_chain, const _IndexT __n_sorted, const bool 
__data_in_temp, _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, const nd_range_params& __nd_range_params, _Storage& __base_diagonals_sp_global_storage) const @@ -382,12 +374,12 @@ struct __merge_sort_global_submitter<_IndexT, const _IndexT __n = __rng.size(); return __exec.queue().submit([&, __event_chain](sycl::handler& __cgh) { - __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); - auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( - __cgh, __dpl_sycl::__no_init{}); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); @@ -395,16 +387,19 @@ struct __merge_sort_global_submitter<_IndexT, // +1 doesn't required here, because we need to calculate split points for each base diagonal // and for the right base diagonal in the last work-group but we can keep it one position to the left // because we know that for 0-diagonal the split point is { 0, 0 }. 
- sycl::range(__nd_range_params.base_diag_count /*+ 1*/), [=](sycl::item __item_id) { - + sycl::range(__nd_range_params.base_diag_count /*+ 1*/), + [=](sycl::item __item_id) { const std::size_t __linear_id = __item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); // We should add `1` to __linear_id here to avoid calculation of split-point for 0-diagonal - const WorkDataArea __data_area(__n, __n_sorted, __linear_id + 1, __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags); + const WorkDataArea __data_area(__n, __n_sorted, __linear_id + 1, + __nd_range_params.chunk * + __nd_range_params.steps_between_two_base_diags); - _merge_split_point_t __sp{ 0, 0}; + _merge_split_point_t __sp{0, 0}; if (__data_area.is_i_elem_local_inside_merge_matrix()) { @@ -437,7 +432,8 @@ struct __merge_sort_global_submitter<_IndexT, assert(__diagonal_idx < __nd_range_params.base_diag_count); - const _merge_split_point_t __sp_left = __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{ 0, 0 }; + const _merge_split_point_t __sp_left = + __diagonal_idx > 0 ? 
__base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; return __sp_right.first + __sp_right.second > 0 @@ -451,15 +447,13 @@ struct __merge_sort_global_submitter<_IndexT, // Process parallel merge template sycl::event - run_parallel_merge(const sycl::event& __event_chain, - const _IndexT __n_sorted, const bool __data_in_temp, + run_parallel_merge(const sycl::event& __event_chain, const _IndexT __n_sorted, const bool __data_in_temp, _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, const nd_range_params& __nd_range_params) const { const _IndexT __n = __rng.size(); return __exec.queue().submit([&, __event_chain](sycl::handler& __cgh) { - __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); @@ -467,7 +461,6 @@ struct __merge_sort_global_submitter<_IndexT, __cgh.parallel_for<_GlobalSortName1...>( sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - const std::size_t __linear_id = __item_id.get_linear_id(); const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __nd_range_params.chunk); @@ -493,32 +486,29 @@ struct __merge_sort_global_submitter<_IndexT, } // Process parallel merge with usage of split-points on base diagonals - template + template sycl::event - run_parallel_merge(const sycl::event& __event_chain, - const _IndexT __n_sorted, const bool __data_in_temp, + run_parallel_merge(const sycl::event& __event_chain, const _IndexT __n_sorted, const bool __data_in_temp, _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, - const nd_range_params& __nd_range_params, - _Storage& __base_diagonals_sp_global_storage) const + const nd_range_params& __nd_range_params, _Storage& __base_diagonals_sp_global_storage) const { const _IndexT __n = __rng.size(); - return __exec.queue().submit([&,__event_chain](sycl::handler& __cgh) { - + return 
__exec.queue().submit([&, __event_chain](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); - auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); __cgh.parallel_for<_GlobalSortName2...>( sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - const std::size_t __linear_id = __item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __nd_range_params.chunk); @@ -528,22 +518,18 @@ struct __merge_sort_global_submitter<_IndexT, { DropViews __views(__dst, __data_area); - const auto __sp = __find_or_eval_sp(__linear_id /* __global_idx */, - __nd_range_params, - __data_area, __views, - __comp, - __base_diagonals_sp_global_ptr); + const auto __sp = + __find_or_eval_sp(__linear_id /* __global_idx */, __nd_range_params, __data_area, + __views, __comp, __base_diagonals_sp_global_ptr); __serial_merge_w(__nd_range_params, __data_area, __views, __rng, __sp, __comp); } else { DropViews __views(__rng, __data_area); - const auto __sp = __find_or_eval_sp(__linear_id /* __global_idx */, - __nd_range_params, - __data_area, __views, - __comp, - __base_diagonals_sp_global_ptr); + const auto __sp = + __find_or_eval_sp(__linear_id /* __global_idx */, __nd_range_params, __data_area, + __views, __comp, __base_diagonals_sp_global_ptr); __serial_merge_w(__nd_range_params, __data_area, __views, __dst, __sp, __comp); } } @@ -551,8 +537,7 @@ struct __merge_sort_global_submitter<_IndexT, }); } -public: - + public: using 
__container_of_temp_storages_t = std::vector>; template @@ -586,31 +571,25 @@ struct __merge_sort_global_submitter<_IndexT, if (2 * __n_sorted < __get_starting_size_limit_for_large_submitter<__value_type>()) { // Process parallel merge - __event_chain = run_parallel_merge(__event_chain, - __n_sorted, __data_in_temp, - __exec, __rng, __temp_buf, __comp, - __nd_range_params); + __event_chain = run_parallel_merge(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, + __comp, __nd_range_params); } else { // Create storage for save split-points on each base diagonal // - for current iteration - auto __p_base_diagonals_sp_storage = new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params.base_diag_count); + auto __p_base_diagonals_sp_storage = + new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params.base_diag_count); __temp_sp_storages[__i].reset(__p_base_diagonals_sp_storage); // Calculation of split-points on each base diagonal - __event_chain = eval_split_points_for_groups(__event_chain, - __n_sorted, __data_in_temp, - __exec, __rng, __temp_buf, __comp, - __nd_range_params, - *__p_base_diagonals_sp_storage); + __event_chain = + eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, + __comp, __nd_range_params, *__p_base_diagonals_sp_storage); // Process parallel merge with usage of split-points on base diagonals - __event_chain = run_parallel_merge(__event_chain, - __n_sorted, __data_in_temp, - __exec, __rng, __temp_buf, __comp, - __nd_range_params, - *__p_base_diagonals_sp_storage); + __event_chain = run_parallel_merge(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, + __comp, __nd_range_params, *__p_base_diagonals_sp_storage); } __n_sorted *= 2; @@ -690,8 +669,9 @@ __merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSo // 2. 
Merge sorting oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _Tp> __temp(__exec, __rng.size()); auto __temp_buf = __temp.get_buffer(); - auto [__event_sort, __data_in_temp, __temp_sp_storages] = __merge_sort_global_submitter<_IndexT, _DiagonalsKernelName, _GlobalSortKernel1, _GlobalSortKernel2>()( - __exec, __rng, __comp, __leaf_sorter.__process_size, __temp_buf, __event_leaf_sort); + auto [__event_sort, __data_in_temp, __temp_sp_storages] = + __merge_sort_global_submitter<_IndexT, _DiagonalsKernelName, _GlobalSortKernel1, _GlobalSortKernel2>()( + __exec, __rng, __comp, __leaf_sorter.__process_size, __temp_buf, __event_leaf_sort); // 3. If the data remained in the temporary buffer then copy it back if (__data_in_temp) From 57e1837bb9c5a5ecd44611d94956f59f296ce6f6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 17:59:09 +0100 Subject: [PATCH 061/144] Fix review comment - declare all staff in __merge_sort_global_submitter as private Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index f71413a92e7..61148790a11 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -236,7 +236,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __internal::__optional_kernel_name<_GlobalSortName1...>, __internal::__optional_kernel_name<_GlobalSortName2...>> { - protected: + private: using _merge_split_point_t = _split_point_t<_IndexT>; struct nd_range_params From 0ce0f6d795e15a733163cd7bfeb38f945f15157e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 16:05:15 +0100 Subject: [PATCH 062/144] 
test/parallel_api/algorithm/alg.sorting/sort.pass.cpp - extend test for the largest data sizes Signed-off-by: Sergey Kopienko --- .../algorithm/alg.sorting/sort.pass.cpp | 88 ++++++++++++------- 1 file changed, 56 insertions(+), 32 deletions(-) diff --git a/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp b/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp index e39dee0f8b9..077c33ac8ba 100644 --- a/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp +++ b/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp @@ -360,11 +360,11 @@ test_default_name_gen(Convert convert, size_t n) #endif //TEST_DPCPP_BACKEND_PRESENT -template <::std::size_t CallNumber, typename T, typename Compare, typename Convert> +template <::std::size_t CallNumber, typename T, typename Compare, typename Convert, typename FStep> void -test_sort(Compare compare, Convert convert) +test_sort(Compare compare, Convert convert, size_t start_size, size_t max_size, FStep fstep) { - for (size_t n = 0; n < 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + for (size_t n = start_size; n <= max_size; n = fstep(n)) { LastIndex = n + 2; // The rand()%(2*n+1) encourages generation of some duplicates. @@ -408,48 +408,42 @@ struct NonConstCmp } }; -int -main() +template +void +test_sort(size_t start_size, size_t max_size, FStep fstep) { - ::std::srand(42); - std::int32_t start = 0; - std::int32_t end = 2; -#ifndef _PSTL_TEST_SORT - start = 1; -#endif // #ifndef _PSTL_TEST_SORT -#ifndef _PSTL_TEST_STABLE_SORT - end = 1; -#endif // _PSTL_TEST_STABLE_SORT - for (std::int32_t kind = start; kind < end; ++kind) - { - Stable = kind != 0; - #if !TEST_DPCPP_BACKEND_PRESENT // ParanoidKey has atomic increment in ctors. 
It's not allowed in kernel test_sort<0, ParanoidKey>(KeyCompare(TestUtils::OddTag()), - [](size_t k, size_t val) { return ParanoidKey(k, val, TestUtils::OddTag()); }); + [](size_t k, size_t val) { return ParanoidKey(k, val, TestUtils::OddTag()); }, + start_size, max_size, fstep); #endif // !TEST_DPCPP_BACKEND_PRESENT #if !ONEDPL_FPGA_DEVICE - test_sort<10, TestUtils::float32_t>([](TestUtils::float32_t x, TestUtils::float32_t y) { return x < y; }, - [](size_t k, size_t val) - { return TestUtils::float32_t(val) * (k % 2 ? 1 : -1); }); + test_sort([](TestUtils::float32_t x, TestUtils::float32_t y) { return x < y; }, + [](size_t k, size_t val) + { return TestUtils::float32_t(val) * (k % 2 ? 1 : -1); }, + start_size, max_size, fstep); - test_sort<20, unsigned char>([](unsigned char x, unsigned char y) - { return x > y; }, // Reversed so accidental use of < will be detected. - [](size_t k, size_t val) { return (unsigned char)val; }); + test_sort([](unsigned char x, unsigned char y) + { return x > y; }, // Reversed so accidental use of < will be detected. + [](size_t k, size_t val) { return (unsigned char)val; }, + start_size, max_size, fstep); - test_sort<30, unsigned char>(NonConstCmp{}, [](size_t k, size_t val) { return (unsigned char)val; }); + test_sort(NonConstCmp{}, [](size_t k, size_t val) { return (unsigned char)val; }, + start_size, max_size, fstep); #endif // !ONEDPL_FPGA_DEVICE - test_sort<40, std::int32_t>([](std::int32_t x, std::int32_t y) - { return x > y; }, // Reversed so accidental use of < will be detected. - [](size_t k, size_t val) { return std::int32_t(val) * (k % 2 ? 1 : -1); }); + test_sort([](std::int32_t x, std::int32_t y) + { return x > y; }, // Reversed so accidental use of < will be detected. + [](size_t k, size_t val) { return std::int32_t(val) * (k % 2 ? 1 : -1); }, + start_size, max_size, fstep); - test_sort<50, std::int16_t>( + test_sort( std::greater(), [](size_t k, size_t val) { - return std::int16_t(val) * (k % 2 ? 
1 : -1); }); + return std::int16_t(val) * (k % 2 ? 1 : -1); }, + start_size, max_size, fstep); #if TEST_DPCPP_BACKEND_PRESENT auto convert = [](size_t k, size_t val) { @@ -466,8 +460,38 @@ main() } return sycl::bit_cast(raw); }; - test_sort<60, sycl::half>(std::greater(), convert); + test_sort(std::greater(), convert, + start_size, max_size, fstep); #endif +} + +int +main() +{ + ::std::srand(42); + std::int32_t start = 0; + std::int32_t end = 2; +#ifndef _PSTL_TEST_SORT + start = 1; +#endif // #ifndef _PSTL_TEST_SORT +#ifndef _PSTL_TEST_STABLE_SORT + end = 1; +#endif // _PSTL_TEST_STABLE_SORT + + const size_t start_size_small = 0; + const size_t max_size_small = 100'000; + auto fstep_small = [](std::size_t size){ return size <= 16 ? size + 1 : size_t(3.1415 * size);}; + + const size_t start_size_large = 4'000'000; + const size_t max_size_large = 8'000'000; + auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; + + for (std::int32_t kind = start; kind < end; ++kind) + { + Stable = kind != 0; + + test_sort<100>(start_size_small, max_size_small, fstep_small); + test_sort<200>(start_size_large, max_size_large, fstep_large); } #if TEST_DPCPP_BACKEND_PRESENT From 46d6a38f4ca47e6779f68a6d93e1d41264c1bce6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 12:53:44 +0100 Subject: [PATCH 063/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h -remove unused local variable Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 61148790a11..2e9adf0c7b7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -426,8 +426,6 @@ struct __merge_sort_global_submitter<_IndexT, 
__internal::__optional_kernel_name const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp, _BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr) { - _merge_split_point_t __result(0, 0); - std::size_t __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; assert(__diagonal_idx < __nd_range_params.base_diag_count); From 7251041abfc0cc967ba53f18e2b2be8092fec764 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 12:53:06 +0100 Subject: [PATCH 064/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - rename __find_or_eval_sp to __lookup_sp Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 2e9adf0c7b7..2ea8cba95a8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -422,7 +422,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name template inline static _merge_split_point_t - __find_or_eval_sp(const std::size_t __global_idx, const nd_range_params& __nd_range_params, + __lookup_sp(const std::size_t __global_idx, const nd_range_params& __nd_range_params, const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp, _BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr) { @@ -516,18 +516,16 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { DropViews __views(__dst, __data_area); - const auto __sp = - __find_or_eval_sp(__linear_id /* __global_idx */, __nd_range_params, __data_area, - __views, __comp, __base_diagonals_sp_global_ptr); + const auto __sp = __lookup_sp(__linear_id /* __global_idx */, __nd_range_params, + __data_area, 
__views, __comp, __base_diagonals_sp_global_ptr); __serial_merge_w(__nd_range_params, __data_area, __views, __rng, __sp, __comp); } else { DropViews __views(__rng, __data_area); - const auto __sp = - __find_or_eval_sp(__linear_id /* __global_idx */, __nd_range_params, __data_area, - __views, __comp, __base_diagonals_sp_global_ptr); + const auto __sp = __lookup_sp(__linear_id /* __global_idx */, __nd_range_params, + __data_area, __views, __comp, __base_diagonals_sp_global_ptr); __serial_merge_w(__nd_range_params, __data_area, __views, __dst, __sp, __comp); } } From 7906635803befb86d234cac2da5a8e5b68027e76 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 22:26:26 +0100 Subject: [PATCH 065/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix an error in tests Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 2ea8cba95a8..7a2c6e4fbe0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -340,6 +340,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // TODO required to evaluate this value based on available SLM size for each work-group. 
_IndexT __base_diag_count = 32 * 1'024; // 32 Kb + + // TODO required to rewrite this without loop + while (__n_sorted <= __base_diag_count) + __n_sorted = __n_sorted * 2; + __base_diag_count = __n_sorted / 2; + _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; From 8e9159bcae86fa78c8a783241266f5fe5bb49a94 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 22:40:04 +0100 Subject: [PATCH 066/144] Revert "include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix an error in tests" This reverts commit 7906635803befb86d234cac2da5a8e5b68027e76. --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 7a2c6e4fbe0..2ea8cba95a8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -340,12 +340,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // TODO required to evaluate this value based on available SLM size for each work-group. 
_IndexT __base_diag_count = 32 * 1'024; // 32 Kb - - // TODO required to rewrite this without loop - while (__n_sorted <= __base_diag_count) - __n_sorted = __n_sorted * 2; - __base_diag_count = __n_sorted / 2; - _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; From 557c3f31fcba6fedf8bcf30bcd47864ebc78f693 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 23:12:13 +0100 Subject: [PATCH 067/144] Fix error: __find_start_point_in has been renamed to __find_start_point earlier Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 98 ------------------- .../dpcpp/parallel_backend_sycl_merge_sort.h | 4 +- 2 files changed, 2 insertions(+), 100 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ee2c31bb99b..36860f2d449 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -130,104 +130,6 @@ __find_start_point(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_ return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; } -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -_split_point_t<_Index> -__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, - const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) -{ - // ----------------------- EXAMPLE ------------------------ - // Let's consider the following input data: - // rng1.size() = 10 - // rng2.size() = 6 - // i_diag = 9 - // Let's define the following ranges for processing: - // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 - // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 - // - // The goal: required to process only X' items of the merge matrix - // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) - // - // -------------------------------------------------------- - // - // __diag_it_begin(rng1) __diag_it_end(rng1) - // (init state) (dest state) (init state, dest state) - // | | | - // V V V - // + + + + + + - // \ rng1 0 1 2 3 4 5 6 7 8 9 - // rng2 +--------------------------------------+ - // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) - // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) - // + 2 | <----------------- + X'1 | | - // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) - // 4 | X ^ | | - // 5 | X | | | <--- __diag_it_begin(rng2) (init state) - // +-------AX-----------+-----------+-----+ - // AX | | - // AX | | - // Run lower_bound:[from = 5, to = 8) - // - // AX - absent items in rng2 - // - // We have three points on diagonal for call comparison: - // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 - // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 - // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 - // - where for every 
comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 - - //////////////////////////////////////////////////////////////////////////////////// - // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; - - using _IndexSigned = std::make_signed_t<_Index>; - - _IndexSigned idx1_from = __rng1_from; - _IndexSigned idx1_to = __rng1_to; - - _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); - _IndexSigned idx2_to = __index_sum - __rng1_from + 1; - - const _IndexSigned idx2_from_diff = - idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; - const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; - - idx1_to -= idx2_from_diff; - idx1_from += idx2_to_diff; - - idx2_from = __index_sum - (idx1_to - 1); - idx2_to = __index_sum - idx1_from + 1; - - //////////////////////////////////////////////////////////////////////////////////// - // Run search of split point on diagonal - - using __it_t = oneapi::dpl::counting_iterator<_Index>; - - __it_t __diag_it_begin(idx1_from); - __it_t __diag_it_end(idx1_to); - - constexpr int kValue = 1; - const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, - [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { - const auto __zero_or_one = - __comp(__rng2[__index_sum - __idx], __rng1[__idx]); - return __zero_or_one < kValue; - }); - - return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; -} - // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing // to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 2ea8cba95a8..70996258eac 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -436,8 +436,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name return __sp_right.first + __sp_right.second > 0 ? (__global_idx % __nd_range_params.steps_between_two_base_diags != 0 - ? __find_start_point_in(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, - __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp) + ? __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, + __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp) : __sp_left) : __find_start_point_w(__data_area, __views, __comp); } From 191d6086aaadfd3167785dece6ff188bbe8db8bb Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 23:14:54 +0100 Subject: [PATCH 068/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix an error in tests Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 70996258eac..93b758cf0cd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -332,7 +332,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Calculate nd-range params template nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size, _IndexT __n_sorted) const + eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size) const { const bool __is_cpu = __exec.queue().get_device().is_cpu(); const _IndexT __chunk = __is_cpu ? 
32 : 4; @@ -549,7 +549,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name using __value_type = oneapi::dpl::__internal::__value_t<_Range>; // Calculate nd-range params - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n, __n_sorted); + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n); using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _merge_split_point_t>; @@ -572,20 +572,25 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } else { + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); + nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted)); + __nd_range_params_this.steps *= __portions; + __nd_range_params_this.base_diag_count *= __portions; + // Create storage for save split-points on each base diagonal // - for current iteration auto __p_base_diagonals_sp_storage = - new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params.base_diag_count); + new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params_this.base_diag_count); __temp_sp_storages[__i].reset(__p_base_diagonals_sp_storage); // Calculation of split-points on each base diagonal __event_chain = eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, - __comp, __nd_range_params, *__p_base_diagonals_sp_storage); + __comp, __nd_range_params_this, *__p_base_diagonals_sp_storage); // Process parallel merge with usage of split-points on base diagonals __event_chain = run_parallel_merge(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, - __comp, __nd_range_params, *__p_base_diagonals_sp_storage); + __comp, __nd_range_params_this, *__p_base_diagonals_sp_storage); } __n_sorted *= 2; From ecd7b481263704a99e265b3c78cc4ec8ea5750cc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 23:15:32 
+0100 Subject: [PATCH 069/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - refactoring of __merge_sort_global_submitter __lookup_sp Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 93b758cf0cd..4f75731432c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -434,10 +434,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const bool __is_base_diagonal = __global_idx % __nd_range_params.steps_between_two_base_diags == 0; + return __sp_right.first + __sp_right.second > 0 - ? (__global_idx % __nd_range_params.steps_between_two_base_diags != 0 - ? __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, - __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp) + ? (!__is_base_diagonal + ? 
__find_start_point_in(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, + __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp) : __sp_left) : __find_start_point_w(__data_area, __views, __comp); } From 878e1fd76c66615fc93a6322dc9df75dc0d952eb Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 23:24:30 +0100 Subject: [PATCH 070/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - refactoring of __merge_sort_global_submitter::eval_split_points_for_groups Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 4f75731432c..cfc4c843055 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -399,23 +399,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags); - _merge_split_point_t __sp{0, 0}; - - if (__data_area.is_i_elem_local_inside_merge_matrix()) - { - if (__data_in_temp) - { - DropViews __views(__dst, __data_area); - __sp = __find_start_point_w(__data_area, __views, __comp); - } - else - { - DropViews __views(__rng, __data_area); - __sp = __find_start_point_w(__data_area, __views, __comp); - } - } - - __base_diagonals_sp_global_ptr[__linear_id] = __sp; + __base_diagonals_sp_global_ptr[__linear_id] = + __data_area.is_i_elem_local_inside_merge_matrix() + ? (__data_in_temp + ? 
__find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp) + : __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp)) + : _merge_split_point_t{__data_area.n1, __data_area.n2}; }); }); } From 238c90c9837567037dbd97b9ff66d65bba06532a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 23:27:49 +0100 Subject: [PATCH 071/144] test/parallel_api/algorithm/alg.sorting/sort.pass.cpp - test sort for largest data sizes on GPU only Signed-off-by: Sergey Kopienko --- test/parallel_api/algorithm/alg.sorting/sort.pass.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp b/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp index 077c33ac8ba..e6b5ba36829 100644 --- a/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp +++ b/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp @@ -482,16 +482,23 @@ main() const size_t max_size_small = 100'000; auto fstep_small = [](std::size_t size){ return size <= 16 ? 
size + 1 : size_t(3.1415 * size);}; + // Large data sizes (on GPU only) +#if TEST_DPCPP_BACKEND_PRESENT const size_t start_size_large = 4'000'000; const size_t max_size_large = 8'000'000; auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; +#endif for (std::int32_t kind = start; kind < end; ++kind) { Stable = kind != 0; test_sort<100>(start_size_small, max_size_small, fstep_small); + + // Large data sizes (on GPU only) +#if TEST_DPCPP_BACKEND_PRESENT test_sort<200>(start_size_large, max_size_large, fstep_large); +#endif } #if TEST_DPCPP_BACKEND_PRESENT From dc917ff738f86aa4151621c9009dc51604e9b7c4 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 23:29:18 +0100 Subject: [PATCH 072/144] test/parallel_api/algorithm/alg.sorting/sort.pass.cpp - test sort for largest data sizes on GPU only Signed-off-by: Sergey Kopienko --- .../algorithm/alg.sorting/sort.pass.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp b/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp index e6b5ba36829..500e8f43035 100644 --- a/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp +++ b/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp @@ -482,22 +482,22 @@ main() const size_t max_size_small = 100'000; auto fstep_small = [](std::size_t size){ return size <= 16 ? 
size + 1 : size_t(3.1415 * size);}; - // Large data sizes (on GPU only) -#if TEST_DPCPP_BACKEND_PRESENT - const size_t start_size_large = 4'000'000; - const size_t max_size_large = 8'000'000; - auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; -#endif - for (std::int32_t kind = start; kind < end; ++kind) { Stable = kind != 0; test_sort<100>(start_size_small, max_size_small, fstep_small); - // Large data sizes (on GPU only) + // Large data sizes (on GPU only) #if TEST_DPCPP_BACKEND_PRESENT - test_sort<200>(start_size_large, max_size_large, fstep_large); + if (!TestUtils::get_test_queue().get_device().is_cpu()) + { + const size_t start_size_large = 4'000'000; + const size_t max_size_large = 8'000'000; + auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; + + test_sort<200>(start_size_large, max_size_large, fstep_large); + } #endif } From d2b13554f441ed6f72d525528bedcfaa9b5be4cd Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 23:31:25 +0100 Subject: [PATCH 073/144] Fix error: __find_start_point_in has been renamed to __find_start_point earlier Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index cfc4c843055..27d2fad5269 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -427,8 +427,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name return __sp_right.first + __sp_right.second > 0 ? (!__is_base_diagonal - ? __find_start_point_in(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, - __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp) + ? 
__find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, + __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp) : __sp_left) : __find_start_point_w(__data_area, __views, __comp); } From e3085fc379f4a155f0244be8ad25aaf38fc1a112 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 22 Dec 2024 23:33:59 +0100 Subject: [PATCH 074/144] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 27d2fad5269..0bec910e931 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -287,7 +287,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name _IndexT i_elem = 0; // Global diagonal index _IndexT i_elem_local = 0; // Local diagonal index - _IndexT offset = 0; // Offset to the first element in the subrange (i.e. the first element of the first subrange for merge) + // Offset to the first element in the subrange (i.e. 
the first element of the first subrange for merge) + _IndexT offset = 0; _IndexT n1 = 0; // Size of the first subrange _IndexT n2 = 0; // Size of the second subrange @@ -412,8 +413,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name template inline static _merge_split_point_t __lookup_sp(const std::size_t __global_idx, const nd_range_params& __nd_range_params, - const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp, - _BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr) + const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp, + _BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr) { std::size_t __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; From be6a4f06753aa15a49fa4db530ecac8dba8c6d3b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 23 Dec 2024 00:00:30 +0100 Subject: [PATCH 075/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - additional explanations in the __merge_sort_global_submitter::__lookup_sp function Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 32 +++++++++++++++---- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 0bec910e931..b7c88c8e892 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -396,6 +396,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); // We should add `1` to __linear_id here to avoid calculation of split-point for 0-diagonal + // Please see additional explanations in the __lookup_sp function below. 
const WorkDataArea __data_area(__n, __n_sorted, __linear_id + 1, __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags); @@ -412,11 +413,26 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name template inline static _merge_split_point_t - __lookup_sp(const std::size_t __global_idx, const nd_range_params& __nd_range_params, + __lookup_sp(const std::size_t __linear_id_in_steps_range, const nd_range_params& __nd_range_params, const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp, _BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr) { - std::size_t __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; + // | subrange 0 | subrange 1 | subrange 2 | subrange 3 | subrange 4 + // | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains the rest of data... < Data parts + // |----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/--- < Steps + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // | | | | | | | | | | | | | | + // bd00 bd01 bd02 bd10 bd11 bd12 bd20 bd21 | bd22 bd30 bd31 bd32 bd40 < Base diagonals + // ^ ^ ^ ^ ^ ^ ^ | ^ ^ ^ ^ ^ + // --- 0 1 2 3 4 5 6 | 7 8 9 10 11 < Indexes in the base diagonal's SP storage + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 20 21 | 23 24 25 26 27 28 29 30 31 32 33 34 35 36 < Linear IDs: __linear_id_in_steps_range + // ^ | | | + // | __sp_left | __sp_right + // | | + // | __linear_id_in_steps_range + // We doesn't save the first diagonal into base diagonal's SP storage !!! 
+ + std::size_t __diagonal_idx = __linear_id_in_steps_range / __nd_range_params.steps_between_two_base_diags; assert(__diagonal_idx < __nd_range_params.base_diag_count); @@ -424,7 +440,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const bool __is_base_diagonal = __global_idx % __nd_range_params.steps_between_two_base_diags == 0; + const bool __is_base_diagonal = __linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags == 0; return __sp_right.first + __sp_right.second > 0 ? (!__is_base_diagonal @@ -508,16 +524,18 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { DropViews __views(__dst, __data_area); - const auto __sp = __lookup_sp(__linear_id /* __global_idx */, __nd_range_params, - __data_area, __views, __comp, __base_diagonals_sp_global_ptr); + const auto __sp = + __lookup_sp(__linear_id /* __linear_id_in_steps_range */, __nd_range_params, + __data_area, __views, __comp, __base_diagonals_sp_global_ptr); __serial_merge_w(__nd_range_params, __data_area, __views, __rng, __sp, __comp); } else { DropViews __views(__rng, __data_area); - const auto __sp = __lookup_sp(__linear_id /* __global_idx */, __nd_range_params, - __data_area, __views, __comp, __base_diagonals_sp_global_ptr); + const auto __sp = + __lookup_sp(__linear_id /* __linear_id_in_steps_range */, __nd_range_params, + __data_area, __views, __comp, __base_diagonals_sp_global_ptr); __serial_merge_w(__nd_range_params, __data_area, __views, __dst, __sp, __comp); } } From f99afcaccf1ea33fb2608cdb703c885fdde4d41f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 23 Dec 2024 00:09:27 +0100 Subject: [PATCH 076/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix capture modes in submit() calls 
Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index b7c88c8e892..e05ed42aa2c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -214,7 +214,7 @@ struct __merge_sort_leaf_submitter<__internal::__optional_kernel_name<_LeafSortN sycl::event operator()(sycl::queue& __q, _Range& __rng, _Compare __comp, _LeafSorter& __leaf_sorter) const { - return __q.submit([&](sycl::handler& __cgh) { + return __q.submit([&__rng, __comp, &__leaf_sorter](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __storage_acc = __leaf_sorter.create_storage_accessor(__cgh); const std::uint32_t __wg_count = @@ -374,7 +374,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { const _IndexT __n = __rng.size(); - return __exec.queue().submit([&, __event_chain](sycl::handler& __cgh) { + return __exec.queue().submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, + __nd_range_params, &__base_diagonals_sp_global_storage, __n](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); @@ -459,7 +460,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { const _IndexT __n = __rng.size(); - return __exec.queue().submit([&, __event_chain](sycl::handler& __cgh) { + return __exec.queue().submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, + __nd_range_params, __n](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); @@ -500,7 +502,9 @@ struct 
__merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { const _IndexT __n = __rng.size(); - return __exec.queue().submit([&, __event_chain](sycl::handler& __cgh) { + return __exec.queue().submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, + __nd_range_params, &__base_diagonals_sp_global_storage, + __n](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); @@ -621,7 +625,7 @@ struct __merge_sort_copy_back_submitter<__internal::__optional_kernel_name<_Copy sycl::event operator()(sycl::queue& __q, _Range& __rng, _TempBuf& __temp_buf, sycl::event __event_chain) const { - return __q.submit([&](sycl::handler& __cgh) { + return __q.submit([&__rng, &__temp_buf, &__event_chain](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __temp_acc = __temp_buf.template get_access(__cgh); From ec98e171a25c767301dec38e08ff9f2f3664c944 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 23 Dec 2024 00:10:27 +0100 Subject: [PATCH 077/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix self-review comment: refactoring of __temp_sp_storages creation in the __merge_sort_global_submitter::operator() Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index e05ed42aa2c..ddd244764c6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -574,7 +574,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Create container for storages with split-points on base diagonal // - each iteration should 
have their own container - __container_of_temp_storages_t __temp_sp_storages(std::max(__n_iter, (std::int64_t)0)); + __container_of_temp_storages_t __temp_sp_storages(std::max(__n_iter, 0)); for (std::int64_t __i = 0; __i < __n_iter; ++__i) { From 8dd89e8b21a7d7ed02ee7f56c124726ddfa0cd50 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 23 Dec 2024 00:11:55 +0100 Subject: [PATCH 078/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - remove extra static_cast in the __leaf_sorter::sort() Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index ddd244764c6..ca522d995ec 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -177,7 +177,7 @@ struct __leaf_sorter // 3. 
Sort on work-group level bool __data_in_temp = - __group_sorter.sort(__item, __storage_acc, __comp, static_cast(0), __adjusted_process_size, + __group_sorter.sort(__item, __storage_acc, __comp, std::uint32_t{0}, __adjusted_process_size, /*sorted per sub-group*/ __data_per_workitem, __data_per_workitem, __workgroup_size); // barrier is not needed here because of the barrier inside the sort method From 93e4dbd8bcebf2e8e4383d9198cb33afe41acd4d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 23 Dec 2024 00:14:40 +0100 Subject: [PATCH 079/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix self-review comment: refactoring of __temp_sp_storages creation in the __merge_sort_global_submitter::operator() Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index ca522d995ec..cd7340dbb06 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -574,7 +574,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Create container for storages with split-points on base diagonal // - each iteration should have their own container - __container_of_temp_storages_t __temp_sp_storages(std::max(__n_iter, 0)); + __container_of_temp_storages_t __temp_sp_storages; for (std::int64_t __i = 0; __i < __n_iter; ++__i) { @@ -595,7 +595,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // - for current iteration auto __p_base_diagonals_sp_storage = new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params_this.base_diag_count); - __temp_sp_storages[__i].reset(__p_base_diagonals_sp_storage); + + // Save the raw pointer into 
a shared_ptr to return it in __future and extend the lifetime of the storage. + __temp_sp_storages.emplace_back(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_storage)); // Calculation of split-points on each base diagonal __event_chain = From ce3dc45c942e9689f1408bc81c986b0c5c35bd58 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 23 Dec 2024 09:14:50 +0100 Subject: [PATCH 080/144] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index cd7340dbb06..d1092aa88b0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -375,7 +375,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const _IndexT __n = __rng.size(); return __exec.queue().submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, - __nd_range_params, &__base_diagonals_sp_global_storage, __n](sycl::handler& __cgh) { + __nd_range_params, &__base_diagonals_sp_global_storage, + __n](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); @@ -441,7 +442,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __diagonal_idx > 0 ? 
__base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const bool __is_base_diagonal = __linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags == 0; + const bool __is_base_diagonal = + __linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags == 0; return __sp_right.first + __sp_right.second > 0 ? (!__is_base_diagonal @@ -597,7 +599,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params_this.base_diag_count); // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. - __temp_sp_storages.emplace_back(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_storage)); + __temp_sp_storages.emplace_back( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_storage)); // Calculation of split-points on each base diagonal __event_chain = From c58325a219b6d412bc3af994da8849c6395a0e28 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 23 Dec 2024 10:04:09 +0100 Subject: [PATCH 081/144] Tune amount of the base diagonals Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index d1092aa88b0..d465c4eaec5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -330,6 +330,17 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } }; + std::size_t + tune_amount_of_base_diagonals(std::size_t __n_sorted, std::size_t __amount_of_base_diagonals) const + { + // 
Multiply work per item by a power of 2 to reach the desired number of iterations. + // __dpl_bit_ceil rounds the ratio up to the next power of 2. + const std::size_t __k = oneapi::dpl::__internal::__dpl_bit_ceil( + (std::size_t)std::ceil(256 * 1024 * 1024 / __n_sorted)); + + return oneapi::dpl::__internal::__dpl_ceiling_div(__amount_of_base_diagonals, __k); + } + // Calculate nd-range params template nd_range_params @@ -340,7 +351,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const _IndexT __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); // TODO required to evaluate this value based on available SLM size for each work-group. - _IndexT __base_diag_count = 32 * 1'024; // 32 Kb + _IndexT __base_diag_count = tune_amount_of_base_diagonals(__rng_size, 32 * 1'024); // 32 Kb _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; From cb89d6eddc60ec461f685cc9f0b00a0a21890247 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 23 Dec 2024 10:13:29 +0100 Subject: [PATCH 082/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - avoid if statement inside Kernel's code Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 65 +++++++------------ 1 file changed, 22 insertions(+), 43 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index d465c4eaec5..551925cad45 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -43,15 +43,13 @@ struct __subgroup_bubble_sorter void sort(const _StorageAcc& __storage_acc, _Compare __comp, std::uint32_t __start, std::uint32_t __end) const { + using std::swap; + 
for (std::uint32_t i = __start; i < __end; ++i) { for (std::uint32_t j = __start + 1; j < __start + __end - i; ++j) { - if (__comp(__storage_acc[j], __storage_acc[j - 1])) - { - using std::swap; - swap(__storage_acc[j - 1], __storage_acc[j]); - } + __comp(__storage_acc[j], __storage_acc[j - 1]) ? swap(__storage_acc[j - 1], __storage_acc[j]) : void(); } } } @@ -485,23 +483,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const std::size_t __linear_id = __item_id.get_linear_id(); const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __nd_range_params.chunk); - if (__data_area.is_i_elem_local_inside_merge_matrix()) - { - if (__data_in_temp) - { - DropViews __views(__dst, __data_area); - - const auto __sp = __find_start_point_w(__data_area, __views, __comp); - __serial_merge_w(__nd_range_params, __data_area, __views, __rng, __sp, __comp); - } - else - { - DropViews __views(__rng, __data_area); - - const auto __sp = __find_start_point_w(__data_area, __views, __comp); - __serial_merge_w(__nd_range_params, __data_area, __views, __dst, __sp, __comp); - } - } + + __data_area.is_i_elem_local_inside_merge_matrix() + ? (__data_in_temp + ? 
__serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp), __comp) + : __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp), __comp)) + : void(); }); }); } @@ -535,27 +522,19 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __nd_range_params.chunk); - if (__data_area.is_i_elem_local_inside_merge_matrix()) - { - if (__data_in_temp) - { - DropViews __views(__dst, __data_area); - - const auto __sp = - __lookup_sp(__linear_id /* __linear_id_in_steps_range */, __nd_range_params, - __data_area, __views, __comp, __base_diagonals_sp_global_ptr); - __serial_merge_w(__nd_range_params, __data_area, __views, __rng, __sp, __comp); - } - else - { - DropViews __views(__rng, __data_area); - - const auto __sp = - __lookup_sp(__linear_id /* __linear_id_in_steps_range */, __nd_range_params, - __data_area, __views, __comp, __base_diagonals_sp_global_ptr); - __serial_merge_w(__nd_range_params, __data_area, __views, __dst, __sp, __comp); - } - } + __data_area.is_i_elem_local_inside_merge_matrix() + ? (__data_in_temp + ? 
__serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, + __lookup_sp(__linear_id, __nd_range_params, __data_area, + DropViews(__dst, __data_area), __comp, + __base_diagonals_sp_global_ptr), + __comp) + : __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, + __lookup_sp(__linear_id, __nd_range_params, __data_area, + DropViews(__rng, __data_area), __comp, + __base_diagonals_sp_global_ptr), + __comp)) + : void(); }); }); } From 14ec79318bdfebb97d9eca9ee11b750f985ca089 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 23 Dec 2024 10:29:52 +0100 Subject: [PATCH 083/144] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 551925cad45..d055a5ccea4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -333,8 +333,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { // Multiply work per item by a power of 2 to reach the desired number of iterations. // __dpl_bit_ceil rounds the ratio up to the next power of 2. - const std::size_t __k = oneapi::dpl::__internal::__dpl_bit_ceil( - (std::size_t)std::ceil(256 * 1024 * 1024 / __n_sorted)); + const std::size_t __k = + oneapi::dpl::__internal::__dpl_bit_ceil((std::size_t)std::ceil(256 * 1024 * 1024 / __n_sorted)); return oneapi::dpl::__internal::__dpl_ceiling_div(__amount_of_base_diagonals, __k); } @@ -486,8 +486,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __data_area.is_i_elem_local_inside_merge_matrix() ? (__data_in_temp - ? 
__serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp), __comp) - : __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp), __comp)) + ? __serial_merge_w( + __nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, + __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp), __comp) + : __serial_merge_w( + __nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, + __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp), __comp)) : void(); }); }); @@ -524,16 +528,16 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __data_area.is_i_elem_local_inside_merge_matrix() ? (__data_in_temp - ? __serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, - __lookup_sp(__linear_id, __nd_range_params, __data_area, - DropViews(__dst, __data_area), __comp, - __base_diagonals_sp_global_ptr), - __comp) - : __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, - __lookup_sp(__linear_id, __nd_range_params, __data_area, - DropViews(__rng, __data_area), __comp, - __base_diagonals_sp_global_ptr), - __comp)) + ? 
__serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, + __lookup_sp(__linear_id, __nd_range_params, __data_area, + DropViews(__dst, __data_area), __comp, + __base_diagonals_sp_global_ptr), + __comp) + : __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, + __lookup_sp(__linear_id, __nd_range_params, __data_area, + DropViews(__rng, __data_area), __comp, + __base_diagonals_sp_global_ptr), + __comp)) : void(); }); }); From 1e4ef41be5c627bb1b2326dac9fae34e23fd6ae4 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 13 Jan 2025 10:21:24 +0100 Subject: [PATCH 084/144] Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h Co-authored-by: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com> --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index d055a5ccea4..5b082a09a1f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -441,7 +441,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // | __sp_left | __sp_right // | | // | __linear_id_in_steps_range - // We doesn't save the first diagonal into base diagonal's SP storage !!! + // We don't save the first diagonal into base diagonal's SP storage !!! 
std::size_t __diagonal_idx = __linear_id_in_steps_range / __nd_range_params.steps_between_two_base_diags; From 703d74e9ee7a39cbef0f410758ae60c149005b73 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 13 Jan 2025 10:26:10 +0100 Subject: [PATCH 085/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: rename run_parallel_merge to run_parallel_merge_from_diagonals --- .../hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 5b082a09a1f..749120917bb 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -500,9 +500,10 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Process parallel merge with usage of split-points on base diagonals template sycl::event - run_parallel_merge(const sycl::event& __event_chain, const _IndexT __n_sorted, const bool __data_in_temp, - _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, - const nd_range_params& __nd_range_params, _Storage& __base_diagonals_sp_global_storage) const + run_parallel_merge_from_diagonals(const sycl::event& __event_chain, const _IndexT __n_sorted, + const bool __data_in_temp, _ExecutionPolicy&& __exec, _Range&& __rng, + _TempBuf& __temp_buf, _Compare __comp, const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const { const _IndexT __n = __rng.size(); @@ -602,8 +603,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __comp, __nd_range_params_this, *__p_base_diagonals_sp_storage); // Process parallel merge with usage of split-points on base diagonals - __event_chain = 
run_parallel_merge(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, - __comp, __nd_range_params_this, *__p_base_diagonals_sp_storage); + __event_chain = run_parallel_merge_from_diagonals(__event_chain, __n_sorted, __data_in_temp, __exec, + __rng, __temp_buf, __comp, __nd_range_params_this, + *__p_base_diagonals_sp_storage); } __n_sorted *= 2; From 6836fc08d4ff27ca0753d190d9f85648bddaf9ea Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 13 Jan 2025 15:44:51 +0100 Subject: [PATCH 086/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: using common base diagonal's container for all iterations with the enough big size --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 50 ++++++++++++------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 749120917bb..5c582b44ce6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -355,6 +355,28 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; } + template + std::size_t + get_max_base_diags_count(_ExecutionPolicy&& __exec, const std::int64_t __n_iter, const _IndexT __n, + _IndexT __n_sorted) const + { + std::size_t __max_base_diags_count = 0; + + for (std::int64_t __i = 0; __i < __n_iter; ++__i) + { + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); + + nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted)); + + __max_base_diags_count = + std::max(__max_base_diags_count, __nd_range_params_this.base_diag_count * __portions); + + __n_sorted *= 2; + } + + return __max_base_diags_count; + } + 
template inline static _merge_split_point_t __find_start_point_w(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) @@ -545,10 +567,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } public: - using __container_of_temp_storages_t = std::vector>; template - std::tuple + std::tuple> operator()(_ExecutionPolicy&& __exec, _Range& __rng, _Compare __comp, _LeafSizeT __leaf_size, _TempBuf& __temp_buf, sycl::event __event_chain) const { @@ -569,9 +590,13 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // std::log2 may be prone to rounding errors on some architectures const std::int64_t __n_iter = sycl::ctz(__n_power2) - sycl::ctz(__leaf_size); - // Create container for storages with split-points on base diagonal - // - each iteration should have their own container - __container_of_temp_storages_t __temp_sp_storages; + // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + const std::size_t __max_base_diags_count = get_max_base_diags_count(__exec, __n_iter, __n, __n_sorted); + auto __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); + + // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. 
+ std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); for (std::int64_t __i = 0; __i < __n_iter; ++__i) { @@ -588,31 +613,22 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __nd_range_params_this.steps *= __portions; __nd_range_params_this.base_diag_count *= __portions; - // Create storage for save split-points on each base diagonal - // - for current iteration - auto __p_base_diagonals_sp_storage = - new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params_this.base_diag_count); - - // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. - __temp_sp_storages.emplace_back( - static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_storage)); - // Calculation of split-points on each base diagonal __event_chain = eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, - __comp, __nd_range_params_this, *__p_base_diagonals_sp_storage); + __comp, __nd_range_params_this, *__p_base_diagonals_sp_global_storage); // Process parallel merge with usage of split-points on base diagonals __event_chain = run_parallel_merge_from_diagonals(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, __comp, __nd_range_params_this, - *__p_base_diagonals_sp_storage); + *__p_base_diagonals_sp_global_storage); } __n_sorted *= 2; __data_in_temp = !__data_in_temp; } - return {std::move(__event_chain), __data_in_temp, std::move(__temp_sp_storages)}; + return {std::move(__event_chain), __data_in_temp, std::move(__p_result_and_scratch_storage_base)}; } }; From 8ecefdf39cd37cd0c00b24910d558d35b00069b7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 13 Jan 2025 16:01:14 +0100 Subject: [PATCH 087/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: using 
common base diagonal's container for all iterations with the enough big size --- .../hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 5c582b44ce6..91e664b88ed 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -362,16 +362,15 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { std::size_t __max_base_diags_count = 0; - for (std::int64_t __i = 0; __i < __n_iter; ++__i) + if (__n_iter > 0) { + __n_sorted = __n_sorted << (__n_iter - 1); + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted)); - __max_base_diags_count = - std::max(__max_base_diags_count, __nd_range_params_this.base_diag_count * __portions); - - __n_sorted *= 2; + __max_base_diags_count = __nd_range_params_this.base_diag_count * __portions; } return __max_base_diags_count; @@ -613,6 +612,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __nd_range_params_this.steps *= __portions; __nd_range_params_this.base_diag_count *= __portions; + assert(__nd_range_params_this.base_diag_count <= __max_base_diags_count); + // Calculation of split-points on each base diagonal __event_chain = eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, From 93e7b4746ff564822c6f5e24c9454a8a648beb22 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 13 Jan 2025 16:14:17 +0100 Subject: [PATCH 088/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: add __portions parameter into 
eval_nd_range_params --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 91e664b88ed..074c749f0c1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -342,7 +342,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Calculate nd-range params template nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size) const + eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size, const _IndexT __portions) const { const bool __is_cpu = __exec.queue().get_device().is_cpu(); const _IndexT __chunk = __is_cpu ? 32 : 4; @@ -352,7 +352,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name _IndexT __base_diag_count = tune_amount_of_base_diagonals(__rng_size, 32 * 1'024); // 32 Kb _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; + return {__base_diag_count * __portions, __steps_between_two_base_diags, __chunk, __steps * __portions}; } template @@ -367,10 +367,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __n_sorted = __n_sorted << (__n_iter - 1); const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); - - nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted)); - - __max_base_diags_count = __nd_range_params_this.base_diag_count * __portions; + __max_base_diags_count = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions).base_diag_count; } return 
__max_base_diags_count; @@ -580,7 +577,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name using __value_type = oneapi::dpl::__internal::__value_t<_Range>; // Calculate nd-range params - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n); + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n, 1); using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _merge_split_point_t>; @@ -608,9 +605,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name else { const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); - nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted)); - __nd_range_params_this.steps *= __portions; - __nd_range_params_this.base_diag_count *= __portions; + const nd_range_params __nd_range_params_this = + eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions); assert(__nd_range_params_this.base_diag_count <= __max_base_diags_count); From 2c14225d7f8f7b88092ef88b6a72f0747d891326 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 13 Jan 2025 16:28:04 +0100 Subject: [PATCH 089/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix self-review comment: using own __no_op impl --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 074c749f0c1..11076f6c91e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -480,6 +480,11 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name : __find_start_point_w(__data_area, __views, __comp); } + 
struct __no_op + { + void operator()() {} + }; + // Process parallel merge template sycl::event @@ -510,7 +515,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name : __serial_merge_w( __nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp), __comp)) - : void(); + : __no_op{}(); }); }); } @@ -557,7 +562,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name DropViews(__rng, __data_area), __comp, __base_diagonals_sp_global_ptr), __comp)) - : void(); + : __no_op{}(); }); }); } From d38ccaa66f762f08e9ccef6883af4a206f7a8407 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 13 Jan 2025 16:30:46 +0100 Subject: [PATCH 090/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - remove extra changes in __subgroup_bubble_sorter::sort() --- .../hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 11076f6c91e..3ecea4ecf12 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -43,13 +43,17 @@ struct __subgroup_bubble_sorter void sort(const _StorageAcc& __storage_acc, _Compare __comp, std::uint32_t __start, std::uint32_t __end) const { - using std::swap; - for (std::uint32_t i = __start; i < __end; ++i) { for (std::uint32_t j = __start + 1; j < __start + __end - i; ++j) { - __comp(__storage_acc[j], __storage_acc[j - 1]) ? 
swap(__storage_acc[j - 1], __storage_acc[j]) : void(); + auto& __first_item = __storage_acc[j - 1]; + auto& __second_item = __storage_acc[j]; + if (__comp(__second_item, __first_item)) + { + using std::swap; + swap(__first_item, __second_item); + } } } } From 6997891ee7cbbebdab608c1ae1ad2adf31548505 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 13 Jan 2025 16:31:53 +0100 Subject: [PATCH 091/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: using common base diagonal's container for all iterations with the enough big size --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 3ecea4ecf12..41fcb4d2267 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -22,7 +22,6 @@ #include // std::uint32_t, ... #include // std::min, std::max_element #include // std::decay_t, std::integral_constant -#include #include "sycl_defs.h" // __dpl_sycl::__local_accessor, __dpl_sycl::__group_barrier #include "sycl_traits.h" // SYCL traits specialization for some oneDPL types. 
From 15b6d139f1b0709da8b365af3ec3f5b0efdfc17e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 13 Jan 2025 16:44:44 +0100 Subject: [PATCH 092/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: remove unrequired ternary operators --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 41fcb4d2267..6e19ef0ae69 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -483,11 +483,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name : __find_start_point_w(__data_area, __views, __comp); } - struct __no_op - { - void operator()() {} - }; - // Process parallel merge template sycl::event @@ -510,15 +505,16 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __nd_range_params.chunk); - __data_area.is_i_elem_local_inside_merge_matrix() - ? (__data_in_temp - ? __serial_merge_w( - __nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, - __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp), __comp) - : __serial_merge_w( - __nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, - __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp), __comp)) - : __no_op{}(); + if (__data_area.is_i_elem_local_inside_merge_matrix()) + { + __data_in_temp + ? 
__serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, + __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp), + __comp) + : __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, + __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp), + __comp); + } }); }); } @@ -553,19 +549,20 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __nd_range_params.chunk); - __data_area.is_i_elem_local_inside_merge_matrix() - ? (__data_in_temp - ? __serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, - __lookup_sp(__linear_id, __nd_range_params, __data_area, - DropViews(__dst, __data_area), __comp, - __base_diagonals_sp_global_ptr), - __comp) - : __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, - __lookup_sp(__linear_id, __nd_range_params, __data_area, - DropViews(__rng, __data_area), __comp, - __base_diagonals_sp_global_ptr), - __comp)) - : __no_op{}(); + if (__data_area.is_i_elem_local_inside_merge_matrix()) + { + __data_in_temp + ? 
__serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, + __lookup_sp(__linear_id, __nd_range_params, __data_area, + DropViews(__dst, __data_area), __comp, + __base_diagonals_sp_global_ptr), + __comp) + : __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, + __lookup_sp(__linear_id, __nd_range_params, __data_area, + DropViews(__rng, __data_area), __comp, + __base_diagonals_sp_global_ptr), + __comp); + } }); }); } From 9c3bfdc45ac710d17878881c025d3c2c24aed05c Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 13 Jan 2025 18:09:18 +0100 Subject: [PATCH 093/144] Apply GitHUB clang format --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 6e19ef0ae69..69aaf08ca31 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -370,7 +370,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __n_sorted = __n_sorted << (__n_iter - 1); const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); - __max_base_diags_count = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions).base_diag_count; + __max_base_diags_count = + eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions).base_diag_count; } return __max_base_diags_count; @@ -593,7 +594,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) const std::size_t __max_base_diags_count = get_max_base_diags_count(__exec, __n_iter, __n, __n_sorted); - auto 
__p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); + auto __p_base_diagonals_sp_global_storage = + new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( From 32353dbb89f3d4ee3403f3a218d4a45aac015db8 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 14 Jan 2025 09:43:59 +0100 Subject: [PATCH 094/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comments: remove extra ternary operators --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 61 +++++++++++-------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 69aaf08ca31..d960971b5b6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -476,12 +476,15 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const bool __is_base_diagonal = __linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags == 0; - return __sp_right.first + __sp_right.second > 0 - ? (!__is_base_diagonal - ? 
__find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, - __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp) - : __sp_left) - : __find_start_point_w(__data_area, __views, __comp); + if (__sp_right.first + __sp_right.second > 0) + { + if (!__is_base_diagonal) + return __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, + __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); + return __sp_left; + } + + return __find_start_point_w(__data_area, __views, __comp); } // Process parallel merge @@ -508,13 +511,18 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (__data_area.is_i_elem_local_inside_merge_matrix()) { - __data_in_temp - ? __serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, - __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp), - __comp) - : __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, - __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp), - __comp); + if (__data_in_temp) + { + __serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, + __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp), + __comp); + } + else + { + __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, + __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp), + __comp); + } } }); }); @@ -552,17 +560,22 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (__data_area.is_i_elem_local_inside_merge_matrix()) { - __data_in_temp - ? 
__serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, - __lookup_sp(__linear_id, __nd_range_params, __data_area, - DropViews(__dst, __data_area), __comp, - __base_diagonals_sp_global_ptr), - __comp) - : __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, - __lookup_sp(__linear_id, __nd_range_params, __data_area, - DropViews(__rng, __data_area), __comp, - __base_diagonals_sp_global_ptr), - __comp); + if (__data_in_temp) + { + __serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, + __lookup_sp(__linear_id, __nd_range_params, __data_area, + DropViews(__dst, __data_area), __comp, + __base_diagonals_sp_global_ptr), + __comp); + } + else + { + __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, + __lookup_sp(__linear_id, __nd_range_params, __data_area, + DropViews(__rng, __data_area), __comp, + __base_diagonals_sp_global_ptr), + __comp); + } } }); }); From f007f3f01b295956c3571fbe7e717f50c4361811 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 14 Jan 2025 09:55:27 +0100 Subject: [PATCH 095/144] Apply GitHUB clang format --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index d960971b5b6..e3c05caa5b6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -582,7 +582,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } public: - template std::tuple> operator()(_ExecutionPolicy&& __exec, _Range& __rng, _Compare __comp, _LeafSizeT __leaf_size, _TempBuf& __temp_buf, From 7e4df7fb4bef63022ec5a980081a51e092ea05ce Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 14 Jan 2025 
14:29:08 +0100 Subject: [PATCH 096/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - pack __amount_of_base_diagonals into tune_amount_of_base_diagonals --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index e3c05caa5b6..80c41956a7a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -332,8 +332,11 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name }; std::size_t - tune_amount_of_base_diagonals(std::size_t __n_sorted, std::size_t __amount_of_base_diagonals) const + tune_amount_of_base_diagonals(std::size_t __n_sorted) const { + // TODO required to evaluate this value based on available SLM size for each work-group. + constexpr std::size_t __amount_of_base_diagonals = 32 * 1'024; // 32 Kb + // Multiply work per item by a power of 2 to reach the desired number of iterations. // __dpl_bit_ceil rounds the ratio up to the next power of 2. const std::size_t __k = @@ -351,8 +354,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const _IndexT __chunk = __is_cpu ? 32 : 4; const _IndexT __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); - // TODO required to evaluate this value based on available SLM size for each work-group. 
- _IndexT __base_diag_count = tune_amount_of_base_diagonals(__rng_size, 32 * 1'024); // 32 Kb + _IndexT __base_diag_count = tune_amount_of_base_diagonals(__rng_size); _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count * __portions, __steps_between_two_base_diags, __chunk, __steps * __portions}; From abfe36b858f95dfe40e61ab5cf06c65020b2d1cf Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 14 Jan 2025 14:32:29 +0100 Subject: [PATCH 097/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - pack __amount_of_base_diagonals into tune_amount_of_base_diagonals --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 80c41956a7a..6fabe27375e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -335,14 +335,14 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name tune_amount_of_base_diagonals(std::size_t __n_sorted) const { // TODO required to evaluate this value based on available SLM size for each work-group. - constexpr std::size_t __amount_of_base_diagonals = 32 * 1'024; // 32 Kb + const _IdType __base_diag_count = 32 * 1'024; // Multiply work per item by a power of 2 to reach the desired number of iterations. // __dpl_bit_ceil rounds the ratio up to the next power of 2. 
const std::size_t __k = oneapi::dpl::__internal::__dpl_bit_ceil((std::size_t)std::ceil(256 * 1024 * 1024 / __n_sorted)); - return oneapi::dpl::__internal::__dpl_ceiling_div(__amount_of_base_diagonals, __k); + return oneapi::dpl::__internal::__dpl_ceiling_div(__base_diag_count, __k); } // Calculate nd-range params From 1b07725338187038735bd1aaa40c0e5219ed6a36 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 14 Jan 2025 14:50:17 +0100 Subject: [PATCH 098/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - pack __amount_of_base_diagonals into tune_amount_of_base_diagonals (fix compile error) --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 6fabe27375e..b27291c2fb0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -335,7 +335,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name tune_amount_of_base_diagonals(std::size_t __n_sorted) const { // TODO required to evaluate this value based on available SLM size for each work-group. - const _IdType __base_diag_count = 32 * 1'024; + const std::size_t __base_diag_count = 32 * 1'024; // Multiply work per item by a power of 2 to reach the desired number of iterations. // __dpl_bit_ceil rounds the ratio up to the next power of 2. 
From 5026506d34f486952844d7fa3802936aa26b8277 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 14 Jan 2025 14:53:25 +0100 Subject: [PATCH 099/144] Apply GitHUB clang format --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index b27291c2fb0..f481bff47a0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -336,7 +336,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { // TODO required to evaluate this value based on available SLM size for each work-group. const std::size_t __base_diag_count = 32 * 1'024; - + // Multiply work per item by a power of 2 to reach the desired number of iterations. // __dpl_bit_ceil rounds the ratio up to the next power of 2. 
const std::size_t __k = From 849a45d69c3a1ddb4f448180444d80d020f9c811 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 15 Jan 2025 10:21:50 +0100 Subject: [PATCH 100/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: remove assertion from the Kernel code --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index f481bff47a0..2c5ace0aa78 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -469,8 +469,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name std::size_t __diagonal_idx = __linear_id_in_steps_range / __nd_range_params.steps_between_two_base_diags; - assert(__diagonal_idx < __nd_range_params.base_diag_count); - const _merge_split_point_t __sp_left = __diagonal_idx > 0 ? 
__base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; From e24628f7659015ed2040002bc287e575e49d321a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 15 Jan 2025 13:02:36 +0100 Subject: [PATCH 101/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: using __dpl_ceiling_div instead of std::ceil --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 2c5ace0aa78..aefdb0368ec 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -339,8 +339,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Multiply work per item by a power of 2 to reach the desired number of iterations. // __dpl_bit_ceil rounds the ratio up to the next power of 2. 
- const std::size_t __k = - oneapi::dpl::__internal::__dpl_bit_ceil((std::size_t)std::ceil(256 * 1024 * 1024 / __n_sorted)); + const std::size_t __k = oneapi::dpl::__internal::__dpl_bit_ceil( + oneapi::dpl::__internal::__dpl_ceiling_div(256 * 1024 * 1024, __n_sorted)); return oneapi::dpl::__internal::__dpl_ceiling_div(__base_diag_count, __k); } From 535ba7ab53e5095b902913c82bd2c7c119e64c0a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 16 Jan 2025 12:52:13 +0100 Subject: [PATCH 102/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - creates __base_diagonals_sp_storage_t before the first real usage --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index aefdb0368ec..a1bc17631d1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -604,14 +604,11 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // std::log2 may be prone to rounding errors on some architectures const std::int64_t __n_iter = sycl::ctz(__n_power2) - sycl::ctz(__leaf_size); - // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - const std::size_t __max_base_diags_count = get_max_base_diags_count(__exec, __n_iter, __n, __n_sorted); - auto __p_base_diagonals_sp_global_storage = - new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); + // Storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + __base_diagonals_sp_storage_t* __p_base_diagonals_sp_global_storage = nullptr; - // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime 
of the storage. - std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( - static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + // shared_ptr instance to return it in __future and extend the lifetime of the storage. + std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base; for (std::int64_t __i = 0; __i < __n_iter; ++__i) { @@ -623,6 +620,19 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } else { + if (nullptr == __p_base_diagonals_sp_global_storage) + { + // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + const std::size_t __max_base_diags_count = + get_max_base_diags_count(__exec, __n_iter, __n, __n_sorted); + __p_base_diagonals_sp_global_storage = + new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); + + // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. 
+ __p_result_and_scratch_storage_base.reset( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + } + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); const nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions); From ff14ffe598bdf54860efe158054294cdeddb0017 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 16 Jan 2025 12:53:34 +0100 Subject: [PATCH 103/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - remove extra local variable __max_base_diags_count --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index a1bc17631d1..3a9340a8c1c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -623,10 +623,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (nullptr == __p_base_diagonals_sp_global_storage) { // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - const std::size_t __max_base_diags_count = - get_max_base_diags_count(__exec, __n_iter, __n, __n_sorted); - __p_base_diagonals_sp_global_storage = - new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); + __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t( + __exec, 0, get_max_base_diags_count(__exec, __n_iter, __n, __n_sorted)); // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. 
__p_result_and_scratch_storage_base.reset( @@ -637,8 +635,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions); - assert(__nd_range_params_this.base_diag_count <= __max_base_diags_count); - // Calculation of split-points on each base diagonal __event_chain = eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, From fae5314906b5e47457e3fb42eb2d89209123ce48 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 16 Jan 2025 13:05:39 +0100 Subject: [PATCH 104/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - refactoring of __lookup_sp --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 3a9340a8c1c..f0ed156ec94 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -478,10 +478,10 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (__sp_right.first + __sp_right.second > 0) { - if (!__is_base_diagonal) - return __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, - __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); - return __sp_left; + return __is_base_diagonal + ? 
__sp_left + : __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, + __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); } return __find_start_point_w(__data_area, __views, __comp); From bdb68d3dd6f59b99689ed3ec0b1b8f0f9ed53328 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 17 Jan 2025 10:06:40 +0100 Subject: [PATCH 105/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - Simple using 32 Kb of base diagonals for every merging part + calc max amount of base diagonals --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 65 ++++++++----------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index f0ed156ec94..3c48c5d4210 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -331,20 +331,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } }; - std::size_t - tune_amount_of_base_diagonals(std::size_t __n_sorted) const - { - // TODO required to evaluate this value based on available SLM size for each work-group. - const std::size_t __base_diag_count = 32 * 1'024; - - // Multiply work per item by a power of 2 to reach the desired number of iterations. - // __dpl_bit_ceil rounds the ratio up to the next power of 2. - const std::size_t __k = oneapi::dpl::__internal::__dpl_bit_ceil( - oneapi::dpl::__internal::__dpl_ceiling_div(256 * 1024 * 1024, __n_sorted)); - - return oneapi::dpl::__internal::__dpl_ceiling_div(__base_diag_count, __k); - } - // Calculate nd-range params template nd_range_params @@ -354,31 +340,13 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const _IndexT __chunk = __is_cpu ? 
32 : 4; const _IndexT __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); - _IndexT __base_diag_count = tune_amount_of_base_diagonals(__rng_size); + // TODO required to evaluate this value based on available SLM size for each work-group. + _IndexT __base_diag_count = 32 * 1'024; _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count * __portions, __steps_between_two_base_diags, __chunk, __steps * __portions}; } - template - std::size_t - get_max_base_diags_count(_ExecutionPolicy&& __exec, const std::int64_t __n_iter, const _IndexT __n, - _IndexT __n_sorted) const - { - std::size_t __max_base_diags_count = 0; - - if (__n_iter > 0) - { - __n_sorted = __n_sorted << (__n_iter - 1); - - const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); - __max_base_diags_count = - eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions).base_diag_count; - } - - return __max_base_diags_count; - } - template inline static _merge_split_point_t __find_start_point_w(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) @@ -610,6 +578,27 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // shared_ptr instance to return it in __future and extend the lifetime of the storage. 
std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base; + // To calculate nd_range_params for specified iteration + auto fnc_eval_nd_range = [this](_ExecutionPolicy& __exec, const _IndexT __n, _IndexT __n_sorted) + { + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); + return eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions); + }; + + // To calculate maximal amount of base diagonals from specified iteration till the end iteration + auto fnc_max_base_diags_count = [fnc_eval_nd_range](_ExecutionPolicy& __exec, const _IndexT __n, + _IndexT __n_sorted, std::int64_t __i, + std::int64_t __n_iter) { + std::size_t __max_base_diags_count = 0; + for (; __i < __n_iter; ++__i) + { + __max_base_diags_count = + std::max(__max_base_diags_count, fnc_eval_nd_range(__exec, __n, __n_sorted).base_diag_count); + __n_sorted *= 2; + } + return __max_base_diags_count; + }; + for (std::int64_t __i = 0; __i < __n_iter; ++__i) { if (2 * __n_sorted < __get_starting_size_limit_for_large_submitter<__value_type>()) @@ -623,17 +612,15 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (nullptr == __p_base_diagonals_sp_global_storage) { // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t( - __exec, 0, get_max_base_diags_count(__exec, __n_iter, __n, __n_sorted)); + const std::size_t __max_base_diags_count = fnc_max_base_diags_count(__exec, __n, __n_sorted, __i, __n_iter); + __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. 
__p_result_and_scratch_storage_base.reset( static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); } - const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); - const nd_range_params __nd_range_params_this = - eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions); + const nd_range_params __nd_range_params_this = fnc_eval_nd_range(__exec, __n, __n_sorted); // Calculation of split-points on each base diagonal __event_chain = From ad1d290e9d257359532c17d35ba31899de9f1b61 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 17 Jan 2025 10:07:23 +0100 Subject: [PATCH 106/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - trace work with base diagonals --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 36 ++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 3c48c5d4210..b1f855e3c92 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -29,6 +29,21 @@ #include "../../utils_ranges.h" // __difference_t #include "parallel_backend_sycl_merge.h" // __find_start_point, __serial_merge +// TODO remove before merge into main branch +#define TRACE_BASE_DIAGS 0 + +#if TRACE_BASE_DIAGS +# ifdef __SYCL_DEVICE_ONLY__ +# define __SYCL_CONSTANT_AS __attribute__((opencl_constant)) +# else +# define __SYCL_CONSTANT_AS +# endif + +const __SYCL_CONSTANT_AS char fmt_create_diags_storage[] = "Create base diagonals storage: __n_sorted = %8d, storage size = [%8d]\n"; +const __SYCL_CONSTANT_AS char fmt_eval_base_diag[] = "Evaluate base diag: __n_sorted = %8d, __data_in_temp = %s, sp[%8d] = (%8d, %8d)\n"; +const __SYCL_CONSTANT_AS char fmt_lookup_sp[] = "Lookup sp: __linear_id = %8d, sp_left[%8d] = 
(%8d, %8d), sp_right[%8d] = (%8d, %8d)\n"; +#endif + namespace oneapi { namespace dpl @@ -404,12 +419,18 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags); - __base_diagonals_sp_global_ptr[__linear_id] = + const auto __sp = __data_area.is_i_elem_local_inside_merge_matrix() ? (__data_in_temp ? __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp) : __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp)) : _merge_split_point_t{__data_area.n1, __data_area.n2}; + +#if TRACE_BASE_DIAGS + sycl::ext::oneapi::experimental::printf(fmt_eval_base_diag, __n_sorted, __data_in_temp ? "T" : "F", __linear_id, __sp.first, __sp.second); +#endif + + __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); }); } @@ -435,12 +456,15 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // | __linear_id_in_steps_range // We don't save the first diagonal into base diagonal's SP storage !!! - std::size_t __diagonal_idx = __linear_id_in_steps_range / __nd_range_params.steps_between_two_base_diags; + const std::size_t __diagonal_idx = __linear_id_in_steps_range / __nd_range_params.steps_between_two_base_diags; - const _merge_split_point_t __sp_left = - __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; + const _merge_split_point_t __sp_left = __diagonal_idx > 0 ? 
__base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; +#if TRACE_BASE_DIAGS + sycl::ext::oneapi::experimental::printf(fmt_lookup_sp, __linear_id_in_steps_range, __diagonal_idx - 1, __sp_left.first, __sp_left.second, __diagonal_idx, __sp_right.first, __sp_right.second); +#endif + const bool __is_base_diagonal = __linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags == 0; @@ -615,6 +639,10 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const std::size_t __max_base_diags_count = fnc_max_base_diags_count(__exec, __n, __n_sorted, __i, __n_iter); __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); +#if TRACE_BASE_DIAGS + sycl::ext::oneapi::experimental::printf(fmt_create_diags_storage, __n_sorted, __max_base_diags_count); +#endif + // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. 
__p_result_and_scratch_storage_base.reset( static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); From 991f32ab6348e1d0d85bfb36530c09e988ebde36 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 17 Jan 2025 11:52:57 +0100 Subject: [PATCH 107/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - trace Kernel's execution time info --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index b1f855e3c92..434275f7445 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -44,6 +44,13 @@ const __SYCL_CONSTANT_AS char fmt_eval_base_diag[] = "Evaluate base diag: __n_so const __SYCL_CONSTANT_AS char fmt_lookup_sp[] = "Lookup sp: __linear_id = %8d, sp_left[%8d] = (%8d, %8d), sp_right[%8d] = (%8d, %8d)\n"; #endif +#define MERGE_SORT_DISPLAY_STATISTIC 1 +#define MERGE_SORT_EXCLUDE_NEW_IMPL 0 + +#if MERGE_SORT_DISPLAY_STATISTIC +#include +#endif + namespace oneapi { namespace dpl @@ -625,14 +632,26 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name for (std::int64_t __i = 0; __i < __n_iter; ++__i) { +#if MERGE_SORT_DISPLAY_STATISTIC + const auto __start_time = std::chrono::high_resolution_clock::now(); + bool __new_impl_ran = false; +#endif + +#if !MERGE_SORT_EXCLUDE_NEW_IMPL if (2 * __n_sorted < __get_starting_size_limit_for_large_submitter<__value_type>()) +#endif { // Process parallel merge __event_chain = run_parallel_merge(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, __comp, __nd_range_params); } +#if !MERGE_SORT_EXCLUDE_NEW_IMPL else { +#if MERGE_SORT_DISPLAY_STATISTIC + __new_impl_ran = true; +#endif + if (nullptr == 
__p_base_diagonals_sp_global_storage) { // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) @@ -660,6 +679,16 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __rng, __temp_buf, __comp, __nd_range_params_this, *__p_base_diagonals_sp_global_storage); } +#endif + +#if MERGE_SORT_DISPLAY_STATISTIC + __event_chain.wait_and_throw(); + const auto __stop_time = std::chrono::high_resolution_clock::now(); + const auto __elapsed = __stop_time - __start_time; + if (__new_impl_ran) + std::cout << "(N) "; + std::cout << "iteration: " << __i << " __n_sorted = " << __n_sorted << ", time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; +#endif __n_sorted *= 2; __data_in_temp = !__data_in_temp; @@ -732,9 +761,20 @@ __merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSo sycl::queue __q = __exec.queue(); +#if MERGE_SORT_DISPLAY_STATISTIC + const auto __start_time = std::chrono::high_resolution_clock::now(); +#endif + // 1. Perform sorting of the leaves of the merge sort tree sycl::event __event_leaf_sort = __merge_sort_leaf_submitter<_LeafSortKernel>()(__q, __rng, __comp, __leaf_sorter); +#if MERGE_SORT_DISPLAY_STATISTIC + __event_leaf_sort.wait_and_throw(); + const auto __stop_time = std::chrono::high_resolution_clock::now(); + const auto __elapsed = __stop_time - __start_time; + std::cout << "leaf sorter: time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; +#endif + // 2. 
Merge sorting oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _Tp> __temp(__exec, __rng.size()); auto __temp_buf = __temp.get_buffer(); From 9de4d3cca3ee5bd12186613cb1542a8cdce6f744 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 09:35:09 +0100 Subject: [PATCH 108/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - declare local variables outside of the loop in __serial_merge --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index b1a775dd8f4..9ebd6c4b42b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -148,10 +148,13 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I _Index __rng1_idx = __start1; _Index __rng2_idx = __start2; + bool __rng1_idx_less_n1 = false; + bool __rng2_idx_less_n2 = false; + for (_Index __rng3_idx = __start3; __rng3_idx < __rng3_idx_end; ++__rng3_idx) { - const bool __rng1_idx_less_n1 = __rng1_idx < __rng1_idx_end; - const bool __rng2_idx_less_n2 = __rng2_idx < __rng2_idx_end; + __rng1_idx_less_n1 = __rng1_idx < __rng1_idx_end; + __rng2_idx_less_n2 = __rng2_idx < __rng2_idx_end; // One of __rng1_idx_less_n1 and __rng2_idx_less_n2 should be true here // because 1) we should fill output data with elements from one of the input ranges From bba58f28c920da42ae29f4b6a0e610ae9a997d51 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 09:49:02 +0100 Subject: [PATCH 109/144] Revert "include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - Simple using 32 Kb of base diagonals for every merging part + calc max amount of base diagonals" This reverts commit bdb68d3dd6f59b99689ed3ec0b1b8f0f9ed53328. 
--- .../dpcpp/parallel_backend_sycl_merge_sort.h | 65 +++++++++++-------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 434275f7445..ef03a35a20d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -353,6 +353,20 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } }; + std::size_t + tune_amount_of_base_diagonals(std::size_t __n_sorted) const + { + // TODO required to evaluate this value based on available SLM size for each work-group. + const std::size_t __base_diag_count = 32 * 1'024; + + // Multiply work per item by a power of 2 to reach the desired number of iterations. + // __dpl_bit_ceil rounds the ratio up to the next power of 2. + const std::size_t __k = oneapi::dpl::__internal::__dpl_bit_ceil( + oneapi::dpl::__internal::__dpl_ceiling_div(256 * 1024 * 1024, __n_sorted)); + + return oneapi::dpl::__internal::__dpl_ceiling_div(__base_diag_count, __k); + } + // Calculate nd-range params template nd_range_params @@ -362,13 +376,31 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const _IndexT __chunk = __is_cpu ? 32 : 4; const _IndexT __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); - // TODO required to evaluate this value based on available SLM size for each work-group. 
- _IndexT __base_diag_count = 32 * 1'024; + _IndexT __base_diag_count = tune_amount_of_base_diagonals(__rng_size); _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count * __portions, __steps_between_two_base_diags, __chunk, __steps * __portions}; } + template + std::size_t + get_max_base_diags_count(_ExecutionPolicy&& __exec, const std::int64_t __n_iter, const _IndexT __n, + _IndexT __n_sorted) const + { + std::size_t __max_base_diags_count = 0; + + if (__n_iter > 0) + { + __n_sorted = __n_sorted << (__n_iter - 1); + + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); + __max_base_diags_count = + eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions).base_diag_count; + } + + return __max_base_diags_count; + } + template inline static _merge_split_point_t __find_start_point_w(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) @@ -609,27 +641,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // shared_ptr instance to return it in __future and extend the lifetime of the storage. 
std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base; - // To calculate nd_range_params for specified iteration - auto fnc_eval_nd_range = [this](_ExecutionPolicy& __exec, const _IndexT __n, _IndexT __n_sorted) - { - const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); - return eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions); - }; - - // To calculate maximal amount of base diagonals from specified iteration till the end iteration - auto fnc_max_base_diags_count = [fnc_eval_nd_range](_ExecutionPolicy& __exec, const _IndexT __n, - _IndexT __n_sorted, std::int64_t __i, - std::int64_t __n_iter) { - std::size_t __max_base_diags_count = 0; - for (; __i < __n_iter; ++__i) - { - __max_base_diags_count = - std::max(__max_base_diags_count, fnc_eval_nd_range(__exec, __n, __n_sorted).base_diag_count); - __n_sorted *= 2; - } - return __max_base_diags_count; - }; - for (std::int64_t __i = 0; __i < __n_iter; ++__i) { #if MERGE_SORT_DISPLAY_STATISTIC @@ -655,8 +666,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (nullptr == __p_base_diagonals_sp_global_storage) { // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - const std::size_t __max_base_diags_count = fnc_max_base_diags_count(__exec, __n, __n_sorted, __i, __n_iter); - __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); + __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t( + __exec, 0, get_max_base_diags_count(__exec, __n_iter, __n, __n_sorted)); #if TRACE_BASE_DIAGS sycl::ext::oneapi::experimental::printf(fmt_create_diags_storage, __n_sorted, __max_base_diags_count); @@ -667,7 +678,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name 
static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); } - const nd_range_params __nd_range_params_this = fnc_eval_nd_range(__exec, __n, __n_sorted); + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); + const nd_range_params __nd_range_params_this = + eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions); // Calculation of split-points on each base diagonal __event_chain = From 9e068267ef87cb3f4b19e2c0c43c32be2fee20da Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 10:09:51 +0100 Subject: [PATCH 110/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - calcs get_max_base_diags_count --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 50 ++++++++----------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index ef03a35a20d..a77ce90c5fb 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -353,16 +353,23 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } }; + // Return max number of base diagonals: + // 32 Kb of base diagonals for 256 Mb of source data + // 32 Kb / (256 Mb / __n) - for __n elements of source data std::size_t - tune_amount_of_base_diagonals(std::size_t __n_sorted) const + get_max_base_diags_count(std::size_t __n) const { + constexpr std::size_t __max_data_size = 256 * 1024 * 1024; // 256 Mb + + assert(__n <= __max_data_size); + // TODO required to evaluate this value based on available SLM size for each work-group. const std::size_t __base_diag_count = 32 * 1'024; // Multiply work per item by a power of 2 to reach the desired number of iterations. // __dpl_bit_ceil rounds the ratio up to the next power of 2. 
const std::size_t __k = oneapi::dpl::__internal::__dpl_bit_ceil( - oneapi::dpl::__internal::__dpl_ceiling_div(256 * 1024 * 1024, __n_sorted)); + oneapi::dpl::__internal::__dpl_ceiling_div(__max_data_size, __n)); return oneapi::dpl::__internal::__dpl_ceiling_div(__base_diag_count, __k); } @@ -370,7 +377,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Calculate nd-range params template nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size, const _IndexT __portions) const + eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size) const { const bool __is_cpu = __exec.queue().get_device().is_cpu(); const _IndexT __chunk = __is_cpu ? 32 : 4; @@ -379,26 +386,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name _IndexT __base_diag_count = tune_amount_of_base_diagonals(__rng_size); _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - return {__base_diag_count * __portions, __steps_between_two_base_diags, __chunk, __steps * __portions}; - } - - template - std::size_t - get_max_base_diags_count(_ExecutionPolicy&& __exec, const std::int64_t __n_iter, const _IndexT __n, - _IndexT __n_sorted) const - { - std::size_t __max_base_diags_count = 0; - - if (__n_iter > 0) - { - __n_sorted = __n_sorted << (__n_iter - 1); - - const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); - __max_base_diags_count = - eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions).base_diag_count; - } - - return __max_base_diags_count; + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; } template @@ -626,7 +614,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name using __value_type = oneapi::dpl::__internal::__value_t<_Range>; // Calculate nd-range params - const nd_range_params __nd_range_params = 
eval_nd_range_params(__exec, __n, 1); + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n); using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _merge_split_point_t>; @@ -641,6 +629,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // shared_ptr instance to return it in __future and extend the lifetime of the storage. std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base; + // Max amount of base diagonals + const std::size_t __max_base_diags_count = get_max_base_diags_count(__n); + for (std::int64_t __i = 0; __i < __n_iter; ++__i) { #if MERGE_SORT_DISPLAY_STATISTIC @@ -666,8 +657,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (nullptr == __p_base_diagonals_sp_global_storage) { // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t( - __exec, 0, get_max_base_diags_count(__exec, __n_iter, __n, __n_sorted)); + __p_base_diagonals_sp_global_storage = + new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); #if TRACE_BASE_DIAGS sycl::ext::oneapi::experimental::printf(fmt_create_diags_storage, __n_sorted, __max_base_diags_count); @@ -678,9 +669,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); } + nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted)); + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); - const nd_range_params __nd_range_params_this = - eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions); + __nd_range_params_this.base_diag_count *= __portions; + __nd_range_params_this.steps *= __portions; + 
assert(__nd_range_params_this.base_diag_count <= __max_base_diags_count); // Calculation of split-points on each base diagonal __event_chain = From 6b186fbcd0003d5cf6e022472bbb49a52a8fa4e2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 10:10:41 +0100 Subject: [PATCH 111/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - trace Kernel's execution time info --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index a77ce90c5fb..743345df354 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -637,6 +637,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name #if MERGE_SORT_DISPLAY_STATISTIC const auto __start_time = std::chrono::high_resolution_clock::now(); bool __new_impl_ran = false; + std::size_t __base_diags_count_evaluated = 0; + std::size_t __base_diags_count_evaluated_for_each_2_n_sorted = 0; #endif #if !MERGE_SORT_EXCLUDE_NEW_IMPL @@ -650,7 +652,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name #if !MERGE_SORT_EXCLUDE_NEW_IMPL else { -#if MERGE_SORT_DISPLAY_STATISTIC +#if MERGE_SORT_DISPLAY_STATISTIC __new_impl_ran = true; #endif @@ -670,12 +672,19 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted)); +#if MERGE_SORT_DISPLAY_STATISTIC + __base_diags_count_evaluated_for_each_2_n_sorted = __nd_range_params_this.base_diag_count; +#endif const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); __nd_range_params_this.base_diag_count *= __portions; 
__nd_range_params_this.steps *= __portions; assert(__nd_range_params_this.base_diag_count <= __max_base_diags_count); +#if MERGE_SORT_DISPLAY_STATISTIC + __base_diags_count_evaluated = __nd_range_params_this.base_diag_count; +#endif + // Calculation of split-points on each base diagonal __event_chain = eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, @@ -694,7 +703,10 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const auto __elapsed = __stop_time - __start_time; if (__new_impl_ran) std::cout << "(N) "; - std::cout << "iteration: " << __i << " __n_sorted = " << __n_sorted << ", time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; + std::cout << "iteration: " << __i << " __n_sorted = " << __n_sorted << ", time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) "; + if (__new_impl_ran) + std::cout << "base diags for every merge matrix: " << __base_diags_count_evaluated_for_each_2_n_sorted << ", all base_diags = " << __base_diags_count_evaluated; + std::cout << std::endl; #endif __n_sorted *= 2; From bdcff5e424e908ccd6fea8f332b60096c99a8f86 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 10:14:30 +0100 Subject: [PATCH 112/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - remove duplicated creation of DropViews --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 743345df354..e71297578c6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -532,15 +532,15 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { if 
(__data_in_temp) { - __serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, - __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp), - __comp); + DropViews __views(__dst, __data_area); + __serial_merge_w(__nd_range_params, __data_area, __views, __rng, + __find_start_point_w(__data_area, __views, __comp), __comp); } else { - __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, - __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp), - __comp); + DropViews __views(__rng, __data_area); + __serial_merge_w(__nd_range_params, __data_area, __views, __dst, + __find_start_point_w(__data_area, __views, __comp), __comp); } } }); @@ -581,17 +581,17 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { if (__data_in_temp) { - __serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, - __lookup_sp(__linear_id, __nd_range_params, __data_area, - DropViews(__dst, __data_area), __comp, + DropViews __views(__dst, __data_area); + __serial_merge_w(__nd_range_params, __data_area, __views, __rng, + __lookup_sp(__linear_id, __nd_range_params, __data_area, __views, __comp, __base_diagonals_sp_global_ptr), __comp); } else { - __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, - __lookup_sp(__linear_id, __nd_range_params, __data_area, - DropViews(__rng, __data_area), __comp, + DropViews __views(__rng, __data_area); + __serial_merge_w(__nd_range_params, __data_area, __views, __dst, + __lookup_sp(__linear_id, __nd_range_params, __data_area, __views, __comp, __base_diagonals_sp_global_ptr), __comp); } From 984e7cfcf99cdce2fb8e420e6d8d1039681ddfb6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 10:16:44 +0100 Subject: [PATCH 113/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix comment in __lookup_sp --- 
.../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index e71297578c6..def3fb456c1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -477,10 +477,10 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // ^ ^ ^ ^ ^ ^ ^ | ^ ^ ^ ^ ^ // --- 0 1 2 3 4 5 6 | 7 8 9 10 11 < Indexes in the base diagonal's SP storage // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 20 21 | 23 24 25 26 27 28 29 30 31 32 33 34 35 36 < Linear IDs: __linear_id_in_steps_range - // ^ | | | - // | __sp_left | __sp_right - // | | - // | __linear_id_in_steps_range + // ^ | | | + // | __sp_left | __sp_right + // | | + // | __linear_id_in_steps_range // We don't save the first diagonal into base diagonal's SP storage !!! 
const std::size_t __diagonal_idx = __linear_id_in_steps_range / __nd_range_params.steps_between_two_base_diags; From 87bc9446ec7370c4637444d3698dbce5d767a444 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 10:43:01 +0100 Subject: [PATCH 114/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - trace merge Kernel's execution time info --- .../dpcpp/parallel_backend_sycl_merge.h | 56 ++++++++++++++++++- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9ebd6c4b42b..ce25e720d04 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -25,6 +25,13 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" +#define MERGE_DISPLAY_STATISTIC 1 +#define MERGE_EXCLUDE_NEW_IMPL 0 + +#if MERGE_DISPLAY_STATISTIC +#include +#endif + namespace oneapi { namespace dpl @@ -395,42 +402,85 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; const std::size_t __n = __rng1.size() + __rng2.size(); +#if !MERGE_EXCLUDE_NEW_IMPL if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) +#endif { +#if MERGE_DISPLAY_STATISTIC + const auto __start_time = std::chrono::high_resolution_clock::now(); +#endif + using _WiIndex = std::uint32_t; static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= std::numeric_limits<_WiIndex>::max()); using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + auto __f = __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), 
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); + +#if MERGE_SORT_DISPLAY_STATISTIC + __f.wait(); + const auto __stop_time = std::chrono::high_resolution_clock::now(); + const auto __elapsed = __stop_time - __start_time; + std::cout << "__parallel_merge_submitter : merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; +#endif + + return __f; } +#if !MERGE_EXCLUDE_NEW_IMPL else { if (__n <= std::numeric_limits::max()) { +#if MERGE_DISPLAY_STATISTIC + const auto __start_time = std::chrono::high_resolution_clock::now(); +#endif + using _WiIndex = std::uint32_t; using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __diagonals_kernel_name<_CustomName, _WiIndex>>; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + auto __f = __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); + +#if MERGE_SORT_DISPLAY_STATISTIC + __f.wait(); + const auto __stop_time = std::chrono::high_resolution_clock::now(); + const auto __elapsed = __stop_time - __start_time; + std::cout << "__parallel_merge_submitter_large(std::uint32_t) : merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; +#endif + + return __f; } else { +#if MERGE_DISPLAY_STATISTIC + const auto __start_time = std::chrono::high_resolution_clock::now(); +#endif + using _WiIndex = std::uint64_t; using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __diagonals_kernel_name<_CustomName, _WiIndex>>; using _MergeKernelName = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + auto __f = __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); + +#if MERGE_SORT_DISPLAY_STATISTIC + __f.wait(); + const auto __stop_time = std::chrono::high_resolution_clock::now(); + const auto __elapsed = __stop_time - __start_time; + std::cout << "__parallel_merge_submitter_large(std::uint64_t) : merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; +#endif + + return __f; } } +#endif } } // namespace __par_backend_hetero From 1b1d0bacd8adf6798f77c8b415ab3c2d735916d2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 10:47:13 +0100 Subject: [PATCH 115/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - calcs get_max_base_diags_count --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index def3fb456c1..4351bddfd71 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -383,7 +383,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const _IndexT __chunk = __is_cpu ? 
32 : 4; const _IndexT __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); - _IndexT __base_diag_count = tune_amount_of_base_diagonals(__rng_size); + _IndexT __base_diag_count = get_max_base_diags_count(__rng_size); _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; From 049bdc279b85609ba8f77ff34234689f9f94f798 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 11:04:42 +0100 Subject: [PATCH 116/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - trace merge Kernel's execution time info --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ce25e720d04..7e2c3fb9a14 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -419,7 +419,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); -#if MERGE_SORT_DISPLAY_STATISTIC +#if MERGE_DISPLAY_STATISTIC __f.wait(); const auto __stop_time = std::chrono::high_resolution_clock::now(); const auto __elapsed = __stop_time - __start_time; @@ -446,7 +446,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); -#if MERGE_SORT_DISPLAY_STATISTIC +#if MERGE_DISPLAY_STATISTIC __f.wait(); const auto __stop_time = std::chrono::high_resolution_clock::now(); const auto __elapsed = 
__stop_time - __start_time; @@ -470,7 +470,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); -#if MERGE_SORT_DISPLAY_STATISTIC +#if MERGE_DISPLAY_STATISTIC __f.wait(); const auto __stop_time = std::chrono::high_resolution_clock::now(); const auto __elapsed = __stop_time - __start_time; From 226378eb194446e8cef83ea82032913fc13941a5 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 11:08:46 +0100 Subject: [PATCH 117/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - trace merge Kernel's execution time info --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 7e2c3fb9a14..b4ec3aeafaa 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -423,7 +423,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy __f.wait(); const auto __stop_time = std::chrono::high_resolution_clock::now(); const auto __elapsed = __stop_time - __start_time; - std::cout << "__parallel_merge_submitter : merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; + std::cout << "__parallel_merge_submitter : __n = " << __n << ", merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; #endif return __f; @@ -450,7 +450,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy __f.wait(); const auto __stop_time = std::chrono::high_resolution_clock::now(); const auto __elapsed = __stop_time - __start_time; - std::cout << 
"__parallel_merge_submitter_large(std::uint32_t) : merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; + std::cout << "__parallel_merge_submitter_large(std::uint32_t) : __n = " << __n << ", merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; #endif return __f; @@ -474,7 +474,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy __f.wait(); const auto __stop_time = std::chrono::high_resolution_clock::now(); const auto __elapsed = __stop_time - __start_time; - std::cout << "__parallel_merge_submitter_large(std::uint64_t) : merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; + std::cout << "__parallel_merge_submitter_large(std::uint64_t) : __n = " << __n << ", merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; #endif return __f; From 570e52b4782d2cab8eac995520856bd874550563 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 17:56:41 +0100 Subject: [PATCH 118/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - log __n_sorted state in __merge_sort_global_submitter::operator() --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 75f19507c05..7afa02aa364 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -615,6 +615,10 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name using __value_type = oneapi::dpl::__internal::__value_t<_Range>; +#if MERGE_SORT_DISPLAY_STATISTIC + std::cout << "__merge_sort_global_submitter::operator() : __n_sorted = " << __n_sorted << std::endl; +#endif + // 
Calculate nd-range params const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n, __n_sorted); From e844f9372125e30100b90c05b6be15a56232c9b3 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 18:08:04 +0100 Subject: [PATCH 119/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - declare __n_sorted as const in __merge_sort_global_submitter::eval_nd_range_params --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 7afa02aa364..1dd15144f85 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -377,7 +377,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Calculate nd-range params template nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size, _IndexT __n_sorted) const + eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size, const _IndexT __n_sorted) const { const bool __is_cpu = __exec.queue().get_device().is_cpu(); // The chunk size must not exceed two sorted sub-sequences to be merged, From c093ac057c977ecb6faad5e9f3659d472b26bdcc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 18:56:38 +0100 Subject: [PATCH 120/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix comments for get_max_base_diags_count --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 1dd15144f85..02d3af7c11c 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -354,8 +354,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name }; // Return max number of base diagonals: - // 32 Kb of base diagonals for 256 Mb of source data - // 32 Kb / (256 Mb / __n) - for __n elements of source data + // - we empirically found that 32 Kb of base diagonals well fit for 256 Mb of source data std::size_t get_max_base_diags_count(std::size_t __n) const { From d43bf5b2b79d5ea95d6efc8db3af279849f91336 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 21 Jan 2025 18:59:14 +0100 Subject: [PATCH 121/144] Switch off statistics log for merge and merge_sort --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index b4ec3aeafaa..b0379d86b69 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -25,7 +25,7 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" -#define MERGE_DISPLAY_STATISTIC 1 +//#define MERGE_DISPLAY_STATISTIC 1 #define MERGE_EXCLUDE_NEW_IMPL 0 #if MERGE_DISPLAY_STATISTIC diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 02d3af7c11c..3755676fd9a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -44,7 +44,7 @@ const __SYCL_CONSTANT_AS char fmt_eval_base_diag[] = "Evaluate base diag: __n_so const __SYCL_CONSTANT_AS char 
fmt_lookup_sp[] = "Lookup sp: __linear_id = %8d, sp_left[%8d] = (%8d, %8d), sp_right[%8d] = (%8d, %8d)\n"; #endif -#define MERGE_SORT_DISPLAY_STATISTIC 1 +//#define MERGE_SORT_DISPLAY_STATISTIC 1 #define MERGE_SORT_EXCLUDE_NEW_IMPL 0 #if MERGE_SORT_DISPLAY_STATISTIC From a273ae5b4bdaf87831092947631ced221b03cb50 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 22 Jan 2025 10:31:04 +0100 Subject: [PATCH 122/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - remove hard-coded numbers from get_max_base_diags_count --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 3755676fd9a..e8a46622baf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -353,24 +353,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } }; - // Return max number of base diagonals: - // - we empirically found that 32 Kb of base diagonals well fit for 256 Mb of source data + template std::size_t - get_max_base_diags_count(std::size_t __n) const + get_max_base_diags_count(_ExecutionPolicy&& __exec, const _IndexT __chunk, std::size_t __n) const { - constexpr std::size_t __max_data_size = 256 * 1024 * 1024; // 256 Mb - - assert(__n <= __max_data_size); - - // TODO required to evaluate this value based on available SLM size for each work-group. - const std::size_t __base_diag_count = 32 * 1'024; - - // Multiply work per item by a power of 2 to reach the desired number of iterations. - // __dpl_bit_ceil rounds the ratio up to the next power of 2. 
- const std::size_t __k = oneapi::dpl::__internal::__dpl_bit_ceil( - oneapi::dpl::__internal::__dpl_ceiling_div(__max_data_size, __n)); - - return oneapi::dpl::__internal::__dpl_ceiling_div(__base_diag_count, __k); + const std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + return oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk * __max_wg_size); } // Calculate nd-range params @@ -384,7 +372,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const _IndexT __chunk = std::min<_IndexT>(__is_cpu ? 32 : 4, __n_sorted * 2); const _IndexT __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); - _IndexT __base_diag_count = get_max_base_diags_count(__rng_size); + _IndexT __base_diag_count = get_max_base_diags_count(__exec, __chunk, __n_sorted); _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; @@ -635,7 +623,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base; // Max amount of base diagonals - const std::size_t __max_base_diags_count = get_max_base_diags_count(__n); + const std::size_t __max_base_diags_count = get_max_base_diags_count(__exec, __nd_range_params.chunk, __n); for (std::int64_t __i = 0; __i < __n_iter; ++__i) { From d35633535f53fb9ec454d6d05db400f2a9f6a682 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 22 Jan 2025 17:26:46 +0100 Subject: [PATCH 123/144] Remove debug, trace and performance check code --- .../dpcpp/parallel_backend_sycl_merge.h | 56 +----------- .../dpcpp/parallel_backend_sycl_merge_sort.h | 85 ------------------- 2 files changed, 3 insertions(+), 138 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index b0379d86b69..9ebd6c4b42b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -25,13 +25,6 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" -//#define MERGE_DISPLAY_STATISTIC 1 -#define MERGE_EXCLUDE_NEW_IMPL 0 - -#if MERGE_DISPLAY_STATISTIC -#include -#endif - namespace oneapi { namespace dpl @@ -402,85 +395,42 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; const std::size_t __n = __rng1.size() + __rng2.size(); -#if !MERGE_EXCLUDE_NEW_IMPL if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) -#endif { -#if MERGE_DISPLAY_STATISTIC - const auto __start_time = std::chrono::high_resolution_clock::now(); -#endif - using _WiIndex = std::uint32_t; static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= std::numeric_limits<_WiIndex>::max()); using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; - auto __f = __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); - -#if MERGE_DISPLAY_STATISTIC - __f.wait(); - const auto __stop_time = std::chrono::high_resolution_clock::now(); - const auto __elapsed = __stop_time - __start_time; - std::cout << "__parallel_merge_submitter : __n = " << __n << ", merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; -#endif - - return __f; } -#if !MERGE_EXCLUDE_NEW_IMPL else { if (__n <= std::numeric_limits::max()) { -#if MERGE_DISPLAY_STATISTIC - const auto __start_time = 
std::chrono::high_resolution_clock::now(); -#endif - using _WiIndex = std::uint32_t; using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __diagonals_kernel_name<_CustomName, _WiIndex>>; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name_large<_CustomName, _WiIndex>>; - auto __f = __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); - -#if MERGE_DISPLAY_STATISTIC - __f.wait(); - const auto __stop_time = std::chrono::high_resolution_clock::now(); - const auto __elapsed = __stop_time - __start_time; - std::cout << "__parallel_merge_submitter_large(std::uint32_t) : __n = " << __n << ", merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; -#endif - - return __f; } else { -#if MERGE_DISPLAY_STATISTIC - const auto __start_time = std::chrono::high_resolution_clock::now(); -#endif - using _WiIndex = std::uint64_t; using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __diagonals_kernel_name<_CustomName, _WiIndex>>; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name_large<_CustomName, _WiIndex>>; - auto __f = __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); - -#if MERGE_DISPLAY_STATISTIC - __f.wait(); - const auto __stop_time = 
std::chrono::high_resolution_clock::now(); - const auto __elapsed = __stop_time - __start_time; - std::cout << "__parallel_merge_submitter_large(std::uint64_t) : __n = " << __n << ", merge time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; -#endif - - return __f; } } -#endif } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index e8a46622baf..218fdbbb62d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -29,28 +29,6 @@ #include "../../utils_ranges.h" // __difference_t #include "parallel_backend_sycl_merge.h" // __find_start_point, __serial_merge -// TODO remove before merge into main branch -#define TRACE_BASE_DIAGS 0 - -#if TRACE_BASE_DIAGS -# ifdef __SYCL_DEVICE_ONLY__ -# define __SYCL_CONSTANT_AS __attribute__((opencl_constant)) -# else -# define __SYCL_CONSTANT_AS -# endif - -const __SYCL_CONSTANT_AS char fmt_create_diags_storage[] = "Create base diagonals storage: __n_sorted = %8d, storage size = [%8d]\n"; -const __SYCL_CONSTANT_AS char fmt_eval_base_diag[] = "Evaluate base diag: __n_sorted = %8d, __data_in_temp = %s, sp[%8d] = (%8d, %8d)\n"; -const __SYCL_CONSTANT_AS char fmt_lookup_sp[] = "Lookup sp: __linear_id = %8d, sp_left[%8d] = (%8d, %8d), sp_right[%8d] = (%8d, %8d)\n"; -#endif - -//#define MERGE_SORT_DISPLAY_STATISTIC 1 -#define MERGE_SORT_EXCLUDE_NEW_IMPL 0 - -#if MERGE_SORT_DISPLAY_STATISTIC -#include -#endif - namespace oneapi { namespace dpl @@ -441,11 +419,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name ? 
__find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp) : __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp)) : _merge_split_point_t{__data_area.n1, __data_area.n2}; - -#if TRACE_BASE_DIAGS - sycl::ext::oneapi::experimental::printf(fmt_eval_base_diag, __n_sorted, __data_in_temp ? "T" : "F", __linear_id, __sp.first, __sp.second); -#endif - __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); }); @@ -476,11 +449,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const _merge_split_point_t __sp_left = __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; - -#if TRACE_BASE_DIAGS - sycl::ext::oneapi::experimental::printf(fmt_lookup_sp, __linear_id_in_steps_range, __diagonal_idx - 1, __sp_left.first, __sp_left.second, __diagonal_idx, __sp_right.first, __sp_right.second); -#endif - const bool __is_base_diagonal = __linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags == 0; @@ -602,10 +570,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name using __value_type = oneapi::dpl::__internal::__value_t<_Range>; -#if MERGE_SORT_DISPLAY_STATISTIC - std::cout << "__merge_sort_global_submitter::operator() : __n_sorted = " << __n_sorted << std::endl; -#endif - // Calculate nd-range params const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n, __n_sorted); @@ -627,57 +591,32 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name for (std::int64_t __i = 0; __i < __n_iter; ++__i) { -#if MERGE_SORT_DISPLAY_STATISTIC - const auto __start_time = std::chrono::high_resolution_clock::now(); - bool __new_impl_ran = false; - std::size_t __base_diags_count_evaluated = 0; - std::size_t __base_diags_count_evaluated_for_each_2_n_sorted = 0; -#endif - -#if !MERGE_SORT_EXCLUDE_NEW_IMPL if 
(2 * __n_sorted < __get_starting_size_limit_for_large_submitter<__value_type>()) -#endif { // Process parallel merge __event_chain = run_parallel_merge(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, __comp, __nd_range_params); } -#if !MERGE_SORT_EXCLUDE_NEW_IMPL else { -#if MERGE_SORT_DISPLAY_STATISTIC - __new_impl_ran = true; -#endif - if (nullptr == __p_base_diagonals_sp_global_storage) { // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); -#if TRACE_BASE_DIAGS - sycl::ext::oneapi::experimental::printf(fmt_create_diags_storage, __n_sorted, __max_base_diags_count); -#endif - // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. __p_result_and_scratch_storage_base.reset( static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); } nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __n_sorted); -#if MERGE_SORT_DISPLAY_STATISTIC - __base_diags_count_evaluated_for_each_2_n_sorted = __nd_range_params_this.base_diag_count; -#endif const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); __nd_range_params_this.base_diag_count *= __portions; __nd_range_params_this.steps *= __portions; assert(__nd_range_params_this.base_diag_count <= __max_base_diags_count); -#if MERGE_SORT_DISPLAY_STATISTIC - __base_diags_count_evaluated = __nd_range_params_this.base_diag_count; -#endif - // Calculation of split-points on each base diagonal __event_chain = eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, @@ -688,19 +627,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __rng, __temp_buf, __comp, __nd_range_params_this, 
*__p_base_diagonals_sp_global_storage); } -#endif - -#if MERGE_SORT_DISPLAY_STATISTIC - __event_chain.wait_and_throw(); - const auto __stop_time = std::chrono::high_resolution_clock::now(); - const auto __elapsed = __stop_time - __start_time; - if (__new_impl_ran) - std::cout << "(N) "; - std::cout << "iteration: " << __i << " __n_sorted = " << __n_sorted << ", time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) "; - if (__new_impl_ran) - std::cout << "base diags for every merge matrix: " << __base_diags_count_evaluated_for_each_2_n_sorted << ", all base_diags = " << __base_diags_count_evaluated; - std::cout << std::endl; -#endif __n_sorted *= 2; __data_in_temp = !__data_in_temp; @@ -773,20 +699,9 @@ __merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSo sycl::queue __q = __exec.queue(); -#if MERGE_SORT_DISPLAY_STATISTIC - const auto __start_time = std::chrono::high_resolution_clock::now(); -#endif - // 1. Perform sorting of the leaves of the merge sort tree sycl::event __event_leaf_sort = __merge_sort_leaf_submitter<_LeafSortKernel>()(__q, __rng, __comp, __leaf_sorter); -#if MERGE_SORT_DISPLAY_STATISTIC - __event_leaf_sort.wait_and_throw(); - const auto __stop_time = std::chrono::high_resolution_clock::now(); - const auto __elapsed = __stop_time - __start_time; - std::cout << "leaf sorter: time = " << std::chrono::duration_cast(__elapsed).count() << " (mcs) " << std::endl; -#endif - // 2. 
Merge sorting oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _Tp> __temp(__exec, __rng.size()); auto __temp_buf = __temp.get_buffer(); From 3bf5faf3e4aa22232b3e218c35121c506fd1fe1b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 23 Jan 2025 16:05:35 +0100 Subject: [PATCH 124/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - additional assert in __merge_sort_global_submitter::operator() --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 218fdbbb62d..3b416fc1a27 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -612,6 +612,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __n_sorted); + // Check that each base diagonal started from beginning of merge matrix + assert(0 == (2 * __n_sorted) % (__nd_range_params_this.steps_between_two_base_diags * __nd_range_params_this.chink); + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); __nd_range_params_this.base_diag_count *= __portions; __nd_range_params_this.steps *= __portions; From 38651996d1dab0ab38dcf3203bd29f529692d5f7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 23 Jan 2025 17:01:17 +0100 Subject: [PATCH 125/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - additional assert in __merge_sort_global_submitter::operator() --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 3b416fc1a27..1a57321e0f6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -613,7 +613,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __n_sorted); // Check that each base diagonal started from beginning of merge matrix - assert(0 == (2 * __n_sorted) % (__nd_range_params_this.steps_between_two_base_diags * __nd_range_params_this.chink); + assert(0 == (2 * __n_sorted) % (__nd_range_params_this.steps_between_two_base_diags * __nd_range_params_this.chink)); const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); __nd_range_params_this.base_diag_count *= __portions; From e080851b7d8feca56fb084c327e4a8f7e379aac0 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 24 Jan 2025 09:33:32 +0100 Subject: [PATCH 126/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: using _IndexT instead of decltype(__data_area.n1) --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 1a57321e0f6..239ab6c9bba 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -360,8 +360,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name inline static _merge_split_point_t __find_start_point_w(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) { - return __find_start_point(__views.rng1, 
decltype(__data_area.n1){0}, __data_area.n1, __views.rng2, - decltype(__data_area.n2){0}, __data_area.n2, __data_area.i_elem_local, __comp); + return __find_start_point(__views.rng1, _IndexT{0}, __data_area.n1, __views.rng2, _IndexT{0}, __data_area.n2, + __data_area.i_elem_local, __comp); } template From d78363344cb3868fbe1bfaa50da2f0dcb9b68ecd Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 24 Jan 2025 09:35:07 +0100 Subject: [PATCH 127/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: error in comments --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 239ab6c9bba..0670b5ab389 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -397,7 +397,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); __cgh.parallel_for<_DiagonalsKernelName...>( - // +1 doesn't required here, because we need to calculate split points for each base diagonal + // +1 is not required here, because we need to calculate split points for each base diagonal // and for the right base diagonal in the last work-group but we can keep it one position to the left // because we know that for 0-diagonal the split point is { 0, 0 }. 
sycl::range(__nd_range_params.base_diag_count /*+ 1*/), From 5ede9907074424bbfa104cff99b02843d656b386 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 24 Jan 2025 09:42:47 +0100 Subject: [PATCH 128/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: replace some universal reference by ref / const ref --- .../hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 0670b5ab389..eb22885c3ee 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -333,7 +333,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name template std::size_t - get_max_base_diags_count(_ExecutionPolicy&& __exec, const _IndexT __chunk, std::size_t __n) const + get_max_base_diags_count(const _ExecutionPolicy& __exec, const _IndexT __chunk, std::size_t __n) const { const std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); return oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk * __max_wg_size); @@ -342,7 +342,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Calculate nd-range params template nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size, const _IndexT __n_sorted) const + eval_nd_range_params(const _ExecutionPolicy& __exec, const std::size_t __rng_size, const _IndexT __n_sorted) const { const bool __is_cpu = __exec.queue().get_device().is_cpu(); // The chunk size must not exceed two sorted sub-sequences to be merged, @@ -378,8 +378,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name template sycl::event 
eval_split_points_for_groups(const sycl::event& __event_chain, const _IndexT __n_sorted, const bool __data_in_temp, - _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, - const nd_range_params& __nd_range_params, + const _ExecutionPolicy& __exec, const _Range& __rng, _TempBuf& __temp_buf, + _Compare __comp, const nd_range_params& __nd_range_params, _Storage& __base_diagonals_sp_global_storage) const { const _IndexT __n = __rng.size(); @@ -467,7 +467,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name template sycl::event run_parallel_merge(const sycl::event& __event_chain, const _IndexT __n_sorted, const bool __data_in_temp, - _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, + const _ExecutionPolicy& __exec, _Range& __rng, _TempBuf& __temp_buf, _Compare __comp, const nd_range_params& __nd_range_params) const { const _IndexT __n = __rng.size(); @@ -508,7 +508,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name template sycl::event run_parallel_merge_from_diagonals(const sycl::event& __event_chain, const _IndexT __n_sorted, - const bool __data_in_temp, _ExecutionPolicy&& __exec, _Range&& __rng, + const bool __data_in_temp, const _ExecutionPolicy& __exec, _Range& __rng, _TempBuf& __temp_buf, _Compare __comp, const nd_range_params& __nd_range_params, _Storage& __base_diagonals_sp_global_storage) const { From 6f2b9edcb8bdcd0306a81f2d582147662dbb0430 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 24 Jan 2025 09:44:56 +0100 Subject: [PATCH 129/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix compile error --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 
eb22885c3ee..029b6c0cce9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -613,7 +613,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __n_sorted); // Check that each base diagonal started from beginning of merge matrix - assert(0 == (2 * __n_sorted) % (__nd_range_params_this.steps_between_two_base_diags * __nd_range_params_this.chink)); + assert(0 == (2 * __n_sorted) % (__nd_range_params_this.steps_between_two_base_diags * __nd_range_params_this.chunk)); const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); __nd_range_params_this.base_diag_count *= __portions; From 2ee3d7e80ed14cd0282e05377077300d00a70bed Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 24 Jan 2025 11:57:25 +0100 Subject: [PATCH 130/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - remove __chunk calculation from Kernel code in eval_split_points_for_groups --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 029b6c0cce9..bb5e58966a0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -396,6 +396,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); + const std::size_t __chunk = __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags; + __cgh.parallel_for<_DiagonalsKernelName...>( // +1 is not required here, because we 
need to calculate split points for each base diagonal // and for the right base diagonal in the last work-group but we can keep it one position to the left @@ -409,9 +411,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // We should add `1` to __linear_id here to avoid calculation of split-point for 0-diagonal // Please see additional explanations in the __lookup_sp function below. - const WorkDataArea __data_area(__n, __n_sorted, __linear_id + 1, - __nd_range_params.chunk * - __nd_range_params.steps_between_two_base_diags); + const WorkDataArea __data_area(__n, __n_sorted, __linear_id + 1, __chunk); const auto __sp = __data_area.is_i_elem_local_inside_merge_matrix() From 7488c94cd47437fb928deac41749027a5e166724 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 24 Jan 2025 18:15:51 +0100 Subject: [PATCH 131/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: remove condition check in __lookup_sp --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index bb5e58966a0..915fe491344 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -430,6 +430,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp, _BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr) { + constexpr _merge_split_point_t __sp_zero(0, 0); + // | subrange 0 | subrange 1 | subrange 2 | subrange 3 | subrange 4 // | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains the rest of 
data... < Data parts // |----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/--- < Steps @@ -448,19 +450,15 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const std::size_t __diagonal_idx = __linear_id_in_steps_range / __nd_range_params.steps_between_two_base_diags; const _merge_split_point_t __sp_left = __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; - const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; + _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; + __sp_right = __sp_right != __sp_zero ? __sp_right : _merge_split_point_t{__data_area.n1, __data_area.n2}; const bool __is_base_diagonal = __linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags == 0; - if (__sp_right.first + __sp_right.second > 0) - { - return __is_base_diagonal - ? __sp_left - : __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, - __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); - } - - return __find_start_point_w(__data_area, __views, __comp); + return __is_base_diagonal + ? 
__sp_left + : __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, + __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); } // Process parallel merge From f519fcc26632bf1bd5483abc31dba092a77eb967 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 09:02:48 +0100 Subject: [PATCH 132/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: It looks like we just write to the scratch buffer --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 915fe491344..cbd3da1ba60 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -391,7 +391,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc( + __base_diagonals_sp_global_storage.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); From 81a61197bd19b1982728042b317032453201d73a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 12:06:43 +0100 Subject: [PATCH 133/144] Revert "include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: remove condition check in __lookup_sp" This reverts commit 7488c94cd47437fb928deac41749027a5e166724. 
--- .../dpcpp/parallel_backend_sycl_merge_sort.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index cbd3da1ba60..c700d341989 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -430,8 +430,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp, _BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr) { - constexpr _merge_split_point_t __sp_zero(0, 0); - // | subrange 0 | subrange 1 | subrange 2 | subrange 3 | subrange 4 // | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains the rest of data... < Data parts // |----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/--- < Steps @@ -450,15 +448,19 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const std::size_t __diagonal_idx = __linear_id_in_steps_range / __nd_range_params.steps_between_two_base_diags; const _merge_split_point_t __sp_left = __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; - _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; - __sp_right = __sp_right != __sp_zero ? 
__sp_right : _merge_split_point_t{__data_area.n1, __data_area.n2}; + const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; const bool __is_base_diagonal = __linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags == 0; - return __is_base_diagonal - ? __sp_left - : __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, - __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); + if (__sp_right.first + __sp_right.second > 0) + { + return __is_base_diagonal + ? __sp_left + : __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, + __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); + } + + return __find_start_point_w(__data_area, __views, __comp); } // Process parallel merge From 1af529b6b11b6ddd3a459f0a5fd3cd8bb84ca99b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 09:36:59 +0100 Subject: [PATCH 134/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix self review comment: redesign work with base diagonals --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index c700d341989..fdf13d06530 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -240,6 +240,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name private: using _merge_split_point_t = _split_point_t<_IndexT>; + // 1 final base diagonal for save final sp(0,0) + static constexpr std::size_t __1_final_base_diag = 1; + struct nd_range_params { std::size_t base_diag_count = 0; @@ -399,19 +402,14 @@ struct __merge_sort_global_submitter<_IndexT, 
__internal::__optional_kernel_name const std::size_t __chunk = __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags; __cgh.parallel_for<_DiagonalsKernelName...>( - // +1 is not required here, because we need to calculate split points for each base diagonal - // and for the right base diagonal in the last work-group but we can keep it one position to the left - // because we know that for 0-diagonal the split point is { 0, 0 }. - sycl::range(__nd_range_params.base_diag_count /*+ 1*/), + sycl::range(__nd_range_params.base_diag_count), [=](sycl::item __item_id) { const std::size_t __linear_id = __item_id.get_linear_id(); auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - // We should add `1` to __linear_id here to avoid calculation of split-point for 0-diagonal - // Please see additional explanations in the __lookup_sp function below. - const WorkDataArea __data_area(__n, __n_sorted, __linear_id + 1, __chunk); + const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __chunk); const auto __sp = __data_area.is_i_elem_local_inside_merge_matrix() @@ -437,29 +435,35 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // | | | | | | | | | | | | | | // bd00 bd01 bd02 bd10 bd11 bd12 bd20 bd21 | bd22 bd30 bd31 bd32 bd40 < Base diagonals // ^ ^ ^ ^ ^ ^ ^ | ^ ^ ^ ^ ^ - // --- 0 1 2 3 4 5 6 | 7 8 9 10 11 < Indexes in the base diagonal's SP storage + // 0 1 2 3 4 5 6 | 7 8 9 10 11 12 xIdx < Indexes in the base diagonal's SP storage // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 20 21 | 23 24 25 26 27 28 29 30 31 32 33 34 35 36 < Linear IDs: __linear_id_in_steps_range - // ^ | | | - // | __sp_left | __sp_right - // | | - // | __linear_id_in_steps_range - // We don't save the first diagonal into base diagonal's SP storage !!! 
+ // ^ ^ ^ | | | ^ ^ ^ + // (0,0) (0,0) (0,0) __sp_left | __sp_right (0,0) (0,0) (0,0) < Every first base diagonal of sub-task is (0,0) + // | final additional split-point + // __linear_id_in_steps_range const std::size_t __diagonal_idx = __linear_id_in_steps_range / __nd_range_params.steps_between_two_base_diags; - const _merge_split_point_t __sp_left = __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; - const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const bool __is_base_diagonal = - __linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags == 0; + if (__linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags != 0) + { + const _merge_split_point_t __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; - if (__sp_right.first + __sp_right.second > 0) + // We should check this condition because the first diagonal for every next sub-task + // and additional final diagonal has split-points equal (0, 0) and we can't use them in calculations. + if (__sp_right.first + __sp_right.second > 0) + { + return __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, + __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); + } + } + else { - return __is_base_diagonal - ? __sp_left - : __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, - __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); + return __base_diagonals_sp_global_ptr[__diagonal_idx]; } + // Find split-points on final diagonals of every sub-task: their length is too short so we + // find split-points without any limitations by base diagonals. 
return __find_start_point_w(__data_area, __views, __comp); } @@ -587,7 +591,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base; // Max amount of base diagonals - const std::size_t __max_base_diags_count = get_max_base_diags_count(__exec, __nd_range_params.chunk, __n); + const std::size_t __max_base_diags_count = get_max_base_diags_count(__exec, __nd_range_params.chunk, __n) + __1_final_base_diag; for (std::int64_t __i = 0; __i < __n_iter; ++__i) { @@ -616,7 +620,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name assert(0 == (2 * __n_sorted) % (__nd_range_params_this.steps_between_two_base_diags * __nd_range_params_this.chunk)); const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); - __nd_range_params_this.base_diag_count *= __portions; + __nd_range_params_this.base_diag_count = __nd_range_params_this.base_diag_count * __portions + __1_final_base_diag; __nd_range_params_this.steps *= __portions; assert(__nd_range_params_this.base_diag_count <= __max_base_diags_count); From 3b9d57791c0d938f9c4b169b8aef4fc152616503 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 14:18:40 +0100 Subject: [PATCH 135/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment: remove postfix _w from function names inside __merge_sort_global_submitter class --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index fdf13d06530..8f3f38fb087 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -361,20 
+361,21 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name template inline static _merge_split_point_t - __find_start_point_w(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) + __find_start_point(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) { - return __find_start_point(__views.rng1, _IndexT{0}, __data_area.n1, __views.rng2, _IndexT{0}, __data_area.n2, - __data_area.i_elem_local, __comp); + return oneapi::dpl::__par_backend_hetero::__find_start_point(__views.rng1, _IndexT{0}, __data_area.n1, + __views.rng2, _IndexT{0}, __data_area.n2, + __data_area.i_elem_local, __comp); } template inline static void - __serial_merge_w(const nd_range_params& __nd_range_params, const WorkDataArea& __data_area, - const DropViews& __views, _Rng& __rng, const _merge_split_point_t& __sp, _Compare __comp) + __serial_merge(const nd_range_params& __nd_range_params, const WorkDataArea& __data_area, + const DropViews& __views, _Rng& __rng, const _merge_split_point_t& __sp, _Compare __comp) { - __serial_merge(__views.rng1, __views.rng2, __rng /* rng3 */, __sp.first /* start1 */, __sp.second /* start2 */, - __data_area.i_elem /* start3 */, __nd_range_params.chunk, __data_area.n1, __data_area.n2, - __comp); + oneapi::dpl::__par_backend_hetero::__serial_merge( + __views.rng1, __views.rng2, __rng /* rng3 */, __sp.first /* start1 */, __sp.second /* start2 */, + __data_area.i_elem /* start3 */, __nd_range_params.chunk, __data_area.n1, __data_area.n2, __comp); } // Calculation of split points on each base diagonal @@ -414,8 +415,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const auto __sp = __data_area.is_i_elem_local_inside_merge_matrix() ? (__data_in_temp - ? __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp) - : __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp)) + ? 
__find_start_point(__data_area, DropViews(__dst, __data_area), __comp) + : __find_start_point(__data_area, DropViews(__rng, __data_area), __comp)) : _merge_split_point_t{__data_area.n1, __data_area.n2}; __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); @@ -453,8 +454,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // and additional final diagonal has split-points equal (0, 0) and we can't use them in calculations. if (__sp_right.first + __sp_right.second > 0) { - return __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, - __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); + return oneapi::dpl::__par_backend_hetero::__find_start_point( + __views.rng1, __sp_left.first, __sp_right.first, __views.rng2, __sp_left.second, __sp_right.second, + __data_area.i_elem_local, __comp); } } else @@ -464,7 +466,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Find split-points on final diagonals of every sub-task: their length is too short so we // find split-points without any limitations by base diagonals. 
- return __find_start_point_w(__data_area, __views, __comp); + return __find_start_point(__data_area, __views, __comp); } // Process parallel merge @@ -494,14 +496,14 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (__data_in_temp) { DropViews __views(__dst, __data_area); - __serial_merge_w(__nd_range_params, __data_area, __views, __rng, - __find_start_point_w(__data_area, __views, __comp), __comp); + __serial_merge(__nd_range_params, __data_area, __views, __rng, + __find_start_point(__data_area, __views, __comp), __comp); } else { DropViews __views(__rng, __data_area); - __serial_merge_w(__nd_range_params, __data_area, __views, __dst, - __find_start_point_w(__data_area, __views, __comp), __comp); + __serial_merge(__nd_range_params, __data_area, __views, __dst, + __find_start_point(__data_area, __views, __comp), __comp); } } }); @@ -543,18 +545,18 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (__data_in_temp) { DropViews __views(__dst, __data_area); - __serial_merge_w(__nd_range_params, __data_area, __views, __rng, - __lookup_sp(__linear_id, __nd_range_params, __data_area, __views, __comp, - __base_diagonals_sp_global_ptr), - __comp); + __serial_merge(__nd_range_params, __data_area, __views, __rng, + __lookup_sp(__linear_id, __nd_range_params, __data_area, __views, __comp, + __base_diagonals_sp_global_ptr), + __comp); } else { DropViews __views(__rng, __data_area); - __serial_merge_w(__nd_range_params, __data_area, __views, __dst, - __lookup_sp(__linear_id, __nd_range_params, __data_area, __views, __comp, - __base_diagonals_sp_global_ptr), - __comp); + __serial_merge(__nd_range_params, __data_area, __views, __dst, + __lookup_sp(__linear_id, __nd_range_params, __data_area, __views, __comp, + __base_diagonals_sp_global_ptr), + __comp); } } }); From 99fa2d05e3de5c3964f5e63eaa256951de5edaec Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 14:20:41 +0100 Subject: [PATCH 
136/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix self review comment: redesign work with base diagonals --- .../hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 8f3f38fb087..a1e8e928bdc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -458,15 +458,13 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __views.rng1, __sp_left.first, __sp_right.first, __views.rng2, __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); } - } - else - { - return __base_diagonals_sp_global_ptr[__diagonal_idx]; + + // Find split-points on final diagonals of every sub-task: their length is too short so we + // find split-points without any limitations by base diagonals. + return __find_start_point(__data_area, __views, __comp); } - // Find split-points on final diagonals of every sub-task: their length is too short so we - // find split-points without any limitations by base diagonals. 
- return __find_start_point(__data_area, __views, __comp); + return __base_diagonals_sp_global_ptr[__diagonal_idx]; } // Process parallel merge From 3b6140f35750409247784bf205f7cd68b2c6dbd3 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 19:14:08 +0100 Subject: [PATCH 137/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - additional comment --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index a1e8e928bdc..7d7c85204cc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -464,6 +464,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name return __find_start_point(__data_area, __views, __comp); } + // We are on base diagonal so just simple return split-point from them return __base_diagonals_sp_global_ptr[__diagonal_idx]; } From 0859092d599e2824990d86f960b31ae311f2e66e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 19:16:02 +0100 Subject: [PATCH 138/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - additional comment --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 7d7c85204cc..9ac3e959704 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -447,6 +447,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (__linear_id_in_steps_range % 
__nd_range_params.steps_between_two_base_diags != 0) { + // We are between two base diagonals (__sp_left, __sp_right) const _merge_split_point_t __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; From 70d64a065284756f0f8f9592add811d50ed0d131 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 19:30:48 +0100 Subject: [PATCH 139/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 9ac3e959704..c2647f96768 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -240,9 +240,6 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name private: using _merge_split_point_t = _split_point_t<_IndexT>; - // 1 final base diagonal for save final sp(0,0) - static constexpr std::size_t __1_final_base_diag = 1; - struct nd_range_params { std::size_t base_diag_count = 0; @@ -569,6 +566,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name operator()(_ExecutionPolicy&& __exec, _Range& __rng, _Compare __comp, _LeafSizeT __leaf_size, _TempBuf& __temp_buf, sycl::event __event_chain) const { + // 1 final base diagonal for save final sp(0,0) + constexpr std::size_t __1_final_base_diag = 1; + const _IndexT __n = __rng.size(); _IndexT __n_sorted = __leaf_size; From 58ed040103d448f6261ed206fb247b6710b6ba47 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 19:52:03 +0100 Subject: [PATCH 140/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - 
fix review comment --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index c2647f96768..b55089cacbc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -459,7 +459,10 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Find split-points on final diagonals of every sub-task: their length is too short so we // find split-points without any limitations by base diagonals. - return __find_start_point(__data_area, __views, __comp); + // - we use here (__data_area.n1, __data_area.m2) instead of __sp_right + return oneapi::dpl::__par_backend_hetero::__find_start_point(__views.rng1, __sp_left.first, __data_area.n1, + __views.rng2, __sp_left.second, __data_area.n2, + __data_area.i_elem_local, __comp); } // We are on base diagonal so just simple return split-point from them From f62ace72d31ce6c1fb399da1bfeb4e1bd3993e44 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 20:12:38 +0100 Subject: [PATCH 141/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix review comment --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index b55089cacbc..88645578f6e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -420,6 +420,18 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name }); } + template 
+ inline static _merge_split_point_t + __get_right_sp(_BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr, const std::size_t __diagonal_idx, + const WorkDataArea& __data_area) + { + _merge_split_point_t __result = __base_diagonals_sp_global_ptr[__diagonal_idx]; + __result = + __result.first + __result.second > 0 ? __result : _merge_split_point_t{__data_area.n1, __data_area.n2}; + + return __result; + } + template inline static _merge_split_point_t __lookup_sp(const std::size_t __linear_id_in_steps_range, const nd_range_params& __nd_range_params, @@ -446,23 +458,11 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { // We are between two base diagonals (__sp_left, __sp_right) const _merge_split_point_t __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; - - // We should check this condition because the first diagonal for every next sub-task - // and additional final diagonal has split-points equal (0, 0) and we can't use them in calculations. - if (__sp_right.first + __sp_right.second > 0) - { - return oneapi::dpl::__par_backend_hetero::__find_start_point( - __views.rng1, __sp_left.first, __sp_right.first, __views.rng2, __sp_left.second, __sp_right.second, - __data_area.i_elem_local, __comp); - } + const _merge_split_point_t __sp_right = __get_right_sp(__base_diagonals_sp_global_ptr, __diagonal_idx + 1, __data_area); - // Find split-points on final diagonals of every sub-task: their length is too short so we - // find split-points without any limitations by base diagonals. 
- // - we use here (__data_area.n1, __data_area.m2) instead of __sp_right - return oneapi::dpl::__par_backend_hetero::__find_start_point(__views.rng1, __sp_left.first, __data_area.n1, - __views.rng2, __sp_left.second, __data_area.n2, - __data_area.i_elem_local, __comp); + return oneapi::dpl::__par_backend_hetero::__find_start_point( + __views.rng1, __sp_left.first, __sp_right.first, __views.rng2, __sp_left.second, __sp_right.second, + __data_area.i_elem_local, __comp); } // We are on base diagonal so just simple return split-point from them From c39878bcaad4adaf17e5b46bb82b9fafd5fef059 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 20:14:48 +0100 Subject: [PATCH 142/144] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - additional TODO --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 88645578f6e..4017962906d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -600,6 +600,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name for (std::int64_t __i = 0; __i < __n_iter; ++__i) { + // TODO required to re-check threshold data size if (2 * __n_sorted < __get_starting_size_limit_for_large_submitter<__value_type>()) { // Process parallel merge From 217a39baf16fe78e1c7ba2f7263dfeb8c0a15127 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 27 Jan 2025 20:25:09 +0100 Subject: [PATCH 143/144] Fix review comments: test large data sizes for merge_sort on CPU too --- .../parallel_api/algorithm/alg.merge/merge.pass.cpp | 13 +++++-------- .../algorithm/alg.sorting/sort.pass.cpp | 13 +++++-------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git 
a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp index e41f74005e7..4f4ba75a369 100644 --- a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp +++ b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp @@ -186,15 +186,12 @@ main() auto fstep_small = [](std::size_t size){ return size <= 16 ? size + 1 : size_t(3.1415 * size);}; test_merge_by_type(start_size_small, max_size_small, fstep_small); - // Large data sizes (on GPU only) + // Large data sizes #if TEST_DPCPP_BACKEND_PRESENT - if (!TestUtils::get_test_queue().get_device().is_cpu()) - { - const size_t start_size_large = 4'000'000; - const size_t max_size_large = 8'000'000; - auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; - test_merge_by_type(start_size_large, max_size_large, fstep_large); - } + const size_t start_size_large = 4'000'000; + const size_t max_size_large = 8'000'000; + auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; + test_merge_by_type(start_size_large, max_size_large, fstep_large); #endif #if !TEST_DPCPP_BACKEND_PRESENT diff --git a/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp b/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp index 500e8f43035..09dad8c43f3 100644 --- a/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp +++ b/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp @@ -488,16 +488,13 @@ main() test_sort<100>(start_size_small, max_size_small, fstep_small); - // Large data sizes (on GPU only) + // Large data sizes #if TEST_DPCPP_BACKEND_PRESENT - if (!TestUtils::get_test_queue().get_device().is_cpu()) - { - const size_t start_size_large = 4'000'000; - const size_t max_size_large = 8'000'000; - auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; + const size_t start_size_large = 4'000'000; + const size_t max_size_large = 8'000'000; + auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; - test_sort<200>(start_size_large, 
max_size_large, fstep_large); - } + test_sort<200>(start_size_large, max_size_large, fstep_large); #endif } From 0a6394824be615a579ca6b968d7cfee1777428e9 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 28 Jan 2025 09:35:27 +0100 Subject: [PATCH 144/144] Apply GitHUB clang format --- .../dpcpp/parallel_backend_sycl_merge_sort.h | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 4017962906d..1f5b5a22add 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -367,8 +367,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name template inline static void - __serial_merge(const nd_range_params& __nd_range_params, const WorkDataArea& __data_area, - const DropViews& __views, _Rng& __rng, const _merge_split_point_t& __sp, _Compare __comp) + __serial_merge(const nd_range_params& __nd_range_params, const WorkDataArea& __data_area, const DropViews& __views, + _Rng& __rng, const _merge_split_point_t& __sp, _Compare __comp) { oneapi::dpl::__par_backend_hetero::__serial_merge( __views.rng1, __views.rng2, __rng /* rng3 */, __sp.first /* start1 */, __sp.second /* start2 */, @@ -400,8 +400,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const std::size_t __chunk = __nd_range_params.chunk * __nd_range_params.steps_between_two_base_diags; __cgh.parallel_for<_DiagonalsKernelName...>( - sycl::range(__nd_range_params.base_diag_count), - [=](sycl::item __item_id) { + sycl::range(__nd_range_params.base_diag_count), [=](sycl::item __item_id) { const std::size_t __linear_id = __item_id.get_linear_id(); auto __base_diagonals_sp_global_ptr = @@ -409,11 +408,10 @@ struct __merge_sort_global_submitter<_IndexT, 
__internal::__optional_kernel_name const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __chunk); - const auto __sp = + const auto __sp = __data_area.is_i_elem_local_inside_merge_matrix() - ? (__data_in_temp - ? __find_start_point(__data_area, DropViews(__dst, __data_area), __comp) - : __find_start_point(__data_area, DropViews(__rng, __data_area), __comp)) + ? (__data_in_temp ? __find_start_point(__data_area, DropViews(__dst, __data_area), __comp) + : __find_start_point(__data_area, DropViews(__rng, __data_area), __comp)) : _merge_split_point_t{__data_area.n1, __data_area.n2}; __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); @@ -457,8 +455,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (__linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags != 0) { // We are between two base diagonals (__sp_left, __sp_right) - const _merge_split_point_t __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const _merge_split_point_t __sp_right = __get_right_sp(__base_diagonals_sp_global_ptr, __diagonal_idx + 1, __data_area); + const _merge_split_point_t __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _merge_split_point_t __sp_right = + __get_right_sp(__base_diagonals_sp_global_ptr, __diagonal_idx + 1, __data_area); return oneapi::dpl::__par_backend_hetero::__find_start_point( __views.rng1, __sp_left.first, __sp_right.first, __views.rng2, __sp_left.second, __sp_right.second, @@ -596,7 +595,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base; // Max amount of base diagonals - const std::size_t __max_base_diags_count = get_max_base_diags_count(__exec, __nd_range_params.chunk, __n) + __1_final_base_diag; + const std::size_t __max_base_diags_count = + get_max_base_diags_count(__exec, __nd_range_params.chunk, __n) + __1_final_base_diag; for (std::int64_t __i = 0; 
__i < __n_iter; ++__i) { @@ -620,13 +620,16 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); } - nd_range_params __nd_range_params_this = eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __n_sorted); + nd_range_params __nd_range_params_this = + eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __n_sorted); // Check that each base diagonal started from beginning of merge matrix - assert(0 == (2 * __n_sorted) % (__nd_range_params_this.steps_between_two_base_diags * __nd_range_params_this.chunk)); + assert(0 == (2 * __n_sorted) % + (__nd_range_params_this.steps_between_two_base_diags * __nd_range_params_this.chunk)); const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); - __nd_range_params_this.base_diag_count = __nd_range_params_this.base_diag_count * __portions + __1_final_base_diag; + __nd_range_params_this.base_diag_count = + __nd_range_params_this.base_diag_count * __portions + __1_final_base_diag; __nd_range_params_this.steps *= __portions; assert(__nd_range_params_this.base_diag_count <= __max_base_diags_count);