From 7bbaf83bde5043524e0ffae85a74df87a22203a1 Mon Sep 17 00:00:00 2001
From: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com>
Date: Fri, 31 Jan 2025 14:11:21 -0500
Subject: [PATCH] Bugfix to decay Policy for `__result_and_scratch_storage` (#2031)

Signed-off-by: Dan Hoeflinger
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 47 ++++++++++---------
 .../dpcpp/parallel_backend_sycl_reduce.h      | 14 +++---
 .../dpcpp/parallel_backend_sycl_utils.h       | 18 ++++---
 3 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 36e5e165685..a1fcc1f8e15 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -615,8 +615,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
 
     // Although we do not actually need result storage in this case, we need to construct
     // a placeholder here to match the return type of the non-single-work-group implementation
-    using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _ValueType>;
-    __result_and_scratch_storage_t __dummy_result_and_scratch{__exec, 0, 0};
+    __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __dummy_result_and_scratch{__exec, 0, 0};
 
     if (__max_wg_size >= __targeted_wg_size)
     {
@@ -1093,7 +1092,7 @@ struct __write_to_id_if_else
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryOperation, typename _InitType,
           typename _BinaryOperation, typename _Inclusive>
 auto
-__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                           _Range1&& __in_rng, _Range2&& __out_rng, std::size_t __n, _UnaryOperation __unary_op,
                           _InitType __init, _BinaryOperation __binary_op, _Inclusive)
 {
@@ -1122,9 +1121,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backen
         std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 2048 : 16384;
         if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit))
         {
-            return __parallel_transform_scan_single_group(__backend_tag, __exec, std::forward<_Range1>(__in_rng),
-                                                          std::forward<_Range2>(__out_rng), __n, __unary_op, __init,
-                                                          __binary_op, _Inclusive{});
+            return __parallel_transform_scan_single_group(
+                __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+                std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
         }
     }
 #if _ONEDPL_COMPILE_KERNEL
@@ -1161,7 +1160,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backen
     _NoOpFunctor __get_data_op;
 
     return __parallel_transform_scan_base(
-        __backend_tag, __exec, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __init,
+        __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+        std::forward<_Range2>(__out_rng), __init,
         // local scan
         unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner,
                               _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op,
@@ -1283,7 +1283,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag
 
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _BinaryPredicate>
 auto
-__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                        _Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred)
 {
     using _Assign = oneapi::dpl::__internal::__pstl_assign;
@@ -1316,8 +1316,9 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t
                                                           decltype(__n)>;
 
     using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
 
-    return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result),
-                                __n, _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
+    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                std::forward<_Range2>(__result), __n,
+                                _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
                                 _CopyOp{_ReduceOp{}, _Assign{}});
 }
@@ -1357,7 +1358,7 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_
 
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryPredicate>
 auto
-__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                           _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred)
 {
     oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size();
@@ -1383,14 +1384,14 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backen
     using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
     using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;
 
-    return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result),
-                                __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
 }
 
 template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
           typename _Assign = oneapi::dpl::__internal::__pstl_assign>
 auto
-__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                    _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{})
 {
     using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>;
@@ -1440,8 +1441,9 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
 
     using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
 
-    return __parallel_scan_copy(__backend_tag, __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
-                                __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
+    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
+                                std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred},
+                                _CopyOp{_ReduceOp{}, __assign});
 }
 
 #if _ONEDPL_COMPILE_KERNEL
@@ -1534,7 +1536,7 @@ __parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
           typename _IsOpDifference>
 auto
-__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                   _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
                   _IsOpDifference __is_op_difference)
 {
@@ -1552,8 +1554,9 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, c
         }
     }
 #endif
-    return __parallel_set_scan(__backend_tag, __exec, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
-                               std::forward<_Range3>(__result), __comp, __is_op_difference);
+    return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
+                               std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
+                               __is_op_difference);
 }
 
 //------------------------------------------------------------------------
@@ -2467,8 +2470,8 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Range4,
           typename _BinaryPredicate, typename _BinaryOperator>
 oneapi::dpl::__internal::__difference_t<_Range3>
-__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, const _ExecutionPolicy& __exec,
-                             _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values,
+__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys,
+                             _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values,
                              _BinaryPredicate __binary_pred, _BinaryOperator __binary_op)
 {
     // The algorithm reduces values in __values where the
@@ -2506,7 +2509,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, cons
     }
 #endif
     return __parallel_reduce_by_segment_fallback(
-        oneapi::dpl::__internal::__device_backend_tag{}, __exec,
+        oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec),
         std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys),
         std::forward<_Range4>(__out_values), __binary_pred, __binary_op,
         oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>{});
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
index a6581b36d11..0d2d49ed6d1 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -186,13 +186,12 @@ template
                                                            __internal::__optional_kernel_name<_KernelName...>>
 {
 
-    template <typename _ExecutionPolicy, typename _ExecutionPolicy2, typename _Size, typename _ReduceOp,
-              typename _TransformOp, typename... _Ranges>
+    template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _TransformOp, typename... _Ranges>
     auto
     operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n,
                const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
                _TransformOp __transform_op,
-               const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container,
+               const __result_and_scratch_storage<_ExecutionPolicy, _Tp>& __scratch_container,
                _Ranges&&... __rngs) const
     {
         auto __transform_pattern =
@@ -215,7 +214,7 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
                 sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), sycl::range<1>(__work_group_size)),
                 [=](sycl::nd_item<1> __item_id) {
                     auto __temp_ptr =
-                        __result_and_scratch_storage<_ExecutionPolicy2, _Tp>::__get_usm_or_buffer_accessor_ptr(
+                        __result_and_scratch_storage<_ExecutionPolicy, _Tp>::__get_usm_or_buffer_accessor_ptr(
                             __temp_acc);
                     __device_reduce_kernel<_Tp>(__item_id, __n, __iters_per_work_item, __is_full, __n_groups,
                                                 __transform_pattern, __reduce_pattern, __temp_local, __temp_ptr,
@@ -235,12 +234,11 @@ template
                                                                __internal::__optional_kernel_name<_KernelName...>>
 {
 
-    template <typename _ExecutionPolicy, typename _ExecutionPolicy2, typename _Size, typename _ReduceOp,
-              typename _InitType>
+    template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _InitType>
     auto
     operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event,
                const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
-               _InitType __init, const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container) const
+               _InitType __init, const __result_and_scratch_storage<_ExecutionPolicy, _Tp>& __scratch_container) const
     {
         using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;
         auto __transform_pattern =
@@ -250,7 +248,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
 
         const bool __is_full = __n == __work_group_size * __iters_per_work_item;
 
-        using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy2, _Tp>;
+        using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _Tp>;
 
         __reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
             __cgh.depends_on(__reduce_event);
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 215075b7a79..c19538168e2 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -530,7 +530,7 @@ struct __result_and_scratch_storage_base
 };
 
 template <typename _ExecutionPolicy, typename _T>
-struct __result_and_scratch_storage : __result_and_scratch_storage_base
+struct __result_and_scratch_storage_impl : __result_and_scratch_storage_base
 {
   private:
     using __sycl_buffer_t = sycl::buffer<_T, 1>;
@@ -578,10 +578,10 @@
     }
 
   public:
-    __result_and_scratch_storage(const _ExecutionPolicy& __exec_, std::size_t __result_n, std::size_t __scratch_n)
+    __result_and_scratch_storage_impl(const _ExecutionPolicy& __exec_, std::size_t __result_n, std::size_t __scratch_n)
         : __exec{__exec_}, __result_n{__result_n}, __scratch_n{__scratch_n},
-          __use_USM_host{__use_USM_host_allocations(__exec.queue())}, __supports_USM_device{
-                                                                          __use_USM_allocations(__exec.queue())}
+          __use_USM_host{__use_USM_host_allocations(__exec.queue())},
+          __supports_USM_device{__use_USM_allocations(__exec.queue())}
     {
         const std::size_t __total_n = __scratch_n + __result_n;
         // Skip in case this is a dummy container
@@ -724,6 +724,9 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
     }
 };
 
+template <typename _ExecutionPolicy, typename _T>
+using __result_and_scratch_storage = __result_and_scratch_storage_impl<std::decay_t<_ExecutionPolicy>, _T>;
+
 // Tag __async_mode describe a pattern call mode which should be executed asynchronously
 struct __async_mode
 {
@@ -753,9 +756,12 @@ class __future : private std::tuple<_Args...>
         return __buf.get_host_access(sycl::read_only)[0];
     }
 
-    template <typename _ExecutionPolicy, typename _T>
+    // Here we use __result_and_scratch_storage_impl rather than __result_and_scratch_storage because we need to
+    // match the type with the overload and are deducing the policy type. If we used __result_and_scratch_storage,
+    // it would cause issues in type deduction due to decay of the policy in that using statement.
+    template <typename _DecayedExecutionPolicy, typename _T>
     constexpr auto
-    __wait_and_get_value(const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage)
+    __wait_and_get_value(const __result_and_scratch_storage_impl<_DecayedExecutionPolicy, _T>& __storage)
     {
         return __storage.__wait_and_get_value(__my_event);
     }
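
Note for reviewers: the heart of this patch is the new decaying alias in parallel_backend_sycl_utils.h. Because the `__parallel_*` entry points now take the policy as a forwarding reference (`_ExecutionPolicy&&`), the deduced policy type at a use such as `__result_and_scratch_storage<_ExecutionPolicy, _ValueType>` may be `policy&` or `const policy&`; without the `std::decay_t` in the alias, each value category would name a distinct instantiation that no longer matches helpers (such as `__future::__wait_and_get_value`) written against the plain policy type. The sketch below is a minimal, standalone illustration of that mechanism, not oneDPL code: `policy`, `storage_impl`, `storage`, `wait_and_get_value`, and `algorithm` are hypothetical stand-ins for a device policy, `__result_and_scratch_storage_impl`, the `__result_and_scratch_storage` alias, `__future::__wait_and_get_value`, and a `__parallel_*` pattern, respectively. It compiles as C++17.

// decay_alias_sketch.cpp -- build with: c++ -std=c++17 decay_alias_sketch.cpp
#include <iostream>
#include <type_traits>

struct policy // stand-in for a device execution policy
{
};

// Class template that owns result/scratch storage. Instantiated directly with a
// forwarded policy type, `policy`, `policy&`, and `const policy&` would name
// three distinct, incompatible instantiations.
template <typename Policy, typename T>
struct storage_impl
{
    T value{};
    T wait_and_get_value() const { return value; }
};

// The patch's fix: an alias that decays the policy, so every caller names the
// same instantiation regardless of how the policy was forwarded.
template <typename Policy, typename T>
using storage = storage_impl<std::decay_t<Policy>, T>;

// Deduction must run against the underlying class template. Declared against
// the alias instead, Policy would sit inside std::decay_t (a non-deduced
// context) and the call in algorithm() would fail to compile.
template <typename DecayedPolicy, typename T>
T wait_and_get_value(const storage_impl<DecayedPolicy, T>& s)
{
    return s.wait_and_get_value();
}

template <typename Policy>
int algorithm(Policy&&) // forwarding reference: Policy deduces as policy& for lvalues
{
    // Same storage_impl<policy, int> whether Policy is policy, policy&, or
    // const policy&.
    storage<Policy, int> st{7};
    return wait_and_get_value(st);
}

int main()
{
    policy p;
    const policy cp;
    // All three calls share one instantiation; without the decay in the alias,
    // the first two would create storage_impl<policy&, int> and
    // storage_impl<const policy&, int> instead.
    std::cout << algorithm(p) + algorithm(cp) + algorithm(policy{}) << '\n'; // prints 21
    return 0;
}

The sketch also shows why the comment added to `__future` is needed: template argument deduction cannot see through the alias, so the deduction-sensitive overload is declared against the underlying class template (`storage_impl` here, `__result_and_scratch_storage_impl` in the patch) with the policy already decayed, while every other call site can keep using the alias and forward the policy without decaying it manually.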