Bugfix to decay Policy for __result_and_scratch_storage (#2031)
Signed-off-by: Dan Hoeflinger <[email protected]>
danhoeflinger authored Jan 31, 2025
1 parent 4cbf18c commit 7bbaf83
Showing 3 changed files with 43 additions and 36 deletions.
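For context, here is a minimal C++17 sketch of the type mismatch this commit fixes. All names below (policy, storage_impl, storage, pattern) are hypothetical stand-ins, not oneDPL code. When a pattern takes its policy by forwarding reference, the deduced _ExecutionPolicy can be an lvalue reference, so an undecayed __result_and_scratch_storage<_ExecutionPolicy, _T> would name different types at different call sites; decaying the policy inside a using alias restores a single storage type:

#include <type_traits>

// Hypothetical stand-ins; only the type mechanics are kept.
struct policy
{
};

template <typename Policy, typename T>
struct storage_impl
{
};

// The fix: decay the policy in a using alias, so policy, policy&, and
// const policy& all name the same storage type.
template <typename Policy, typename T>
using storage = storage_impl<std::decay_t<Policy>, T>;

template <typename Policy>
void
pattern(Policy&& exec)
{
    // With a forwarding reference, Policy deduces as policy& for lvalue
    // arguments. Without the decay in the alias, storage<Policy, int> here and
    // storage<policy, int> elsewhere would be two distinct types.
    static_assert(std::is_same_v<storage<Policy, int>, storage_impl<policy, int>>);
}

int
main()
{
    policy p;
    pattern(p);        // Policy = policy&
    pattern(policy{}); // Policy = policy
}

The commit applies exactly this shape of fix: __result_and_scratch_storage becomes a decaying alias over the renamed __result_and_scratch_storage_impl (see the third file below).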
47 changes: 25 additions & 22 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -615,8 +615,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
 
     // Although we do not actually need result storage in this case, we need to construct
     // a placeholder here to match the return type of the non-single-work-group implementation
-    using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _ValueType>;
-    __result_and_scratch_storage_t __dummy_result_and_scratch{__exec, 0, 0};
+    __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __dummy_result_and_scratch{__exec, 0, 0};
 
     if (__max_wg_size >= __targeted_wg_size)
     {
@@ -1093,7 +1092,7 @@ struct __write_to_id_if_else
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryOperation, typename _InitType,
           typename _BinaryOperation, typename _Inclusive>
 auto
-__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                           _Range1&& __in_rng, _Range2&& __out_rng, std::size_t __n, _UnaryOperation __unary_op,
                           _InitType __init, _BinaryOperation __binary_op, _Inclusive)
 {
@@ -1122,9 +1121,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
         std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 2048 : 16384;
         if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit))
         {
-            return __parallel_transform_scan_single_group(__backend_tag, __exec, std::forward<_Range1>(__in_rng),
-                                                          std::forward<_Range2>(__out_rng), __n, __unary_op, __init,
-                                                          __binary_op, _Inclusive{});
+            return __parallel_transform_scan_single_group(
+                __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+                std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
         }
     }
 #if _ONEDPL_COMPILE_KERNEL
@@ -1161,7 +1160,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
     _NoOpFunctor __get_data_op;
 
     return __parallel_transform_scan_base(
-        __backend_tag, __exec, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __init,
+        __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+        std::forward<_Range2>(__out_rng), __init,
         // local scan
         unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner,
                               _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op,
@@ -1283,7 +1283,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag
 
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _BinaryPredicate>
 auto
-__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                        _Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred)
 {
     using _Assign = oneapi::dpl::__internal::__pstl_assign;
@@ -1316,8 +1316,9 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t
                                                          decltype(__n)>;
     using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
 
-    return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result),
-                                __n, _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
+    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                std::forward<_Range2>(__result), __n,
+                                _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
                                 _CopyOp{_ReduceOp{}, _Assign{}});
 }
 
@@ -1357,7 +1358,7 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_
 
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryPredicate>
 auto
-__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                           _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred)
 {
     oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size();
Expand All @@ -1383,14 +1384,14 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen
using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;

return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result),
__n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
}

template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
typename _Assign = oneapi::dpl::__internal::__pstl_assign>
auto
__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{})
{
using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>;
@@ -1440,8 +1441,9 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
     using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
                                                   /*inclusive*/ std::true_type, 1>;
 
-    return __parallel_scan_copy(__backend_tag, __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
-                                __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
+    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
+                                std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred},
+                                _CopyOp{_ReduceOp{}, __assign});
 }
 
 #if _ONEDPL_COMPILE_KERNEL
@@ -1534,7 +1536,7 @@ __parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
           typename _IsOpDifference>
 auto
-__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                   _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
                   _IsOpDifference __is_op_difference)
 {
Expand All @@ -1552,8 +1554,9 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, c
}
}
#endif
return __parallel_set_scan(__backend_tag, __exec, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
std::forward<_Range3>(__result), __comp, __is_op_difference);
return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
__is_op_difference);
}

//------------------------------------------------------------------------
@@ -2467,8 +2470,8 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Range4,
           typename _BinaryPredicate, typename _BinaryOperator>
 oneapi::dpl::__internal::__difference_t<_Range3>
-__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, const _ExecutionPolicy& __exec,
-                             _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values,
+__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys,
+                             _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values,
                              _BinaryPredicate __binary_pred, _BinaryOperator __binary_op)
 {
     // The algorithm reduces values in __values where the
@@ -2506,7 +2509,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, cons
     }
 #endif
     return __parallel_reduce_by_segment_fallback(
-        oneapi::dpl::__internal::__device_backend_tag{}, __exec,
+        oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec),
         std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys),
         std::forward<_Range4>(__out_values), __binary_pred, __binary_op,
         oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>{});
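The signature changes in this file (const _ExecutionPolicy& to _ExecutionPolicy&&, with std::forward at every call site) interact with the storage type: as the comment in the first hunk notes, the single-work-group branch constructs a dummy storage that must match the return type of the general implementation. A minimal sketch of that constraint, reusing the hypothetical names from above:

#include <type_traits>
#include <utility>

struct policy
{
};

template <typename Policy, typename T>
struct storage_impl
{
};

template <typename Policy, typename T>
using storage = storage_impl<std::decay_t<Policy>, T>;

// The single-group branch returns a dummy storage built from the caller's
// deduced policy type...
template <typename Policy>
storage<Policy, int>
single_group(Policy&&)
{
    return {};
}

// ...while the general branch creates its storage internally, already decayed.
template <typename Policy>
storage_impl<policy, int>
multi_group(Policy&&)
{
    return {};
}

template <typename Policy>
auto
scan(Policy&& exec, bool small_input)
{
    // 'auto' requires both returns to have exactly the same type; the decayed
    // alias makes storage<Policy, int> equal to storage_impl<policy, int> even
    // when Policy deduces as policy&.
    if (small_input)
        return single_group(std::forward<Policy>(exec));
    return multi_group(std::forward<Policy>(exec));
}

int
main()
{
    policy p;
    scan(p, true);
    scan(policy{}, false);
}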
14 changes: 6 additions & 8 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -186,13 +186,12 @@ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename..
 struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _VecSize,
                                                            __internal::__optional_kernel_name<_KernelName...>>
 {
-    template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _TransformOp,
-              typename _ExecutionPolicy2, typename... _Ranges>
+    template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _TransformOp, typename... _Ranges>
    auto
    operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n,
               const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
               _TransformOp __transform_op,
-               const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container,
+               const __result_and_scratch_storage<_ExecutionPolicy, _Tp>& __scratch_container,
               _Ranges&&... __rngs) const
    {
        auto __transform_pattern =
Expand All @@ -215,7 +214,7 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), sycl::range<1>(__work_group_size)),
[=](sycl::nd_item<1> __item_id) {
auto __temp_ptr =
__result_and_scratch_storage<_ExecutionPolicy2, _Tp>::__get_usm_or_buffer_accessor_ptr(
__result_and_scratch_storage<_ExecutionPolicy, _Tp>::__get_usm_or_buffer_accessor_ptr(
__temp_acc);
__device_reduce_kernel<_Tp>(__item_id, __n, __iters_per_work_item, __is_full, __n_groups,
__transform_pattern, __reduce_pattern, __temp_local, __temp_ptr,
Expand All @@ -235,12 +234,11 @@ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename..
struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative, _VecSize,
__internal::__optional_kernel_name<_KernelName...>>
{
template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _InitType,
typename _ExecutionPolicy2>
template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _InitType>
auto
operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event,
const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
_InitType __init, const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container) const
_InitType __init, const __result_and_scratch_storage<_ExecutionPolicy, _Tp>& __scratch_container) const
{
using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;
auto __transform_pattern =
Expand All @@ -250,7 +248,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative

const bool __is_full = __n == __work_group_size * __iters_per_work_item;

using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy2, _Tp>;
using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _Tp>;

__reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
__cgh.depends_on(__reduce_event);
Expand Down
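These hunks delete the extra _ExecutionPolicy2 template parameter from both reduce submitters: once the policy is decayed inside the storage alias, the storage deduced from the caller's policy and the storage named inside the submitter are the same type. A sketch of why one deduced parameter now suffices, again with hypothetical names:

#include <type_traits>

struct policy
{
};

template <typename Policy, typename T>
struct storage_impl
{
};

template <typename Policy, typename T>
using storage = storage_impl<std::decay_t<Policy>, T>;

// Before the fix a submitter needed a second parameter, because the storage
// could have been instantiated with a differently-qualified policy:
//
//     template <typename Policy, typename Policy2>
//     void submit(Policy&& exec, const storage_impl<Policy2, int>& sc);
//
// With the decayed alias, storage<Policy, int> is the same type for Policy,
// Policy&, and const Policy&, so the single deduced parameter is enough:
template <typename Policy>
void
submit(Policy&&, const storage<Policy, int>&)
{
}

int
main()
{
    policy p;
    storage<policy, int> sc;
    submit(p, sc); // Policy = policy&; storage<policy&, int> is the same type as sc
}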
18 changes: 12 additions & 6 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -530,7 +530,7 @@ struct __result_and_scratch_storage_base
 };
 
 template <typename _ExecutionPolicy, typename _T>
-struct __result_and_scratch_storage : __result_and_scratch_storage_base
+struct __result_and_scratch_storage_impl : __result_and_scratch_storage_base
 {
   private:
     using __sycl_buffer_t = sycl::buffer<_T, 1>;
@@ -578,10 +578,10 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
     }
 
   public:
-    __result_and_scratch_storage(const _ExecutionPolicy& __exec_, std::size_t __result_n, std::size_t __scratch_n)
+    __result_and_scratch_storage_impl(const _ExecutionPolicy& __exec_, std::size_t __result_n, std::size_t __scratch_n)
         : __exec{__exec_}, __result_n{__result_n}, __scratch_n{__scratch_n},
-          __use_USM_host{__use_USM_host_allocations(__exec.queue())}, __supports_USM_device{
-                                                                          __use_USM_allocations(__exec.queue())}
+          __use_USM_host{__use_USM_host_allocations(__exec.queue())},
+          __supports_USM_device{__use_USM_allocations(__exec.queue())}
     {
         const std::size_t __total_n = __scratch_n + __result_n;
         // Skip in case this is a dummy container
@@ -724,6 +724,9 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
     }
 };
 
+template <typename _ExecutionPolicy, typename _T>
+using __result_and_scratch_storage = __result_and_scratch_storage_impl<std::decay_t<_ExecutionPolicy>, _T>;
+
 // Tag __async_mode describe a pattern call mode which should be executed asynchronously
 struct __async_mode
 {
@@ -753,9 +756,12 @@ class __future : private std::tuple<_Args...>
         return __buf.get_host_access(sycl::read_only)[0];
     }
 
-    template <typename _ExecutionPolicy, typename _T>
+    // Here we use __result_and_scratch_storage_impl rather than __result_and_scratch_storage because we need to
+    // match the type with the overload and are deducing the policy type. If we used __result_and_scratch_storage,
+    // it would cause issues in type deduction due to decay of the policy in that using statement.
+    template <typename _DecayedExecutionPolicy, typename _T>
     constexpr auto
-    __wait_and_get_value(const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage)
+    __wait_and_get_value(const __result_and_scratch_storage_impl<_DecayedExecutionPolicy, _T>& __storage)
     {
         return __storage.__wait_and_get_value(__my_event);
     }
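The new comment on __wait_and_get_value reflects a general deduction rule: a template parameter that appears only inside std::decay_t in an alias template is in a non-deduced context, so an overload that must deduce the policy has to be written against the underlying _impl type. A small sketch with hypothetical names (storage_impl, storage, get_value):

#include <type_traits>

template <typename Policy, typename T>
struct storage_impl
{
};

template <typename Policy, typename T>
using storage = storage_impl<std::decay_t<Policy>, T>;

// Deducing through the alias would fail: Policy appears only inside
// std::decay_t, which is a non-deduced context.
//
//     template <typename Policy, typename T>
//     void get_value(const storage<Policy, T>&); // Policy cannot be deduced
//
// Deducing against the underlying _impl type works, and the policy that
// arrives is already decayed:
template <typename DecayedPolicy, typename T>
void
get_value(const storage_impl<DecayedPolicy, T>&)
{
}

int
main()
{
    storage<int&, double> s; // the same type as storage_impl<int, double>
    get_value(s);            // DecayedPolicy = int, T = double
}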
