Bugfix to decay Policy for __result_and_scratch_storage #2031

Merged (7 commits) on Jan 31, 2025
Changes from 5 commits
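The gist of the fix, before the diff: these backend entry points now take the execution policy as a forwarding reference, and a forwarding reference deduces _ExecutionPolicy as an lvalue-reference type for lvalue arguments; instantiating __result_and_scratch_storage<_ExecutionPolicy, ...> with that reference type names a different (and unintended) specialization, hence the added std::decay_t. A minimal standalone sketch of the deduction behavior (hypothetical storage/my_policy names, not oneDPL code):

#include <type_traits>

template <typename Policy, typename T>
struct storage {}; // stand-in for __result_and_scratch_storage

struct my_policy {};

template <typename Policy>
auto make_storage(Policy&&) // forwarding reference, like _ExecutionPolicy&& __exec
{
    // For an lvalue argument, Policy deduces to my_policy&, so
    // storage<Policy, int> and storage<std::decay_t<Policy>, int> are
    // different types; decaying makes the instantiation independent of
    // the caller's value category.
    return storage<std::decay_t<Policy>, int>{};
}

int main()
{
    my_policy p;
    static_assert(std::is_same_v<decltype(make_storage(p)),
                                 decltype(make_storage(my_policy{}))>);
    return 0;
}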
81 changes: 43 additions & 38 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -315,7 +315,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
auto __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __size_per_wg);
// Storage for the results of scan for each workgroup

- using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _Type>;
+ using __result_and_scratch_storage_t = __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _Type>;
__result_and_scratch_storage_t __result_and_scratch{__exec, 1, __n_groups + 1};

_PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu);
@@ -539,7 +539,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
std::decay_t<decltype(__out_rng[0])>>::__type;

constexpr ::std::uint32_t __elems_per_wg = _ElemsPerItem * _WGSize;
- using __result_and_scratch_storage_t = __result_and_scratch_storage<_Policy, _Size>;
+ using __result_and_scratch_storage_t = __result_and_scratch_storage<std::decay_t<_Policy>, _Size>;
__result_and_scratch_storage_t __result{__policy, 1, 0};

auto __event = __policy.queue().submit([&](sycl::handler& __hdl) {
@@ -615,7 +615,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend

// Although we do not actually need result storage in this case, we need to construct
// a placeholder here to match the return type of the non-single-work-group implementation
- using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _ValueType>;
+ using __result_and_scratch_storage_t = __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _ValueType>;
__result_and_scratch_storage_t __dummy_result_and_scratch{__exec, 0, 0};

if (__max_wg_size >= __targeted_wg_size)
@@ -1093,7 +1093,7 @@ struct __write_to_id_if_else
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryOperation, typename _InitType,
typename _BinaryOperation, typename _Inclusive>
auto
- __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __in_rng, _Range2&& __out_rng, std::size_t __n, _UnaryOperation __unary_op,
_InitType __init, _BinaryOperation __binary_op, _Inclusive)
{
@@ -1122,9 +1122,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 2048 : 16384;
if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit))
{
- return __parallel_transform_scan_single_group(__backend_tag, __exec, std::forward<_Range1>(__in_rng),
- std::forward<_Range2>(__out_rng), __n, __unary_op, __init,
- __binary_op, _Inclusive{});
+ return __parallel_transform_scan_single_group(
+ __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+ std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
}
}
#if _ONEDPL_COMPILE_KERNEL
@@ -1137,10 +1137,10 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backen
_GenInput __gen_transform{__unary_op};
try
{
- return __parallel_transform_reduce_then_scan(__backend_tag, __exec, __in_rng, __out_rng,
- __gen_transform, __binary_op, __gen_transform,
- _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{},
- /*_IsUniquePattern=*/std::false_type{});
+ return __parallel_transform_reduce_then_scan(
+ __backend_tag, std::forward<_ExecutionPolicy>(__exec), __in_rng, __out_rng, __gen_transform,
+ __binary_op, __gen_transform, _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{},
+ /*_IsUniquePattern=*/std::false_type{});
[a review thread on this call was marked resolved by mmichel11]
}
catch (const sycl::exception& __e)
{
@@ -1161,7 +1161,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backen
_NoOpFunctor __get_data_op;

return __parallel_transform_scan_base(
- __backend_tag, __exec, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __init,
+ __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+ std::forward<_Range2>(__out_rng), __init,
// local scan
unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner,
_NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op,
@@ -1283,7 +1284,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _BinaryPredicate>
auto
- __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred)
{
using _Assign = oneapi::dpl::__internal::__pstl_assign;
@@ -1300,8 +1301,8 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t
using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>;
try
{
- return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred},
- _WriteOp{_Assign{}},
+ return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), __rng,
+ __result, __n, _GenMask{__pred}, _WriteOp{_Assign{}},
/*_IsUniquePattern=*/std::true_type{});
}
catch (const sycl::exception& __e)
@@ -1316,8 +1317,9 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t
decltype(__n)>;
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;

- return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result),
- __n, _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
+ return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+ std::forward<_Range2>(__result), __n,
+ _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
_CopyOp{_ReduceOp{}, _Assign{}});
}
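The hunks in this file keep touching one shape, shown in __parallel_unique_copy above and repeated in __parallel_partition_copy, __parallel_copy_if and __parallel_set_op below: forward the policy into the reduce-then-scan fast path inside a try block, and fall back to the classic scan-based implementation if the device rejects the kernel. A sketch of that shape; run_with_fallback and the error-code check are illustrative assumptions, not oneDPL's exact helpers (oneDPL routes this through __bypass_sycl_kernel_not_supported):

#include <sycl/sycl.hpp>
#include <utility>

template <typename Policy, typename FastPath, typename Fallback>
auto run_with_fallback(Policy&& policy, FastPath fast_path, Fallback fallback)
{
    try
    {
        // Pass the policy as an lvalue here so the fallback below can still
        // forward it if this path throws before doing any work.
        return fast_path(policy);
    }
    catch (const sycl::exception& e)
    {
        // Assumed check: swallow only "kernel not supported on this device".
        if (e.code() != sycl::errc::kernel_not_supported)
            throw; // unrelated failure: propagate
    }
    return fallback(std::forward<Policy>(policy));
}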

@@ -1357,7 +1359,7 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryPredicate>
auto
- __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred)
{
oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size();
@@ -1369,8 +1371,8 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backen
oneapi::dpl::__par_backend_hetero::__write_to_id_if_else<oneapi::dpl::__internal::__pstl_assign>;
try
{
- return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred},
- _WriteOp{},
+ return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), __rng,
+ __result, __n, _GenMask{__pred}, _WriteOp{},
/*_IsUniquePattern=*/std::false_type{});
[review thread: reviewer noted "The same"; author replied "fixed."]
}
catch (const sycl::exception& __e)
@@ -1383,14 +1385,14 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backen
using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;

- return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result),
- __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+ return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+ std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
}

template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
typename _Assign = oneapi::dpl::__internal::__pstl_assign>
auto
- __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{})
{
using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>;
@@ -1425,8 +1427,8 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>;
try
{
- return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __in_rng, __out_rng, __n, _GenMask{__pred},
- _WriteOp{__assign},
+ return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), __in_rng,
+ __out_rng, __n, _GenMask{__pred}, _WriteOp{__assign},
/*_IsUniquePattern=*/std::false_type{});
}
catch (const sycl::exception& __e)
Expand All @@ -1440,8 +1442,9 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
/*inclusive*/ std::true_type, 1>;

- return __parallel_scan_copy(__backend_tag, __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
- __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
+ return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
+ std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred},
+ _CopyOp{_ReduceOp{}, __assign});
}

#if _ONEDPL_COMPILE_KERNEL
@@ -1534,7 +1537,7 @@ __parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag,
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
typename _IsOpDifference>
auto
- __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
_IsOpDifference __is_op_difference)
{
@@ -1543,17 +1546,18 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, c
{
try
{
- return __parallel_set_reduce_then_scan(__backend_tag, __exec, __rng1, __rng2, __result, __comp,
- __is_op_difference);
+ return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), __rng1,
+ __rng2, __result, __comp, __is_op_difference);
[review thread: reviewer noted "The same"; author replied "fixed."]
}
catch (const sycl::exception& __e)
{
__bypass_sycl_kernel_not_supported(__e);
}
}
#endif
- return __parallel_set_scan(__backend_tag, __exec, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
- std::forward<_Range3>(__result), __comp, __is_op_difference);
+ return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
+ std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
+ __is_op_difference);
}

//------------------------------------------------------------------------
Expand Down Expand Up @@ -1822,7 +1826,8 @@ struct __parallel_find_or_impl_one_wg<__or_tag_check, __internal::__optional_ker
const std::size_t __rng_n, const std::size_t __wgroup_size, const __FoundStateType __init_value,
_Predicate __pred, _Ranges&&... __rngs)
{
- using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, __FoundStateType>;
+ using __result_and_scratch_storage_t =
+ __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, __FoundStateType>;
__result_and_scratch_storage_t __result_storage{__exec, 1, 0};

// Calculate the number of elements to be processed by each work-item.
@@ -2467,8 +2472,8 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Range4,
typename _BinaryPredicate, typename _BinaryOperator>
oneapi::dpl::__internal::__difference_t<_Range3>
- __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, const _ExecutionPolicy& __exec,
- _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values,
+ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys,
+ _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values,
_BinaryPredicate __binary_pred, _BinaryOperator __binary_op)
{
// The algorithm reduces values in __values where the
Expand All @@ -2492,8 +2497,8 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, cons
try
{
auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
- oneapi::dpl::__internal::__device_backend_tag{}, __exec, __keys, __values, __out_keys, __out_values,
- __binary_pred, __binary_op);
+ oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __keys,
+ __values, __out_keys, __out_values, __binary_pred, __binary_op);
[review thread: reviewer noted "The same"; author replied "fixed."]
// Because our init type ends up being tuple<std::size_t, ValType>, return the first component which is the write index. Add 1 to return the
// past-the-end iterator pair of segmented reduction.
return std::get<0>(__res.get()) + 1;
Expand All @@ -2506,7 +2511,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, cons
}
#endif
return __parallel_reduce_by_segment_fallback(
- oneapi::dpl::__internal::__device_backend_tag{}, __exec,
+ oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec),
std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys),
std::forward<_Range4>(__out_values), __binary_pred, __binary_op,
oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>{});
[second changed file]
@@ -371,7 +371,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,

// Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group)
auto __p_base_diagonals_sp_global_storage =
- new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>(
+ new __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _split_point_t<_IdType>>(
__exec, 0, __nd_range_params.base_diag_count + 1);

// Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage.
[third changed file]
@@ -581,7 +581,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
// Calculate nd-range params
const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n, __n_sorted);

- using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _merge_split_point_t>;
+ using __base_diagonals_sp_storage_t =
+ __result_and_scratch_storage<std::decay_t<_ExecutionPolicy>, _merge_split_point_t>;

const std::size_t __n_power2 = oneapi::dpl::__internal::__dpl_bit_ceil(__n);
// ctz precisely calculates log2 of an integral value which is a power of 2, while
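A side note on the last hunk's comment: counting trailing zeros of a power of two is an exact log2, with no floating-point rounding involved. A standalone sketch using the C++20 standard equivalents, assuming __dpl_bit_ceil behaves like std::bit_ceil and the ctz helper like std::countr_zero:

#include <bit>
#include <cstddef>

// For n rounded up to a power of two, the trailing-zero count is exactly
// log2 of that power of two.
constexpr std::size_t ceil_log2(std::size_t n)
{
    return static_cast<std::size_t>(std::countr_zero(std::bit_ceil(n)));
}

static_assert(ceil_log2(1) == 0);
static_assert(ceil_log2(5) == 3); // bit_ceil(5) == 8, countr_zero(8) == 3
static_assert(ceil_log2(8) == 3);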