From 7bbaf83bde5043524e0ffae85a74df87a22203a1 Mon Sep 17 00:00:00 2001
From: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com>
Date: Fri, 31 Jan 2025 14:11:21 -0500
Subject: [PATCH] Bugfix to decay Policy for `__result_and_scratch_storage` (#2031)

Signed-off-by: Dan Hoeflinger
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 47 ++++++++++---------
 .../dpcpp/parallel_backend_sycl_reduce.h      | 14 +++---
 .../dpcpp/parallel_backend_sycl_utils.h       | 18 ++++---
 3 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 36e5e165685..a1fcc1f8e15 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -615,8 +615,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
 
     // Although we do not actually need result storage in this case, we need to construct
     // a placeholder here to match the return type of the non-single-work-group implementation
-    using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _ValueType>;
-    __result_and_scratch_storage_t __dummy_result_and_scratch{__exec, 0, 0};
+    __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __dummy_result_and_scratch{__exec, 0, 0};
 
     if (__max_wg_size >= __targeted_wg_size)
     {
@@ -1093,7 +1092,7 @@ struct __write_to_id_if_else
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryOperation, typename _InitType,
           typename _BinaryOperation, typename _Inclusive>
 auto
-__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                           _Range1&& __in_rng, _Range2&& __out_rng, std::size_t __n, _UnaryOperation __unary_op,
                           _InitType __init, _BinaryOperation __binary_op, _Inclusive)
 {
@@ -1122,9 +1121,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backen
         std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 2048 : 16384;
         if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit))
         {
-            return __parallel_transform_scan_single_group(__backend_tag, __exec, std::forward<_Range1>(__in_rng),
-                                                          std::forward<_Range2>(__out_rng), __n, __unary_op, __init,
-                                                          __binary_op, _Inclusive{});
+            return __parallel_transform_scan_single_group(
+                __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+                std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
         }
     }
 #if _ONEDPL_COMPILE_KERNEL
@@ -1161,7 +1160,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backen
     _NoOpFunctor __get_data_op;
 
     return __parallel_transform_scan_base(
-        __backend_tag, __exec, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __init,
+        __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+        std::forward<_Range2>(__out_rng), __init,
         // local scan
         unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner,
                               _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op,
@@ -1283,7 +1283,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag
 
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _BinaryPredicate>
 auto
-__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                        _Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred)
 {
     using _Assign = oneapi::dpl::__internal::__pstl_assign;
@@ -1316,8 +1316,9 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t
                                                           decltype(__n)>;
 
     using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
 
-    return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result),
-                                __n, _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
+    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                std::forward<_Range2>(__result), __n,
+                                _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
                                 _CopyOp{_ReduceOp{}, _Assign{}});
 }
@@ -1357,7 +1358,7 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_
 
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryPredicate>
 auto
-__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                           _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred)
 {
     oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size();
@@ -1383,14 +1384,14 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backen
     using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
     using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;
 
-    return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result),
-                                __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
 }
 
 template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
           typename _Assign = oneapi::dpl::__internal::__pstl_assign>
 auto
-__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                    _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{})
 {
     using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>;
@@ -1440,8 +1441,9 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
 
     using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
 
-    return __parallel_scan_copy(__backend_tag, __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
-                                __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
+    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
+                                std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred},
+                                _CopyOp{_ReduceOp{}, __assign});
 }
 
 #if _ONEDPL_COMPILE_KERNEL
@@ -1534,7 +1536,7 @@ __parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
           typename _IsOpDifference>
 auto
-__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                   _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
                   _IsOpDifference __is_op_difference)
 {
@@ -1552,8 +1554,9 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, c
         }
     }
 #endif
-    return __parallel_set_scan(__backend_tag, __exec, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
-                               std::forward<_Range3>(__result), __comp, __is_op_difference);
+    return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
+                               std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
+                               __is_op_difference);
 }
 
 //------------------------------------------------------------------------
@@ -2467,8 +2470,8 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Range4,
           typename _BinaryPredicate, typename _BinaryOperator>
 oneapi::dpl::__internal::__difference_t<_Range3>
-__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, const _ExecutionPolicy& __exec,
-                             _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values,
+__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys,
+                             _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values,
                              _BinaryPredicate __binary_pred, _BinaryOperator __binary_op)
 {
     // The algorithm reduces values in __values where the
@@ -2506,7 +2509,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, cons
     }
 #endif
     return __parallel_reduce_by_segment_fallback(
-        oneapi::dpl::__internal::__device_backend_tag{}, __exec,
+        oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec),
         std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys),
         std::forward<_Range4>(__out_values), __binary_pred, __binary_op,
         oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>{});
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
index a6581b36d11..0d2d49ed6d1 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -186,13 +186,12 @@ template
                                                            __internal::__optional_kernel_name<_KernelName...>>
 {
 
-    template <typename _ExecutionPolicy, typename _ExecutionPolicy2, typename _Size, typename _ReduceOp,
-              typename _TransformOp, typename... _Ranges>
+    template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _TransformOp, typename... _Ranges>
     auto
     operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n,
                const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
                _TransformOp __transform_op,
-               const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container,
+               const __result_and_scratch_storage<_ExecutionPolicy, _Tp>& __scratch_container,
                _Ranges&&... __rngs) const
     {
         auto __transform_pattern =
@@ -215,7 +214,7 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
                 sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), sycl::range<1>(__work_group_size)),
                 [=](sycl::nd_item<1> __item_id) {
                     auto __temp_ptr =
-                        __result_and_scratch_storage<_ExecutionPolicy2, _Tp>::__get_usm_or_buffer_accessor_ptr(
+                        __result_and_scratch_storage<_ExecutionPolicy, _Tp>::__get_usm_or_buffer_accessor_ptr(
                             __temp_acc);
                     __device_reduce_kernel<_Tp>(__item_id, __n, __iters_per_work_item, __is_full, __n_groups,
                                                 __transform_pattern, __reduce_pattern, __temp_local, __temp_ptr,
@@ -235,12 +234,11 @@ template
                                                                __internal::__optional_kernel_name<_KernelName...>>
 {
 
-    template <typename _ExecutionPolicy, typename _ExecutionPolicy2, typename _Size, typename _ReduceOp,
-              typename _InitType>
+    template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _InitType>
     auto
     operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event,
                const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
-               _InitType __init, const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container) const
+               _InitType __init, const __result_and_scratch_storage<_ExecutionPolicy, _Tp>& __scratch_container) const
     {
         using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;
         auto __transform_pattern =
@@ -250,7 +248,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
 
         const bool __is_full = __n == __work_group_size * __iters_per_work_item;
 
-        using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy2, _Tp>;
+        using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _Tp>;
 
         __reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
             __cgh.depends_on(__reduce_event);
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 215075b7a79..c19538168e2 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -530,7 +530,7 @@ struct __result_and_scratch_storage_base
 };
 
 template <typename _ExecutionPolicy, typename _T>
-struct __result_and_scratch_storage : __result_and_scratch_storage_base
+struct __result_and_scratch_storage_impl : __result_and_scratch_storage_base
 {
   private:
     using __sycl_buffer_t = sycl::buffer<_T, 1>;
@@ -578,10 +578,10 @@
     }
 
   public:
-    __result_and_scratch_storage(const _ExecutionPolicy& __exec_, std::size_t __result_n, std::size_t __scratch_n)
+    __result_and_scratch_storage_impl(const _ExecutionPolicy& __exec_, std::size_t __result_n, std::size_t __scratch_n)
         : __exec{__exec_}, __result_n{__result_n}, __scratch_n{__scratch_n},
-          __use_USM_host{__use_USM_host_allocations(__exec.queue())}, __supports_USM_device{
-                                                                          __use_USM_allocations(__exec.queue())}
+          __use_USM_host{__use_USM_host_allocations(__exec.queue())},
+          __supports_USM_device{__use_USM_allocations(__exec.queue())}
     {
         const std::size_t __total_n = __scratch_n + __result_n;
         // Skip in case this is a dummy container
@@ -724,6 +724,9 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
     }
 };
 
+template <typename _ExecutionPolicy, typename _T>
+using __result_and_scratch_storage = __result_and_scratch_storage_impl<std::decay_t<_ExecutionPolicy>, _T>;
+
 // Tag __async_mode describe a pattern call mode which should be executed asynchronously
 struct __async_mode
 {
@@ -753,9 +756,12 @@ class __future : private std::tuple<_Args...>
         return __buf.get_host_access(sycl::read_only)[0];
     }
 
-    template <typename _ExecutionPolicy, typename _T>
+    // Here we use __result_and_scratch_storage_impl rather than __result_and_scratch_storage because we need to
+    // match the type with the overload and are deducing the policy type. If we used __result_and_scratch_storage,
+    // it would cause issues in type deduction due to decay of the policy in that using statement.
+    template <typename _DecayedExecutionPolicy, typename _T>
     constexpr auto
-    __wait_and_get_value(const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage)
+    __wait_and_get_value(const __result_and_scratch_storage_impl<_DecayedExecutionPolicy, _T>& __storage)
     {
         return __storage.__wait_and_get_value(__my_event);
     }
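
Note for reviewers: the heart of this patch is the new decaying alias in parallel_backend_sycl_utils.h. Because the `__parallel_*` entry points now take the policy as a forwarding reference (`_ExecutionPolicy&&`), the deduced policy type at a use such as `__result_and_scratch_storage<_ExecutionPolicy, _ValueType>` may be `policy&` or `const policy&`; without the `std::decay_t` in the alias, each value category would name a distinct instantiation that no longer matches helpers (such as `__future::__wait_and_get_value`) written against the plain policy type. The sketch below is a minimal, standalone illustration of that mechanism, not oneDPL code: `policy`, `storage_impl`, `storage`, `wait_and_get_value`, and `algorithm` are hypothetical stand-ins for a device policy, `__result_and_scratch_storage_impl`, the `__result_and_scratch_storage` alias, `__future::__wait_and_get_value`, and a `__parallel_*` pattern, respectively. It compiles as C++17.

// decay_alias_sketch.cpp -- build with: c++ -std=c++17 decay_alias_sketch.cpp
#include <iostream>
#include <type_traits>

struct policy // stand-in for a device execution policy
{
};

// Class template that owns result/scratch storage. Instantiated directly with a
// forwarded policy type, `policy`, `policy&`, and `const policy&` would name
// three distinct, incompatible instantiations.
template <typename Policy, typename T>
struct storage_impl
{
    T value{};
    T wait_and_get_value() const { return value; }
};

// The patch's fix: an alias that decays the policy, so every caller names the
// same instantiation regardless of how the policy was forwarded.
template <typename Policy, typename T>
using storage = storage_impl<std::decay_t<Policy>, T>;

// Deduction must run against the underlying class template. Declared against
// the alias instead, Policy would sit inside std::decay_t (a non-deduced
// context) and the call in algorithm() would fail to compile.
template <typename DecayedPolicy, typename T>
T wait_and_get_value(const storage_impl<DecayedPolicy, T>& s)
{
    return s.wait_and_get_value();
}

template <typename Policy>
int algorithm(Policy&&) // forwarding reference: Policy deduces as policy& for lvalues
{
    // Same storage_impl<policy, int> whether Policy is policy, policy&, or
    // const policy&.
    storage<Policy, int> st{7};
    return wait_and_get_value(st);
}

int main()
{
    policy p;
    const policy cp;
    // All three calls share one instantiation; without the decay in the alias,
    // the first two would create storage_impl<policy&, int> and
    // storage_impl<const policy&, int> instead.
    std::cout << algorithm(p) + algorithm(cp) + algorithm(policy{}) << '\n'; // prints 21
    return 0;
}

The sketch also shows why the comment added to `__future` is needed: template argument deduction cannot see through the alias, so the deduction-sensitive overload is declared against the underlying class template (`storage_impl` here, `__result_and_scratch_storage_impl` in the patch) with the policy already decayed, while every other call site can keep using the alias and forward the policy without decaying it manually.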