diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h
index 45124417ade..bf8875aa174 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h
@@ -130,7 +130,8 @@ class __histo_kernel_private_glocal_atomics;
 template <typename _HistAccessor, typename _OffsetT, typename _Size>
 void
 __clear_wglocal_histograms(const _HistAccessor& __local_histogram, const _OffsetT& __offset, _Size __num_bins,
-                           const sycl::nd_item<1>& __self_item)
+                           const sycl::nd_item<1>& __self_item,
+                           __dpl_sycl::__fence_space_t __fence_space = __dpl_sycl::__fence_space_local)
 {
     using _BinUint_t = ::std::conditional_t<(sizeof(_Size) >= sizeof(::std::uint32_t)), ::std::uint64_t, ::std::uint32_t>;
@@ -148,7 +149,7 @@ __clear_wglocal_histograms(const _HistAccessor& __local_histogram, const _Offset
     {
         __local_histogram[__offset + __gSize * __k + __self_lidx] = 0;
     }
-    __dpl_sycl::__group_barrier(__self_item);
+    __dpl_sycl::__group_barrier(__self_item, __fence_space);
 }
 
 template
@@ -444,7 +445,8 @@ struct __histogram_general_private_global_atomics_submitter<__internal::__option
                 const ::std::size_t __wgroup_idx = __self_item.get_group(0);
                 const ::std::size_t __seg_start = __work_group_size * __iters_per_work_item * __wgroup_idx;
 
-                __clear_wglocal_histograms(__hacc_private, __wgroup_idx * __num_bins, __num_bins, __self_item);
+                __clear_wglocal_histograms(__hacc_private, __wgroup_idx * __num_bins, __num_bins, __self_item,
+                                           __dpl_sycl::__fence_space_global);
                 if (__seg_start + __work_group_size * __iters_per_work_item < __n)
                 {
@@ -469,7 +471,7 @@
                     }
                 }
 
-                __dpl_sycl::__group_barrier(__self_item);
+                __dpl_sycl::__group_barrier(__self_item, __dpl_sycl::__fence_space_global);
 
                 __reduce_out_histograms<_bin_type, ::std::uint32_t>(__hacc_private, __wgroup_idx * __num_bins, __bins,
                                                                     __num_bins, __self_item);
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h
index 6dd3b193a08..399fa067983 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h
@@ -80,6 +80,12 @@ struct __subgroup_radix_sort
         {
             return __dpl_sycl::__local_accessor<_KeyT>(__buf_size, __cgh);
         }
+
+        inline static constexpr auto
+        get_fence()
+        {
+            return __dpl_sycl::__fence_space_local;
+        }
     };
 
     template
@@ -94,6 +100,12 @@ struct __subgroup_radix_sort
         {
             return sycl::accessor(__buf, __cgh, sycl::read_write, __dpl_sycl::__no_init{});
         }
+
+        inline constexpr static auto
+        get_fence()
+        {
+            return __dpl_sycl::__fence_space_global;
+        }
     };
 
     template
@@ -175,8 +187,9 @@ struct __subgroup_radix_sort
 
                     //copy(move) values construction
                     __block_load<_ValT>(__wi, __src, __values.__v, __n);
+                    // TODO: check if the barrier can be removed
+                    __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
 
-                    __dpl_sycl::__group_barrier(__it);
                     while (true)
                     {
                         uint16_t __indices[__block_size]; //indices for indirect access in the "re-order" phase
@@ -205,7 +218,7 @@ struct __subgroup_radix_sort
                                 __indices[__i] = *__counters[__i];
                                 *__counters[__i] = __indices[__i] + 1;
                             }
-                        __dpl_sycl::__group_barrier(__it);
+                        __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence());
 
                         //2. scan phase
                         {
@@ -218,8 +231,8 @@ struct __subgroup_radix_sort
                             _ONEDPL_PRAGMA_UNROLL
                             for (uint16_t __i = 1; __i < __bin_count; ++__i)
                                 __bin_sum[__i] = __bin_sum[__i - 1] + __counter_lacc[__wi * __bin_count + __i];
+                            __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence());
 
-                            __dpl_sycl::__group_barrier(__it);
                             //exclusive scan local sum
                             uint16_t __sum_scan = __dpl_sycl::__exclusive_scan_over_group(
                                 __it.get_group(), __bin_sum[__bin_count - 1], __dpl_sycl::__plus<uint16_t>());
@@ -230,7 +243,7 @@ struct __subgroup_radix_sort
 
                             if (__wi == 0)
                                 __counter_lacc[0] = 0;
-                            __dpl_sycl::__group_barrier(__it);
+                            __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence());
                         }
 
                         _ONEDPL_PRAGMA_UNROLL
@@ -244,7 +257,7 @@ struct __subgroup_radix_sort
                         __begin_bit += __radix;
 
                         //3. "re-order" phase
-                        __dpl_sycl::__group_barrier(__it);
+                        __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
                         if (__begin_bit >= __end_bit)
                         {
                             // the last iteration - writing out the result
@@ -268,7 +281,6 @@ struct __subgroup_radix_sort
 
                                 if (__idx < __n)
                                     __exchange_lacc[__idx].~_ValT();
                             }
-
                             return;
                         }
@@ -293,8 +305,7 @@ struct __subgroup_radix_sort
                                     __exchange_lacc[__r] = ::std::move(__values.__v[__i]);
                                 }
                             }
-
-                        __dpl_sycl::__group_barrier(__it);
+                        __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
 
                         _ONEDPL_PRAGMA_UNROLL
                         for (uint16_t __i = 0; __i < __block_size; ++__i)
@@ -303,8 +314,7 @@ struct __subgroup_radix_sort
                             if (__idx < __n)
                                 __values.__v[__i] = ::std::move(__exchange_lacc[__idx]);
                         }
-
-                        __dpl_sycl::__group_barrier(__it);
+                        __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
                     }
                 }));
         });
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
index abce0902be1..9bc4cc94185 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
@@ -69,6 +69,7 @@
 #define _ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_REQD_SUB_GROUP_SIZE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
+#define _ONEDPL_SYCL2020_GROUP_BARRIER_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_TARGET_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50400))
 #define _ONEDPL_SYCL2020_TARGET_DEVICE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50400))
 #define _ONEDPL_SYCL2020_ATOMIC_REF_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50500))
@@ -225,17 +226,44 @@ __get_accessor_size(const _Accessor& __accessor)
 #endif
 }
 
+// TODO: switch to the SYCL 2020 group barrier with the DPC++ compiler.
+// The SYCL 1.2.1 version is used because it provides an API with a local memory fence,
+// which gives better performance on Intel GPUs.
+// The performance gap has been negligible since
+// https://github.com/intel/intel-graphics-compiler/commit/ed639f68d142bc963a7b626badc207a42fb281cb (Aug 20, 2024),
+// but the fix is not yet part of the LTS GPU drivers (Linux).
+//
+// This macro may also serve as a temporary workaround to strengthen the barriers
+// if there are cases where the memory ordering is not strong enough.
+#if !defined(_ONEDPL_SYCL121_GROUP_BARRIER)
+#    if _ONEDPL_LIBSYCL_VERSION
+#        define _ONEDPL_SYCL121_GROUP_BARRIER 1
+#    else
+// For safety, assume that other SYCL implementations comply with SYCL 2020, which is a oneDPL requirement.
+#        define _ONEDPL_SYCL121_GROUP_BARRIER 0
+#    endif
+#endif
+
+#if _ONEDPL_SYCL121_GROUP_BARRIER
+using __fence_space_t = sycl::access::fence_space;
+inline constexpr __fence_space_t __fence_space_local = sycl::access::fence_space::local_space;
+inline constexpr __fence_space_t __fence_space_global = sycl::access::fence_space::global_space;
+#else
+struct __fence_space_t{}; // No-op dummy type, since SYCL 2020 does not specify memory fence spaces in group barriers
+inline constexpr __fence_space_t __fence_space_local{};
+inline constexpr __fence_space_t __fence_space_global{};
+#endif // _ONEDPL_SYCL121_GROUP_BARRIER
+
 template <typename _Item>
-constexpr void
-__group_barrier(_Item __item)
+void
+__group_barrier(_Item __item, [[maybe_unused]] __dpl_sycl::__fence_space_t __space = __fence_space_local)
 {
-#if 0 // !defined(_ONEDPL_LIBSYCL_VERSION) || _ONEDPL_LIBSYCL_VERSION >= 50300
-    //TODO: usage of sycl::group_barrier: probably, we have to revise SYCL parallel patterns which use a group_barrier.
-    // 1) sycl::group_barrier() implementation is not ready
-    // 2) sycl::group_barrier and sycl::item::group_barrier are not quite equivalent
+#if _ONEDPL_SYCL121_GROUP_BARRIER
+    __item.barrier(__space);
+#elif _ONEDPL_SYCL2020_GROUP_BARRIER_PRESENT
     sycl::group_barrier(__item.get_group(), sycl::memory_scope::work_group);
 #else
-    __item.barrier(sycl::access::fence_space::local_space);
+#    error "sycl::group_barrier is not supported, and no alternative is available"
 #endif
 }
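Note for reviewers: the following is a minimal standalone sketch (not part of the patch) of the dispatch that the new __group_barrier performs. USE_SYCL121_BARRIER and barrier_sketch are illustrative stand-ins for _ONEDPL_SYCL121_GROUP_BARRIER and the __dpl_sycl helpers; only sycl::nd_item::barrier(fence_space) and sycl::group_barrier are real SYCL APIs.

    // Illustrative sketch only; assumes a DPC++-style SYCL implementation.
    #include <sycl/sycl.hpp>

    // Stand-in for _ONEDPL_SYCL121_GROUP_BARRIER: 1 selects the SYCL 1.2.1
    // barrier that accepts a fence space, 0 selects the SYCL 2020 barrier.
    #define USE_SYCL121_BARRIER 1

    template <typename _Item>
    void
    barrier_sketch(_Item __item, [[maybe_unused]] sycl::access::fence_space __space =
                                     sycl::access::fence_space::local_space)
    {
    #if USE_SYCL121_BARRIER
        // SYCL 1.2.1: the fence space is explicit, so a barrier that only needs
        // to make local-memory (SLM) writes visible can request the cheaper
        // local fence instead of a global one.
        __item.barrier(__space);
    #else
        // SYCL 2020: no fence-space parameter; the barrier orders memory
        // accesses at work-group scope, which can cost more on Intel GPUs.
        sycl::group_barrier(__item.get_group(), sycl::memory_scope::work_group);
    #endif
    }

With this shape, call sites that publish data only through local memory (e.g. the radix-sort counter and exchange buffers via get_fence()) pass the local fence, while call sites that write through global-memory accessors (e.g. the private-global histogram path) pass __fence_space_global, matching the choices in the diff above.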