Fix synchronization issues in radix sort and histogram (#2054)
Co-authored-by: Dan Hoeflinger <[email protected]>
Co-authored-by: Sergey Kopienko <[email protected]>
3 people authored and timmiesmith committed Feb 11, 2025
1 parent ad4c54e commit 47c5daf
Showing 3 changed files with 61 additions and 21 deletions.
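What the fix does, in short: the old `__group_barrier` always issued `nd_item::barrier` with a local-memory fence, which is not sufficient when the data a work-group synchronizes lives in global memory (the per-work-group histograms of the private-global-atomics path, and the radix sort's global-buffer fallback storage). The commit threads an explicit fence space through `__dpl_sycl::__group_barrier`, defaulting to the local space. A minimal sketch of the resulting helper shape, with illustrative names rather than the oneDPL internals:

```cpp
// Sketch only: a barrier helper that forwards a fence space to the
// SYCL 1.2.1-style nd_item::barrier API. Names are illustrative.
#include <sycl/sycl.hpp>

using fence_space_t = sycl::access::fence_space;
inline constexpr fence_space_t fence_local = sycl::access::fence_space::local_space;
inline constexpr fence_space_t fence_global = sycl::access::fence_space::global_space;

template <typename Item>
void group_barrier(Item item, fence_space_t space = fence_local)
{
    // Callers that synchronize data held in global memory must pass fence_global;
    // the default keeps the previous local-memory behavior for SLM-based kernels.
    item.barrier(space);
}
```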
@@ -130,7 +130,8 @@ class __histo_kernel_private_glocal_atomics;
 template <typename _HistAccessor, typename _OffsetT, typename _Size>
 void
 __clear_wglocal_histograms(const _HistAccessor& __local_histogram, const _OffsetT& __offset, _Size __num_bins,
-                           const sycl::nd_item<1>& __self_item)
+                           const sycl::nd_item<1>& __self_item,
+                           __dpl_sycl::__fence_space_t __fence_space = __dpl_sycl::__fence_space_local)
 {
     using _BinUint_t =
         ::std::conditional_t<(sizeof(_Size) >= sizeof(::std::uint32_t)), ::std::uint64_t, ::std::uint32_t>;
@@ -148,7 +149,7 @@ __clear_wglocal_histograms(const _HistAccessor& __local_histogram, const _Offset
         {
             __local_histogram[__offset + __gSize * __k + __self_lidx] = 0;
         }
-    __dpl_sycl::__group_barrier(__self_item);
+    __dpl_sycl::__group_barrier(__self_item, __fence_space);
 }
 
 template <typename _BinIdxType, typename _ValueType, typename _HistReg, typename _BinFunc>
@@ -444,7 +445,8 @@ struct __histogram_general_private_global_atomics_submitter<__internal::__option
                 const ::std::size_t __wgroup_idx = __self_item.get_group(0);
                 const ::std::size_t __seg_start = __work_group_size * __iters_per_work_item * __wgroup_idx;
 
-                __clear_wglocal_histograms(__hacc_private, __wgroup_idx * __num_bins, __num_bins, __self_item);
+                __clear_wglocal_histograms(__hacc_private, __wgroup_idx * __num_bins, __num_bins, __self_item,
+                                           __dpl_sycl::__fence_space_global);
 
                 if (__seg_start + __work_group_size * __iters_per_work_item < __n)
                 {
@@ -469,7 +471,7 @@ struct __histogram_general_private_global_atomics_submitter<__internal::__option
                     }
                 }
 
-                __dpl_sycl::__group_barrier(__self_item);
+                __dpl_sycl::__group_barrier(__self_item, __dpl_sycl::__fence_space_global);
 
                 __reduce_out_histograms<_bin_type, ::std::uint32_t>(__hacc_private, __wgroup_idx * __num_bins,
                                                                     __bins, __num_bins, __self_item);
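In the private-global-atomics variant above, each work-group's private histogram lives in global memory (hence the name), so the cooperative clear and the barrier before `__reduce_out_histograms` now fence the global address space. A self-contained sketch of why a local-space fence is not enough when the scratch bins sit in global/USM memory; this is an assumed example, not oneDPL code, and the bin and group sizes are arbitrary:

```cpp
// Sketch: a work-group zeroes per-group bins held in global (USM) memory, then
// synchronizes with a *global*-space fence before the bins are updated and read.
#include <sycl/sycl.hpp>
#include <cstdio>

int main()
{
    constexpr int bins = 16;
    constexpr int wg = 64;
    sycl::queue q;
    int* hist = sycl::malloc_shared<int>(bins, q);

    q.parallel_for(sycl::nd_range<1>{sycl::range<1>{wg}, sycl::range<1>{wg}},
                   [=](sycl::nd_item<1> it) {
                       const int lid = static_cast<int>(it.get_local_id(0));
                       // Cooperative clear of the global-memory bins.
                       for (int b = lid; b < bins; b += wg)
                           hist[b] = 0;
                       // The bins live in global memory, so fence the global space;
                       // a local-space fence would not order these stores for the group.
                       it.barrier(sycl::access::fence_space::global_space);
                       sycl::atomic_ref<int, sycl::memory_order::relaxed,
                                        sycl::memory_scope::work_group,
                                        sycl::access::address_space::global_space>
                           bin(hist[lid % bins]);
                       ++bin;
                   })
        .wait();

    for (int b = 0; b < bins; ++b)
        std::printf("bin %d: %d\n", b, hist[b]);
    sycl::free(hist, q);
    return 0;
}
```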
@@ -80,6 +80,12 @@ struct __subgroup_radix_sort
     {
         return __dpl_sycl::__local_accessor<_KeyT>(__buf_size, __cgh);
     }
+
+    inline static constexpr auto
+    get_fence()
+    {
+        return __dpl_sycl::__fence_space_local;
+    }
 };
 
 template <typename _KeyT>
@@ -94,6 +100,12 @@ struct __subgroup_radix_sort
     {
         return sycl::accessor(__buf, __cgh, sycl::read_write, __dpl_sycl::__no_init{});
     }
+
+    inline constexpr static auto
+    get_fence()
+    {
+        return __dpl_sycl::__fence_space_global;
+    }
 };
 
 template <typename _ValueT, typename _Wi, typename _Src, typename _Values>
@@ -175,8 +187,9 @@ struct __subgroup_radix_sort
 
 //copy(move) values construction
 __block_load<_ValT>(__wi, __src, __values.__v, __n);
+// TODO: check if the barrier can be removed
+__dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
 
-__dpl_sycl::__group_barrier(__it);
 while (true)
 {
     uint16_t __indices[__block_size]; //indices for indirect access in the "re-order" phase
@@ -205,7 +218,7 @@ struct __subgroup_radix_sort
         __indices[__i] = *__counters[__i];
         *__counters[__i] = __indices[__i] + 1;
     }
-    __dpl_sycl::__group_barrier(__it);
+    __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence());
 
     //2. scan phase
     {
@@ -218,8 +231,8 @@ struct __subgroup_radix_sort
     _ONEDPL_PRAGMA_UNROLL
     for (uint16_t __i = 1; __i < __bin_count; ++__i)
         __bin_sum[__i] = __bin_sum[__i - 1] + __counter_lacc[__wi * __bin_count + __i];
+    __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence());
 
-    __dpl_sycl::__group_barrier(__it);
     //exclusive scan local sum
     uint16_t __sum_scan = __dpl_sycl::__exclusive_scan_over_group(
         __it.get_group(), __bin_sum[__bin_count - 1], __dpl_sycl::__plus<uint16_t>());
@@ -230,7 +243,7 @@ struct __subgroup_radix_sort
 
     if (__wi == 0)
         __counter_lacc[0] = 0;
-    __dpl_sycl::__group_barrier(__it);
+    __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence());
 }
 
 _ONEDPL_PRAGMA_UNROLL
@@ -244,7 +257,7 @@ struct __subgroup_radix_sort
 __begin_bit += __radix;
 
 //3. "re-order" phase
-__dpl_sycl::__group_barrier(__it);
+__dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
 if (__begin_bit >= __end_bit)
 {
     // the last iteration - writing out the result
@@ -268,7 +281,6 @@ struct __subgroup_radix_sort
         if (__idx < __n)
             __exchange_lacc[__idx].~_ValT();
     }
-
     return;
 }
 
@@ -293,8 +305,7 @@ struct __subgroup_radix_sort
         __exchange_lacc[__r] = ::std::move(__values.__v[__i]);
     }
 }
-
-__dpl_sycl::__group_barrier(__it);
+__dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
 
 _ONEDPL_PRAGMA_UNROLL
 for (uint16_t __i = 0; __i < __block_size; ++__i)
@@ -303,8 +314,7 @@ struct __subgroup_radix_sort
     if (__idx < __n)
         __values.__v[__i] = ::std::move(__exchange_lacc[__idx]);
 }
-
-__dpl_sycl::__group_barrier(__it);
+__dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
 }
 }));
 });
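The radix sort's temporary key storage comes in two flavors: one wraps a local accessor, the other an accessor into a global `sycl::buffer` (the fallback when the data does not fit in local memory). The new `get_fence()` member on each storage type lets every barrier in the count/scan/reorder loop fence whichever address space the active storage really uses, via `decltype(__buf_val)::get_fence()` and `decltype(__buf_count)::get_fence()`. A compile-only sketch of the same trait-style pattern with illustrative names:

```cpp
// Sketch (illustrative names, not the oneDPL implementation): each storage flavor
// reports the fence space matching where its data lives, and barriers are issued
// through that trait so they stay correct when the global-buffer fallback is used.
#include <sycl/sycl.hpp>

struct local_storage
{
    // Would own a sycl::local_accessor in the real code.
    static constexpr auto get_fence() { return sycl::access::fence_space::local_space; }
};

struct global_storage
{
    // Would own an accessor into a global sycl::buffer in the real code.
    static constexpr auto get_fence() { return sycl::access::fence_space::global_space; }
};

template <typename Storage>
void reorder_step(sycl::nd_item<1> it, Storage& /*exchange_buf*/)
{
    // ... write permuted keys into the exchange storage ...
    it.barrier(Storage::get_fence()); // fence the space the keys actually occupy
    // ... read the keys back in their new order ...
}
```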
42 changes: 35 additions & 7 deletions include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
@@ -69,6 +69,7 @@
 #define _ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_REQD_SUB_GROUP_SIZE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
+#define _ONEDPL_SYCL2020_GROUP_BARRIER_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_TARGET_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50400))
 #define _ONEDPL_SYCL2020_TARGET_DEVICE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50400))
 #define _ONEDPL_SYCL2020_ATOMIC_REF_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50500))
@@ -225,17 +226,44 @@ __get_accessor_size(const _Accessor& __accessor)
 #endif
 }
 
+// TODO: switch to SYCL 2020 with DPC++ compiler.
+// SYCL 1.2.1 version is used due to having an API with a local memory fence,
+// which gives better performance on Intel GPUs.
+// The performance gap is negligible since
+// https://github.com/intel/intel-graphics-compiler/commit/ed639f68d142bc963a7b626badc207a42fb281cb (Aug 20, 2024)
+// But the fix is not a part of the LTS GPU drivers (Linux) yet.
+//
+// This macro may also serve as a temporary workaround to strengthen the barriers
+// if there are cases where the memory ordering is not strong enough.
+#if !defined(_ONEDPL_SYCL121_GROUP_BARRIER)
+#    if _ONEDPL_LIBSYCL_VERSION
+#        define _ONEDPL_SYCL121_GROUP_BARRIER 1
+#    else
+// For safety, assume that other SYCL implementations comply with SYCL 2020, which is a oneDPL requirement.
+#        define _ONEDPL_SYCL121_GROUP_BARRIER 0
+#    endif
+#endif
+
+#if _ONEDPL_SYCL121_GROUP_BARRIER
+using __fence_space_t = sycl::access::fence_space;
+inline constexpr __fence_space_t __fence_space_local = sycl::access::fence_space::local_space;
+inline constexpr __fence_space_t __fence_space_global = sycl::access::fence_space::global_space;
+#else
+struct __fence_space_t{}; // No-op dummy type since SYCL 2020 does not specify memory fence spaces in group barriers
+inline constexpr __fence_space_t __fence_space_local{};
+inline constexpr __fence_space_t __fence_space_global{};
+#endif // _ONEDPL_SYCL121_GROUP_BARRIER
+
 template <typename _Item>
-constexpr void
-__group_barrier(_Item __item)
+void
+__group_barrier(_Item __item, [[maybe_unused]] __dpl_sycl::__fence_space_t __space = __fence_space_local)
 {
-#if 0 // !defined(_ONEDPL_LIBSYCL_VERSION) || _ONEDPL_LIBSYCL_VERSION >= 50300
-    //TODO: usage of sycl::group_barrier: probably, we have to revise SYCL parallel patterns which use a group_barrier.
-    // 1) sycl::group_barrier() implementation is not ready
-    // 2) sycl::group_barrier and sycl::item::group_barrier are not quite equivalent
+#if _ONEDPL_SYCL121_GROUP_BARRIER
+    __item.barrier(__space);
+#elif _ONEDPL_SYCL2020_GROUP_BARRIER_PRESENT
     sycl::group_barrier(__item.get_group(), sycl::memory_scope::work_group);
 #else
-    __item.barrier(sycl::access::fence_space::local_space);
+#    error "sycl::group_barrier is not supported, and no alternative is available"
 #endif
 }
 
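Taken together, the sycl_defs.h changes keep both barrier flavors available. With `_ONEDPL_SYCL121_GROUP_BARRIER` defaulting to 1 for `_ONEDPL_LIBSYCL_VERSION` (DPC++), `__group_barrier` forwards the fence space to the SYCL 1.2.1 `nd_item::barrier`, which the in-source comment notes performs better on Intel GPUs until the graphics-compiler fix reaches the Linux LTS drivers. On other implementations it falls back to SYCL 2020 `sycl::group_barrier`, which takes no fence-space argument, and `__fence_space_t` degenerates to a no-op tag so call sites stay identical. Because the macro is guarded by `#if !defined(...)`, it can be overridden at build time (for example `-D_ONEDPL_SYCL121_GROUP_BARRIER=0`). A simplified illustration of the two paths, not the oneDPL source:

```cpp
// Simplified illustration of the two barrier flavors selected by
// _ONEDPL_SYCL121_GROUP_BARRIER; uses only standard SYCL APIs.
#include <sycl/sycl.hpp>

template <typename Item>
void barrier_sycl121(Item it, sycl::access::fence_space space)
{
    it.barrier(space); // SYCL 1.2.1 API: the caller chooses the fence space
}

template <typename Item>
void barrier_sycl2020(Item it)
{
    sycl::group_barrier(it.get_group()); // SYCL 2020 API: no fence-space argument
}
```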
