From 564bb2297059b98110815b55bf5b32495bfc2101 Mon Sep 17 00:00:00 2001
From: Pavel Roskin
Date: Mon, 30 Sep 2024 00:19:56 -0700
Subject: [PATCH] Fix support for Intel Compute Runtime with VectorSize > 1

The fallback implementation of amd_bitalign() triggers a bug with Intel
Compute Runtime (NEO) versions from 23.22.26516.18 to 24.45.31740.9
inclusive: https://github.com/intel/intel-graphics-compiler/issues/358

The bug affects all but the first component of the vectors, so the
self-tests pass with VectorSize=1. For higher values of VectorSize,
including the default VectorSize=2, approximately half of the
self-tests fail, all in barrett32 kernels.

Add generic_bitalign(), which is always implemented using shifts. Use
it in all cases where the destination is the same as one of the
sources.

If Intel Compute Runtime is detected, use 64-bit shifts in
generic_bitalign(). For other platforms, keep using 32-bit shifts.

Make amd_bitalign() an alias to generic_bitalign() on systems where
amd_bitalign() is not available. That way, it also expands to 64-bit
shifts for Intel Compute Runtime.
---
 src/barrett.cl | 54 +++++++++++++++++++++++++++---------------------------
 src/common.cl  | 20 ++++++++++++++++++-
 2 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/src/barrett.cl b/src/barrett.cl
index fcdecd5..bd25991 100644
--- a/src/barrett.cl
+++ b/src/barrett.cl
@@ -253,27 +253,27 @@ Adding x*x to a few carries will not cascade the carry
 
 void shl_96(int96_v * const a)
 /* shiftleft a one bit */
-{ /* here, bitalign improves the 92-bit kernel, and slows down 76-bit */
+{ /* here, amd_bitalign improves the 92-bit kernel, and slows down 76-bit */
   a->d2 = amd_bitalign(a->d2, a->d1, 31);
   a->d1 = amd_bitalign(a->d1, a->d0, 31);
-//  a->d2 = (a->d2 << 1) | (a->d1 >> 31);
-//  a->d1 = (a->d1 << 1) | (a->d0 >> 31);
+//  a->d2 = generic_bitalign(a->d2, a->d1, 31);
+//  a->d1 = generic_bitalign(a->d1, a->d0, 31);
   a->d0 = a->d0 << 1;
 }
 
 void shl_192(int192_v * const a)
 /* shiftleft a one bit */
-{ /* in this function, bitalign slows down all kernels */
+{ /* in this function, amd_bitalign slows down all kernels */
 //  a->d5 = amd_bitalign(a->d5, a->d4, 31);
 //  a->d4 = amd_bitalign(a->d4, a->d3, 31);
 //  a->d3 = amd_bitalign(a->d3, a->d2, 31);
 //  a->d2 = amd_bitalign(a->d2, a->d1, 31);
 //  a->d1 = amd_bitalign(a->d1, a->d0, 31);
-  a->d5 = (a->d5 << 1) | (a->d4 >> 31);
-  a->d4 = (a->d4 << 1) | (a->d3 >> 31);
-  a->d3 = (a->d3 << 1) | (a->d2 >> 31);
-  a->d2 = (a->d2 << 1) | (a->d1 >> 31);
-  a->d1 = (a->d1 << 1) | (a->d0 >> 31);
+  a->d5 = generic_bitalign(a->d5, a->d4, 31);
+  a->d4 = generic_bitalign(a->d4, a->d3, 31);
+  a->d3 = generic_bitalign(a->d3, a->d2, 31);
+  a->d2 = generic_bitalign(a->d2, a->d1, 31);
+  a->d1 = generic_bitalign(a->d1, a->d0, 31);
   a->d0 = a->d0 << 1;
 }
 
@@ -442,12 +442,12 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
 
   // shiftleft nn 11 bits
 #ifndef DIV_160_96
-  nn.d3 = (nn.d3 << 11) + (nn.d2 >> 21);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 21);
 #endif
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 21);
   nn.d1 = amd_bitalign(nn.d1, nn.d0, 21);
-//  nn.d2 = (nn.d2 << 11) + (nn.d1 >> 21);
-//  nn.d1 = (nn.d1 << 11) + (nn.d0 >> 21);
+//  nn.d2 = generic_bitalign(nn.d2, nn.d1, 21);
+//  nn.d1 = generic_bitalign(nn.d1, nn.d0, 21);
   nn.d0 = nn.d0 << 11;
 
   // q = q - nn
@@ -510,11 +510,11 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
   nn.d4 = nn.d3 >> 9;
 #endif
 //  nn.d3 = amd_bitalign(nn.d3, nn.d2, 9);
-  nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 9);
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 9);
-//  nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
+//  nn.d2 = generic_bitalign(nn.d2, nn.d1, 9);
 //  nn.d1 = amd_bitalign(nn.d1, nn.d0, 9);
-  nn.d1 = (nn.d1 << 23) + (nn.d0 >> 9);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 9);
   nn.d0 = nn.d0 << 23;
 
   // q = q - nn
@@ -642,9 +642,9 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
 #ifdef CHECKS_MODBASECASE
   nn.d4 = nn.d3 >> 17;
 #endif
-  nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
-  nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
-  nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 17);
+  nn.d2 = generic_bitalign(nn.d2, nn.d1, 17);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 17);
   nn.d0 = nn.d0 << 15;
 
   // q = q - nn
@@ -877,12 +877,12 @@ DIV_160_96 here. */
 
   // shiftleft nn 11 bits
 #ifndef DIV_160_96
-  nn.d3 = (nn.d3 << 11) + (nn.d2 >> 21);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 21);
 #endif
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 21);
   nn.d1 = amd_bitalign(nn.d1, nn.d0, 21);
-//  nn.d2 = (nn.d2 << 11) + (nn.d1 >> 21);
-//  nn.d1 = (nn.d1 << 11) + (nn.d0 >> 21);
+//  nn.d2 = generic_bitalign(nn.d2, nn.d1, 21);
+//  nn.d1 = generic_bitalign(nn.d1, nn.d0, 21);
   nn.d0 = nn.d0 << 11;
 
   // q = q - nn
@@ -945,11 +945,11 @@ DIV_160_96 here. */
   nn.d4 = nn.d3 >> 9;
 #endif
 //  nn.d3 = amd_bitalign(nn.d3, nn.d2, 9);
-  nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 9);
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 9);
-//  nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
+//  nn.d2 = generic_bitalign(nn.d2, nn.d1, 9);
 //  nn.d1 = amd_bitalign(nn.d1, nn.d0, 9);
-  nn.d1 = (nn.d1 << 23) + (nn.d0 >> 9);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 9);
   nn.d0 = nn.d0 << 23;
 
   // q = q - nn
@@ -1077,9 +1077,9 @@ DIV_160_96 here. */
 #ifdef CHECKS_MODBASECASE
   nn.d4 = nn.d3 >> 17;
 #endif
-  nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
-  nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
-  nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 17);
+  nn.d2 = generic_bitalign(nn.d2, nn.d1, 17);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 17);
   nn.d0 = nn.d0 << 15;
 
   // q = q - nn
diff --git a/src/common.cl b/src/common.cl
index 821f77a..97c34f8 100644
--- a/src/common.cl
+++ b/src/common.cl
@@ -170,6 +170,24 @@ uint popcount(uint x)
 #define ATOMIC_INC(x) ((x)++)
 #endif
 
+// generic_bitalign() emulates amd_bitalign() using shifts.
+#ifdef cl_intel_subgroups
+// Workaround for Intel Compute Runtime (NEO) versions 23.22.26516.18 to
+// 24.45.31740.9: https://github.com/intel/intel-graphics-compiler/issues/358
+// Use 64-bit shifts. They are faster than 32-bit shifts on Intel, so there is
+// no need to limit this workaround to specific versions.
+inline uint_v generic_bitalign(const uint_v high, const uint_v low, const int shift)
+{
+  return CONVERT_UINT_V(((CONVERT_ULONG_V(high) << 32) | CONVERT_ULONG_V(low)) >> shift);
+}
+#else
+// Use 32-bit shifts for other platforms.
+inline uint_v generic_bitalign(const uint_v high, const uint_v low, const int shift)
+{
+  return (high << (32 - shift)) | (low >> shift);
+}
+#endif
+
 #ifdef cl_amd_media_ops
 #pragma OPENCL EXTENSION cl_amd_media_ops : enable
 #else
@@ -180,7 +198,7 @@ uint popcount(uint x)
 // Description
 // dst.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2.s0 & 31))
 // similar operation applied to other components of the vectors.
-#define amd_bitalign(src0, src1, src2) (src0 << (32-src2)) | (src1 >> src2)
+#define amd_bitalign(src0, src1, src2) generic_bitalign(src0, src1, src2)
 #endif
 
 #ifdef cl_amd_media_ops2
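
For reference, the equivalence that the amd_bitalign() fallback relies on can be
checked with a small standalone program. The sketch below is illustrative only
and not part of the patch: bitalign32() and bitalign64() are hypothetical scalar
stand-ins for the two generic_bitalign() variants in common.cl, with the uint_v
vector types and the CONVERT_UINT_V/CONVERT_ULONG_V macros replaced by plain
uint32_t/uint64_t. The two variants agree for every shift count used in
barrett.cl (9, 17, 21 and 31).

/* Scalar sketch: verify that the 32-bit-shift and 64-bit-shift emulations
 * of amd_bitalign() produce identical results for shift counts 1..31. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* 32-bit variant, as used on platforms other than Intel Compute Runtime */
static uint32_t bitalign32(uint32_t high, uint32_t low, int shift)
{
  return (high << (32 - shift)) | (low >> shift);
}

/* 64-bit variant, as used when cl_intel_subgroups is defined */
static uint32_t bitalign64(uint32_t high, uint32_t low, int shift)
{
  return (uint32_t)((((uint64_t)high << 32) | low) >> shift);
}

int main(void)
{
  uint32_t high = 0x12345678u, low = 0x9abcdef0u;
  int shift;

  for (shift = 1; shift < 32; shift++)
    assert(bitalign32(high, low, shift) == bitalign64(high, low, shift));

  /* shl_96() shifts a 96-bit value left by one bit, i.e. shift == 31 */
  printf("bitalign(0x%08x, 0x%08x, 31) = 0x%08x\n",
         (unsigned)high, (unsigned)low, (unsigned)bitalign64(high, low, 31));
  return 0;
}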