From 2124947599f03f879dec17e1a91cbe3b9fb52492 Mon Sep 17 00:00:00 2001 From: Bruce Merry Date: Wed, 6 Nov 2024 11:58:36 +0200 Subject: [PATCH] Make memcpy_nontemporal_sve actually compile --- include/spead2/common_features.h.in | 2 +- src/common_memcpy.cpp | 14 +++++++++----- src/unittest_memcpy.cpp | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/spead2/common_features.h.in b/include/spead2/common_features.h.in index 113b5bcd..ce0ee4bc 100644 --- a/include/spead2/common_features.h.in +++ b/include/spead2/common_features.h.in @@ -54,7 +54,7 @@ #endif /* Similarly for AARCH64 features */ -#if defined(__AARCH64LE__) || defined(__AARCH64BE__) +#if defined(__aarch64__) # define SPEAD2_USE_SVE_STREAM @SPEAD2_USE_SVE_STREAM@ #else # define SPEAD2_USE_SVE_STREAM 0 diff --git a/src/common_memcpy.cpp b/src/common_memcpy.cpp index 67d263bb..b59701d1 100644 --- a/src/common_memcpy.cpp +++ b/src/common_memcpy.cpp @@ -59,6 +59,7 @@ #endif #if SPEAD2_USE_SVE_STREAM +# include <atomic> # include <arm_sve.h> # include <sys/auxv.h> #endif @@ -68,7 +69,7 @@ namespace spead2 #if SPEAD2_USE_SVE_STREAM [[gnu::target("+sve")]] -static void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept +void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept { /* The AArch64 memory model says * @@ -78,14 +79,17 @@ static void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __res * accesses can be observed in any order by the other observers within the * shareability domain of the memory addresses being accessed." * + * This is probably not an issue in practice, unless the source address + * is obtained with memory_order_consume and the compiler actually tracks + * dependencies (which apparently none do). 
+ * * It's not entirely clear to me whether that's an issue, but it sounds * like SVE non-temporal reads can be served from a load buffer that's not * coherent with other cores' caches. To be on the safe side, I'm adding a - * barrier here. The magic number makes this a read to read/write barrier. + * barrier here. */ - __dmb(13); + std::atomic_thread_fence(std::memory_order_acquire); - */ /* TODO: this is probably sub-optimal, since it doesn't do any unrolling * or alignment. Efficient unrolling probably requires doing separate body * and tail (where the body is a multiple of the vector length) to avoid @@ -133,7 +137,7 @@ void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) noexcep /* aarch64 options */ #if SPEAD2_USE_SVE_STREAM - unsigned long hwcaps = getauxval(AT_HWCAPS); + unsigned long hwcaps = getauxval(AT_HWCAP); if (hwcaps & HWCAP_SVE) return memcpy_nontemporal_sve; #endif diff --git a/src/unittest_memcpy.cpp b/src/unittest_memcpy.cpp index 376dace7..82df13f0 100644 --- a/src/unittest_memcpy.cpp +++ b/src/unittest_memcpy.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #if SPEAD2_USE_SVE_STREAM # include <sys/auxv.h> #endif @@ -81,7 +82,7 @@ static const memcpy_function memcpy_functions[] = { "avx512", spead2::memcpy_nontemporal_avx512, bool(__builtin_cpu_supports("avx512f")) }, #endif #if SPEAD2_USE_SVE_STREAM - { "sve", spead2::memcpy_nontemporal_avx512, getauxval(AT_HWCAP) & HWCAP_SVE }, + { "sve", spead2::memcpy_nontemporal_sve, (getauxval(AT_HWCAP) & HWCAP_SVE) != 0 }, #endif };