From 51c251846ba591453a92480a95a27a3ffe901d4b Mon Sep 17 00:00:00 2001 From: Jamie Vital Date: Sat, 31 Aug 2024 00:20:42 -0400 Subject: [PATCH] Windows: ARM64/NEON Support Signed-off-by: Jamie Vital --- gen/archs.xml | 2 ++ kernels/volk/volk_32f_index_max_32u.h | 4 ++++ kernels/volk/volk_32f_index_min_32u.h | 4 ++++ kernels/volk/volk_32fc_accumulator_s32fc.h | 8 ++++---- kernels/volk/volk_32fc_convert_16ic.h | 2 +- kernels/volk/volk_32u_byteswap.h | 5 ++++- kernels/volk/volk_32u_reverse_32u.h | 15 ++++++++++++--- lib/CMakeLists.txt | 20 ++++++++++++-------- 8 files changed, 43 insertions(+), 17 deletions(-) diff --git a/gen/archs.xml b/gen/archs.xml index 164c7bb43..18c440169 100644 --- a/gen/archs.xml +++ b/gen/archs.xml @@ -85,6 +85,7 @@ at the top, as a last resort. -funsafe-math-optimizations -funsafe-math-optimizations + 16 @@ -101,6 +102,7 @@ at the top, as a last resort. -funsafe-math-optimizations -funsafe-math-optimizations + 16 diff --git a/kernels/volk/volk_32f_index_max_32u.h b/kernels/volk/volk_32f_index_max_32u.h index 86dad0d19..e2c4e3ba1 100644 --- a/kernels/volk/volk_32f_index_max_32u.h +++ b/kernels/volk/volk_32f_index_max_32u.h @@ -299,7 +299,11 @@ volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_po if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; +#ifdef _MSC_VER + } else if (maxValues.n128_f32[number] == max) { +#else } else if (maxValues[number] == max) { +#endif if (index > maxIndexesBuffer[number]) index = maxIndexesBuffer[number]; } diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h index 0c8bf8c0a..2ef44e99c 100644 --- a/kernels/volk/volk_32f_index_min_32u.h +++ b/kernels/volk/volk_32f_index_min_32u.h @@ -284,7 +284,11 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_ if (minValuesBuffer[number] < min) { index = minIndexesBuffer[number]; min = minValuesBuffer[number]; +#ifdef _MSC_VER + } else if (minValues.n128_f32[number] == min) { +#else } else if (minValues[number] == min) { +#endif if (index > minIndexesBuffer[number]) index = minIndexesBuffer[number]; } diff --git a/kernels/volk/volk_32fc_accumulator_s32fc.h b/kernels/volk/volk_32fc_accumulator_s32fc.h index d7267ea64..0bbad7eb5 100644 --- a/kernels/volk/volk_32fc_accumulator_s32fc.h +++ b/kernels/volk/volk_32fc_accumulator_s32fc.h @@ -229,10 +229,10 @@ static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result, lv_32fc_t returnValue = lv_cmake(0.f, 0.f); unsigned int eighthPoints = num_points / 8; float32x4_t in_vec; - float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f }; - float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f }; - float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f }; - float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f }; + float32x4_t out_vec0 = { 0.f }; + float32x4_t out_vec1 = { 0.f }; + float32x4_t out_vec2 = { 0.f }; + float32x4_t out_vec3 = { 0.f }; __VOLK_ATTR_ALIGNED(32) float tempBuffer[4]; for (; number < eighthPoints; number++) { diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h index a38cce64a..38bc07a2a 100644 --- a/kernels/volk/volk_32fc_convert_16ic.h +++ b/kernels/volk/volk_32fc_convert_16ic.h @@ -236,7 +236,7 @@ static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, const float32x4_t max_val = vmovq_n_f32(max_val_f); float32x4_t ret1, ret2, a, b; - int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 }; + int32x4_t toint_a = { 0 }, toint_b = { 0 }; int16x4_t intInputVal1, intInputVal2; int16x8_t res; diff --git a/kernels/volk/volk_32u_byteswap.h b/kernels/volk/volk_32u_byteswap.h index a6ec86f80..51339414c 100644 --- a/kernels/volk/volk_32u_byteswap.h +++ b/kernels/volk/volk_32u_byteswap.h @@ -201,7 +201,10 @@ static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int n uint32_t* inputPtr = (uint32_t*)intsToSwap; const unsigned int n8points = num_points / 8; uint8x16_t input; - uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; + + uint8x16_t idx; + const uint8_t idx_data[] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; + idx = vld1q_u8(idx_data); unsigned int number = 0; for (number = 0; number < n8points; ++number) { diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h index 62150ac64..02a4686c7 100644 --- a/kernels/volk/volk_32u_reverse_32u.h +++ b/kernels/volk/volk_32u_reverse_32u.h @@ -262,7 +262,9 @@ volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_ const uint32_t* in_ptr = in; uint32_t* out_ptr = out; - const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; + uint8x16_t idx; + const uint8_t idx_data[] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; + idx = vld1q_u8(idx_data); const unsigned int quarterPoints = num_points / 4; unsigned int number = 0; @@ -290,8 +292,15 @@ volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_ #ifdef LV_HAVE_NEON #include - -#if defined(__aarch64__) +#ifdef _MSC_VER +#define DO_RBIT \ + *out_ptr = _byteswap_ulong(*in_ptr); \ + *out_ptr = ((*out_ptr & 0x55555555) << 1) | ((*out_ptr & 0xAAAAAAAA) >> 1); \ + *out_ptr = ((*out_ptr & 0x33333333) << 2) | ((*out_ptr & 0xCCCCCCCC) >> 2); \ + *out_ptr = ((*out_ptr & 0x0F0F0F0F) << 4) | ((*out_ptr & 0xF0F0F0F0) >> 4); \ + in_ptr++; \ + out_ptr++; +#elif defined(__aarch64__) #define DO_RBIT \ __VOLK_ASM("rbit %w[result], %w[value]" \ : [result] "=r"(*out_ptr) \ diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 2c160b2f2..67350c4ef 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -221,20 +221,24 @@ check_c_source_compiles( if(neon_compile_result) set(CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/include) + if(MSVC) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM") + overrule_arch(neonv8 "Compiler doesn't support neonv8") + endif() + else(MSVC) + check_c_source_compiles( + "#include \n int main(){__VOLK_ASM(\"sub v1.4s,v1.4s,v1.4s\");}" + have_neonv8_result) + if(NOT have_neonv8_result) + overrule_arch(neonv8 "Compiler doesn't support neonv8") + endif() + endif(MSVC) check_c_source_compiles( "#include \n int main(){__VOLK_ASM(\"vrev32.8 q0, q0\");}" have_neonv7_result) - check_c_source_compiles( - "#include \n int main(){__VOLK_ASM(\"sub v1.4s,v1.4s,v1.4s\");}" - have_neonv8_result) - if(NOT have_neonv7_result) overrule_arch(neonv7 "Compiler doesn't support neonv7") endif() - - if(NOT have_neonv8_result) - overrule_arch(neonv8 "Compiler doesn't support neonv8") - endif() else(neon_compile_result) overrule_arch(neon "Compiler doesn't support NEON") overrule_arch(neonv7 "Compiler doesn't support NEON")