Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Windows: ARM64/NEON Support #769

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions gen/archs.xml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ at the top, as a last resort.
<arch name="neon">
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<flag compiler="clang">-funsafe-math-optimizations</flag>
<flag compiler="msvc"> </flag>
<alignment>16</alignment>
<check name="neon"></check>
</arch>
Expand All @@ -101,6 +102,7 @@ at the top, as a last resort.
<arch name="neonv8">
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<flag compiler="clang">-funsafe-math-optimizations</flag>
<flag compiler="msvc"> </flag>
<alignment>16</alignment>
<check name="neon"></check>
</arch>
Expand Down
4 changes: 4 additions & 0 deletions kernels/volk/volk_32f_index_max_32u.h
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,11 @@ volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_po
if (maxValuesBuffer[number] > max) {
index = maxIndexesBuffer[number];
max = maxValuesBuffer[number];
#ifdef _MSC_VER
} else if (maxValues.n128_f32[number] == max) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I must admit I don't quite understand that; what type is float32x4_t on MSVC/aarch64/neon?

#else
} else if (maxValues[number] == max) {
#endif
if (index > maxIndexesBuffer[number])
index = maxIndexesBuffer[number];
}
Expand Down
4 changes: 4 additions & 0 deletions kernels/volk/volk_32f_index_min_32u.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,11 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_
if (minValuesBuffer[number] < min) {
index = minIndexesBuffer[number];
min = minValuesBuffer[number];
#ifdef _MSC_VER
} else if (minValues.n128_f32[number] == min) {
#else
} else if (minValues[number] == min) {
#endif
Comment on lines +287 to +291
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like smth very special. Could you add a comment with some context? Maybe a link that explains the necessary change?

if (index > minIndexesBuffer[number])
index = minIndexesBuffer[number];
}
Expand Down
8 changes: 4 additions & 4 deletions kernels/volk/volk_32fc_accumulator_s32fc.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,10 +229,10 @@ static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
unsigned int eighthPoints = num_points / 8;
float32x4_t in_vec;
float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
float32x4_t out_vec0 = { 0.f };
float32x4_t out_vec1 = { 0.f };
float32x4_t out_vec2 = { 0.f };
float32x4_t out_vec3 = { 0.f };
Comment on lines 231 to +235
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fear this affects initialization. Are we sure all values are initialized correctly for all compilers?

__VOLK_ATTR_ALIGNED(32) float tempBuffer[4];

for (; number < eighthPoints; number++) {
Expand Down
2 changes: 1 addition & 1 deletion kernels/volk/volk_32fc_convert_16ic.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
const float32x4_t max_val = vmovq_n_f32(max_val_f);
float32x4_t ret1, ret2, a, b;

int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
int32x4_t toint_a = { 0 }, toint_b = { 0 };
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While you're at it, could you change this to give every variable their own statement?

int16x4_t intInputVal1, intInputVal2;
int16x8_t res;

Expand Down
5 changes: 4 additions & 1 deletion kernels/volk/volk_32u_byteswap.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,10 @@ static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int n
uint32_t* inputPtr = (uint32_t*)intsToSwap;
const unsigned int n8points = num_points / 8;
uint8x16_t input;
uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };

uint8x16_t idx;
const uint8_t idx_data[] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
idx = vld1q_u8(idx_data);

unsigned int number = 0;
for (number = 0; number < n8points; ++number) {
Expand Down
15 changes: 12 additions & 3 deletions kernels/volk/volk_32u_reverse_32u.h
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,9 @@ volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_
const uint32_t* in_ptr = in;
uint32_t* out_ptr = out;

const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
uint8x16_t idx;
const uint8_t idx_data[] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
idx = vld1q_u8(idx_data);
Comment on lines -265 to +267
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why? The data type implies 16 values of the same type. Does MSVC fail to perform the correct aggregate initialization? This indirection may negatively impact optimizations.
I'd like to see this unified in one line. Would that be possible?
Would it be possible to have a godbolt/compiler explorer comparison of this for GCC/Clang vs. MSVC?


const unsigned int quarterPoints = num_points / 4;
unsigned int number = 0;
Expand Down Expand Up @@ -290,8 +292,15 @@ volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

#if defined(__aarch64__)
#ifdef _MSC_VER
#define DO_RBIT \
*out_ptr = _byteswap_ulong(*in_ptr); \
*out_ptr = ((*out_ptr & 0x55555555) << 1) | ((*out_ptr & 0xAAAAAAAA) >> 1); \
*out_ptr = ((*out_ptr & 0x33333333) << 2) | ((*out_ptr & 0xCCCCCCCC) >> 2); \
*out_ptr = ((*out_ptr & 0x0F0F0F0F) << 4) | ((*out_ptr & 0xF0F0F0F0) >> 4); \
in_ptr++; \
out_ptr++;
#elif defined(__aarch64__)
Comment on lines -294 to +303
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This section indicates that we already had special treatment for some conditions. Now, it looks like we exchange them for another set of conditions. This change would warrant a comment on the specifics of different platforms. This will help anyone looking at this in the future.

#define DO_RBIT \
__VOLK_ASM("rbit %w[result], %w[value]" \
: [result] "=r"(*out_ptr) \
Expand Down
20 changes: 12 additions & 8 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -221,20 +221,24 @@ check_c_source_compiles(

if(neon_compile_result)
set(CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/include)
if(MSVC)
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM")
overrule_arch(neonv8 "Compiler doesn't support neonv8")
endif()
Comment on lines +224 to +227
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😢

else(MSVC)
check_c_source_compiles(
"#include <volk/volk_common.h>\n int main(){__VOLK_ASM(\"sub v1.4s,v1.4s,v1.4s\");}"
have_neonv8_result)
if(NOT have_neonv8_result)
overrule_arch(neonv8 "Compiler doesn't support neonv8")
endif()
endif(MSVC)
check_c_source_compiles(
"#include <volk/volk_common.h>\n int main(){__VOLK_ASM(\"vrev32.8 q0, q0\");}"
have_neonv7_result)
check_c_source_compiles(
"#include <volk/volk_common.h>\n int main(){__VOLK_ASM(\"sub v1.4s,v1.4s,v1.4s\");}"
have_neonv8_result)

if(NOT have_neonv7_result)
overrule_arch(neonv7 "Compiler doesn't support neonv7")
endif()

if(NOT have_neonv8_result)
overrule_arch(neonv8 "Compiler doesn't support neonv8")
endif()
else(neon_compile_result)
Comment on lines 238 to 242
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would you want to remove the non-MSVC checks here? Looking at this section makes me think, it might be a good time to unify this section. Just a thought.

overrule_arch(neon "Compiler doesn't support NEON")
overrule_arch(neonv7 "Compiler doesn't support NEON")
Expand Down
Loading