From fbb565773327e2de04e5c0ee5d3787f47e327443 Mon Sep 17 00:00:00 2001 From: Chun-wei Fan Date: Mon, 9 May 2022 17:40:00 +0800 Subject: [PATCH 1/9] graphene-macros.h: Add macros for one-liner intrinsic calls This way, we can try to abstract uses of such calls between different compilers that we support instead of repeating them in the headers due to differences in compiler syntax/feature support. --- include/graphene-macros.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/graphene-macros.h b/include/graphene-macros.h index 4f2bb38f..33956a4c 100644 --- a/include/graphene-macros.h +++ b/include/graphene-macros.h @@ -111,3 +111,13 @@ typedef int bool; #define GRAPHENE_PI 3.1415926535897932384626434f #define GRAPHENE_PI_2 1.5707963267948966192313217f + +#if defined (__GNUC__) || defined (__clang__) +#define GRAPHENE_ONELINER(expr) (__extension__ ({expr;})) +#define GRAPHENE_ONELINER_WITH_RTYPE(rtype,expr) (__extension__ ({(rtype) expr;})) +#elif defined (_MSC_VER) +#define GRAPHENE_ONELINER(expr) expr +#define GRAPHENE_ONELINER_WITH_RTYPE(rtype,expr) GRAPHENE_ONELINER (expr) +#else +# error Please define macros suitable for your compiler +#endif From 000be8a03a48460b8bfbf1f48b9ce1d359fcfeea Mon Sep 17 00:00:00 2001 From: Chun-wei Fan Date: Mon, 9 May 2022 18:45:11 +0800 Subject: [PATCH 2/9] include/graphene-simd*.h: Reduce duplication in SSE one-liner calls Use the newly-added macros to abstract the one-liner intrinsic calls for GCC/CLang and Visual Studio for building the SSE code, to reduce duplication. It's not totally exhausive, but should cover quite a number of items. --- include/graphene-simd4f.h | 296 ++++++++++-------------------------- include/graphene-simd4x4f.h | 9 +- 2 files changed, 84 insertions(+), 221 deletions(-) diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h index ea390621..7e43ae2f 100644 --- a/include/graphene-simd4f.h +++ b/include/graphene-simd4f.h @@ -184,6 +184,86 @@ typedef union { float f[4]; } graphene_simd4f_union_t; +# define graphene_simd4f_init_zero() \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_setzero_ps()) + +# define graphene_simd4f_init_4f(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_loadu_ps (v)) + +# define graphene_simd4f_dup_4f(s,v) \ + GRAPHENE_ONELINER (_mm_storeu_ps ((v), (s))) + +# define graphene_simd4f_dup_3f(s,v) \ + GRAPHENE_ONELINER (memcpy ((v), &(s), sizeof (float) * 3)) + +# define graphene_simd4f_dup_2f(s,v) \ + GRAPHENE_ONELINER (memcpy ((v), &(s), sizeof (float) * 2)) + +# define graphene_simd4f_splat(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_set1_ps ((v))) + +# define graphene_simd4f_splat_x(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (0, 0, 0, 0))) + +# define graphene_simd4f_splat_y(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (1, 1, 1, 1))) + +# define graphene_simd4f_splat_z(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (2, 2, 2, 2))) + +# define graphene_simd4f_splat_w(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (3, 3, 3, 3))) + +# define graphene_simd4f_add(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_add_ps ((a), (b))) + +# define graphene_simd4f_sub(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_sub_ps ((a), (b))) + +# define graphene_simd4f_mul(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_mul_ps ((a), (b))) + +# define graphene_simd4f_div(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_div_ps ((a), (b))) + +# define graphene_simd4f_sqrt(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_sqrt_ps ((v))) + +# if defined(GRAPHENE_USE_SSE4_1) +# define graphene_simd4f_dot3(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_dp_ps ((a), (b), 0x7f)) +# endif + +# define graphene_simd4f_min(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_min_ps (a, b)) + +# define graphene_simd4f_max(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_max_ps (a, b)) + +# define graphene_simd4f_shuffle_wxyz(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_shuffle_ps (v, v, _MM_SHUFFLE (2, 1, 0, 3))) + +# define graphene_simd4f_shuffle_zwxy(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_shuffle_ps (v, v, _MM_SHUFFLE (1, 0, 3, 2))) + +# define graphene_simd4f_shuffle_yzwx(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_shuffle_ps (v, v, _MM_SHUFFLE (0, 3, 2, 1))) + +# define graphene_simd4f_zero_w(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_movelh_ps (v, _mm_unpackhi_ps (v, _mm_setzero_ps ()))) + +# define graphene_simd4f_zero_zw(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_movelh_ps (v, _mm_setzero_ps ())) + +# define graphene_simd4f_merge_w(s,v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_movelh_ps (s, _mm_unpackhi_ps (s, _mm_set1_ps (v)))) + +# define graphene_simd4f_merge_high(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_movehl_ps (b, a)) + +# define graphene_simd4f_merge_low(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_movelh_ps (a, b)) + /* On GCC, we use __extension__ macros to avoid a static inline */ # if defined(__GNUC__) @@ -194,16 +274,6 @@ typedef union { (graphene_simd4f_t) { (x), (y), (z), (w) }; \ })) -# define graphene_simd4f_init_zero() \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_setzero_ps(); \ - })) - -# define graphene_simd4f_init_4f(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_loadu_ps (v); \ - })) - # define graphene_simd4f_init_3f(v) \ (__extension__ ({ \ (graphene_simd4f_t) { (v)[0], (v)[1], (v)[2], 0.f }; \ @@ -214,21 +284,6 @@ typedef union { (graphene_simd4f_t) { (v)[0], (v)[1], 0.f, 0.f }; \ })) -# define graphene_simd4f_dup_4f(s,v) \ - (__extension__ ({ \ - _mm_storeu_ps ((v), (s)); \ - })) - -# define graphene_simd4f_dup_3f(s,v) \ - (__extension__ ({ \ - memcpy ((v), &(s), sizeof (float) * 3); \ - })) - -# define graphene_simd4f_dup_2f(s,v) \ - (__extension__ ({ \ - memcpy ((v), &(s), sizeof (float) * 2); \ - })) - # define graphene_simd4f_get(s,i) \ (__extension__ ({ \ graphene_simd4f_union_t __u = { (s) }; \ @@ -240,56 +295,6 @@ typedef union { # define graphene_simd4f_get_z(s) graphene_simd4f_get (s, 2) # define graphene_simd4f_get_w(s) graphene_simd4f_get (s, 3) -# define graphene_simd4f_splat(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_set1_ps ((v)); \ - })) - -# define graphene_simd4f_splat_x(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (0, 0, 0, 0)); \ - })) - -# define graphene_simd4f_splat_y(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (1, 1, 1, 1)); \ - })) - -# define graphene_simd4f_splat_z(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (2, 2, 2, 2)); \ - })) - -# define graphene_simd4f_splat_w(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (3, 3, 3, 3)); \ - })) - -# define graphene_simd4f_add(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_add_ps ((a), (b)); \ - })) - -# define graphene_simd4f_sub(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_sub_ps ((a), (b)); \ - })) - -# define graphene_simd4f_mul(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_mul_ps ((a), (b)); \ - })) - -# define graphene_simd4f_div(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_div_ps ((a), (b)); \ - })) - -# define graphene_simd4f_sqrt(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_sqrt_ps ((v)); \ - })) - # define graphene_simd4f_reciprocal(v) \ (__extension__ ({ \ const graphene_simd4f_t __zero = graphene_simd4f_init (0.0f, 0.0f, 0.0f, 0.0f); \ @@ -320,12 +325,7 @@ typedef union { (graphene_simd4f_t) _mm_sub_ps (_mm_mul_ps (__a_yzx, __b_zxy), _mm_mul_ps (__a_zxy, __b_yzx)); \ })) -# if defined(GRAPHENE_USE_SSE4_1) -# define graphene_simd4f_dot3(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_dp_ps ((a), (b), 0x7f); \ - })) -# else +# if !defined(GRAPHENE_USE_SSE4_1) # define graphene_simd4f_dot3(a,b) \ (__extension__ ({ \ const unsigned int __mask_bits[] GRAPHENE_ALIGN16 = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; \ @@ -345,58 +345,6 @@ typedef union { __res; \ })) -# define graphene_simd4f_min(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_min_ps ((a), (b)); \ - })) - -# define graphene_simd4f_max(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_max_ps ((a), (b)); \ - })) - -# define graphene_simd4f_shuffle_wxyz(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (2, 1, 0, 3)); \ - })) - -# define graphene_simd4f_shuffle_zwxy(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (1, 0, 3, 2)); \ - })) - -# define graphene_simd4f_shuffle_yzwx(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (0, 3, 2, 1)); \ - })) - -# define graphene_simd4f_zero_w(v) \ - (__extension__ ({ \ - graphene_simd4f_t __s = _mm_unpackhi_ps ((v), _mm_setzero_ps ()); \ - (graphene_simd4f_t) _mm_movelh_ps ((v), __s); \ - })) - -# define graphene_simd4f_zero_zw(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_movelh_ps ((v), _mm_setzero_ps ()); \ - })) - -# define graphene_simd4f_merge_w(s,v) \ - (__extension__ ({ \ - graphene_simd4f_t __s = _mm_unpackhi_ps ((s), _mm_set1_ps ((v))); \ - (graphene_simd4f_t) _mm_movelh_ps ((s), __s); \ - })) - -# define graphene_simd4f_merge_high(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_movehl_ps ((b), (a)); \ - })) - -# define graphene_simd4f_merge_low(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) _mm_movelh_ps ((a), (b)); \ - })) - typedef GRAPHENE_ALIGN16 union { unsigned int ui[4]; float f[4]; @@ -485,27 +433,12 @@ _simd4f_init (float x, float y, float z, float w) return __s; } -#define graphene_simd4f_init_zero() \ - _mm_setzero_ps() - -#define graphene_simd4f_init_4f(v) \ - _mm_loadu_ps(v) - #define graphene_simd4f_init_3f(v) \ graphene_simd4f_init (v[0], v[1], v[2], 0.f) #define graphene_simd4f_init_2f(v) \ graphene_simd4f_init (v[0], v[1], 0.f, 0.f) -#define graphene_simd4f_dup_4f(s,v) \ - _mm_storeu_ps (v, s) - -#define graphene_simd4f_dup_3f(s,v) \ - memcpy (v, &s, sizeof (float) * 3) - -#define graphene_simd4f_dup_2f(s,v) \ - memcpy (v, &s, sizeof (float) * 2) - #define graphene_simd4f_get(s,i) _simd4f_get_xyzw(s, i) #define graphene_simd4f_get_x(s) _simd4f_get_xyzw(s, 0) #define graphene_simd4f_get_y(s) _simd4f_get_xyzw(s, 1) @@ -525,36 +458,6 @@ _simd4f_get_xyzw (graphene_simd4f_t s, int mode) return u.f[mode]; } -#define graphene_simd4f_splat(v) \ - _mm_set1_ps (v) - -#define graphene_simd4f_splat_x(v) \ - _mm_shuffle_ps (v, v, _MM_SHUFFLE (0, 0, 0, 0)) - -#define graphene_simd4f_splat_y(v) \ - _mm_shuffle_ps (v, v, _MM_SHUFFLE (1, 1, 1, 1)) - -#define graphene_simd4f_splat_z(v) \ - _mm_shuffle_ps (v, v, _MM_SHUFFLE (2, 2, 2, 2)) - -#define graphene_simd4f_splat_w(v) \ - _mm_shuffle_ps (v, v, _MM_SHUFFLE (3, 3, 3, 3)) - -#define graphene_simd4f_add(a,b) \ - _mm_add_ps (a, b) - -#define graphene_simd4f_sub(a,b) \ - _mm_sub_ps (a, b) - -#define graphene_simd4f_mul(a,b) \ - _mm_mul_ps (a, b) - -#define graphene_simd4f_div(a,b) \ - _mm_div_ps (a, b) - -#define graphene_simd4f_sqrt(v) \ - _mm_sqrt_ps (v) - #define graphene_simd4f_reciprocal(v) _simd4f_reciprocal(v) static inline graphene_simd4f_t @@ -597,6 +500,7 @@ _simd4f_cross3 (const graphene_simd4f_t a, return _mm_sub_ps (_mm_mul_ps (__a_yzx, __b_zxy), _mm_mul_ps (__a_zxy, __b_yzx)); } +#if !defined(GRAPHENE_USE_SSE4_1) #define graphene_simd4f_dot3(a,b) \ _simd4f_dot3(a,b) @@ -604,9 +508,6 @@ static inline graphene_simd4f_t _simd4f_dot3 (const graphene_simd4f_t a, const graphene_simd4f_t b) { -#if defined(GRAPHENE_USE_SSE4_1) - return _mm_dp_ps (a, b, 0x7f); -#else GRAPHENE_ALIGN16 const unsigned int __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; const graphene_simd4f_t __mask = _mm_load_ps ((const float *) __mask_bits); const graphene_simd4f_t __m = _mm_mul_ps ((a), (b)); @@ -615,8 +516,8 @@ _simd4f_dot3 (const graphene_simd4f_t a, const graphene_simd4f_t __s2 = _mm_add_ss (__s1, _mm_shuffle_ps (__s1, __s1, 1)); return _mm_shuffle_ps (__s2, __s2, 0); -#endif } +#endif #define graphene_simd4f_dot3_scalar(a,b) \ _simd4f_dot3_scalar(a,b) @@ -630,37 +531,6 @@ _simd4f_dot3_scalar (const graphene_simd4f_t a, return __res; } -#define graphene_simd4f_min(a,b) \ - _mm_min_ps (a, b) - -#define graphene_simd4f_max(a,b) \ - _mm_max_ps (a, b) - - -#define graphene_simd4f_shuffle_wxyz(v) \ - _mm_shuffle_ps (v, v, _MM_SHUFFLE (2, 1, 0, 3)) - -#define graphene_simd4f_shuffle_zwxy(v) \ - _mm_shuffle_ps (v, v, _MM_SHUFFLE (1, 0, 3, 2)) - -#define graphene_simd4f_shuffle_yzwx(v) \ - _mm_shuffle_ps (v, v, _MM_SHUFFLE (0, 3, 2, 1)) - -#define graphene_simd4f_zero_w(v) \ - _mm_movelh_ps (v, _mm_unpackhi_ps (v, _mm_setzero_ps ())) - -#define graphene_simd4f_zero_zw(v) \ - _mm_movelh_ps (v, _mm_setzero_ps ()) - -#define graphene_simd4f_merge_w(s,v) \ - _mm_movelh_ps (s, _mm_unpackhi_ps (s, _mm_set1_ps (v))) - -#define graphene_simd4f_merge_high(a,b) \ - _mm_movehl_ps (b, a) - -#define graphene_simd4f_merge_low(a,b) \ - _mm_movelh_ps (a, b) - typedef GRAPHENE_ALIGN16 union { unsigned int ui[4]; float f[4]; diff --git a/include/graphene-simd4x4f.h b/include/graphene-simd4x4f.h index 69d4e8f2..a02e3eeb 100644 --- a/include/graphene-simd4x4f.h +++ b/include/graphene-simd4x4f.h @@ -147,15 +147,8 @@ void graphene_simd4x4f_transpose_in_place (graphene_simd4x4f_t *s); #if defined(GRAPHENE_USE_SSE) -#ifdef __GNUC__ #define graphene_simd4x4f_transpose_in_place(s) \ - (__extension__ ({ \ - _MM_TRANSPOSE4_PS ((s)->x, (s)->y, (s)->z, (s)->w); \ - })) -#elif defined (_MSC_VER) -#define graphene_simd4x4f_transpose_in_place(s) \ - _MM_TRANSPOSE4_PS ((s)->x, (s)->y, (s)->z, (s)->w) -#endif + GRAPHENE_ONELINER(_MM_TRANSPOSE4_PS ((s)->x, (s)->y, (s)->z, (s)->w)) #elif defined(GRAPHENE_USE_INTRINSICS) From 4bcde643df1e20ad5d27d9094cb57d8d7d4291cd Mon Sep 17 00:00:00 2001 From: Chun-wei Fan Date: Mon, 9 May 2022 19:25:52 +0800 Subject: [PATCH 3/9] include/graphene-simd*.h: Reduce duplication in ARM NEON one-liner calls Use the newly-added macros to abstract one-liner intrinsic calls for GCC/CLang and Visual Studio for building the ARM NEON code, to reduce duplication. It's not totally exhausive, but should cover quite a number of items. --- include/graphene-simd4f.h | 173 +++++++++++--------------------------- 1 file changed, 51 insertions(+), 122 deletions(-) diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h index 7e43ae2f..832e0d7a 100644 --- a/include/graphene-simd4f.h +++ b/include/graphene-simd4f.h @@ -959,6 +959,57 @@ typedef union { /* NEON has optimised 2-lanes vectors we can use */ typedef float32x2_t graphene_simd2f_t; +# define graphene_simd4f_init_zero() \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, vdupq_n_f32 (0.f)) + +# define graphene_simd4f_dup_4f(s,v) \ + GRAPHENE_ONELINER (vst1q_f32 ((float32_t *) (v), (s))) + +# define graphene_simd4f_init_4f(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, vld1q_f32 ((const float32_t *) (v))) + +# define graphene_simd4f_dup_2f(s,v) \ + GRAPHENE_ONELINER (vst1_f32 ((float32_t *) (v), vget_low_f32 ((s)))) + +# define graphene_simd4f_get(s,i) \ + GRAPHENE_ONELINER_WITH_RTYPE (float,vgetq_lane_f32 ((s), (i))) + +# define graphene_simd4f_splat(v) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, vdupq_n_f32 ((v))) + +# define graphene_simd4f_splat_x(s) \ + GRAPHENE_ONELINER (graphene_simd4f_splat (graphene_simd4f_get_x ((s)))) + +# define graphene_simd4f_splat_y(s) \ + GRAPHENE_ONELINER (graphene_simd4f_splat (graphene_simd4f_get_y ((s)))) + +# define graphene_simd4f_splat_z(s) \ + GRAPHENE_ONELINER (graphene_simd4f_splat (graphene_simd4f_get_z ((s)))) + +# define graphene_simd4f_splat_w(s) \ + GRAPHENE_ONELINER (graphene_simd4f_splat (graphene_simd4f_get_w ((s)))) + +# define graphene_simd4f_add(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, vaddq_f32 ((a), (b))) + +# define graphene_simd4f_sub(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, vsubq_f32 ((a), (b))) + +# define graphene_simd4f_mul(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, vmulq_f32 ((a), (b))) + +# define graphene_simd4f_div(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, vmulq_f32 ((a), graphene_simd4f_reciprocal ((b)))) + +# define graphene_simd4f_dot3(a,b) \ + GRAPHENE_ONELINER (graphene_simd4f_splat (graphene_simd4f_dot3_scalar (a, b))) + +# define graphene_simd4f_min(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, vminq_f32 ((a), (b))) + +# define graphene_simd4f_max(a,b) \ + GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, vmaxq_f32 (a, b)) + #ifdef __GNUC__ # define graphene_simd4f_init(x,y,z,w) \ (__extension__ ({ \ @@ -966,17 +1017,6 @@ typedef float32x2_t graphene_simd2f_t; (graphene_simd4f_t) vld1q_f32 (__v); \ })) -# define graphene_simd4f_init_zero() \ - (__extension__ ({ \ - (graphene_simd4f_t) vdupq_n_f32 (0.f); \ - })) - -# define graphene_simd4f_init_4f(v) \ - (__extension__ ({ \ - const float32_t *__v32 = (const float32_t *) (v); \ - (graphene_simd4f_t) vld1q_f32 (__v32); \ - })) - # define graphene_simd4f_init_3f(v) \ (__extension__ ({ \ graphene_simd4f_init (v[0], v[1], v[2], 0.f); \ @@ -991,11 +1031,6 @@ typedef float32x2_t graphene_simd2f_t; (graphene_simd4f_t) vcombine_f32 (__low, __high); \ })) -# define graphene_simd4f_dup_4f(s,v) \ - (__extension__ ({ \ - vst1q_f32 ((float32_t *) (v), (s)); \ - })) - # define graphene_simd4f_dup_3f(s,v) \ (__extension__ ({ \ float *__v = (v); \ @@ -1004,42 +1039,6 @@ typedef float32x2_t graphene_simd2f_t; vst1q_lane_f32 (__v, (s), 2); \ })) -# define graphene_simd4f_dup_2f(s,v) \ - (__extension__ ({ \ - const graphene_simd2f_t __low = vget_low_f32 ((s)); \ - vst1_f32 ((float32_t *) (v), __low); \ - })) - -# define graphene_simd4f_get(s,i) \ - (__extension__ ({ \ - (float) vgetq_lane_f32 ((s), (i)); \ - })) - -# define graphene_simd4f_splat(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) vdupq_n_f32 ((v)); \ - })) - -# define graphene_simd4f_splat_x(s) \ - (__extension__ ({ \ - graphene_simd4f_splat (graphene_simd4f_get_x ((s))); \ - })) - -# define graphene_simd4f_splat_y(s) \ - (__extension__ ({ \ - graphene_simd4f_splat (graphene_simd4f_get_y ((s))); \ - })) - -# define graphene_simd4f_splat_z(s) \ - (__extension__ ({ \ - graphene_simd4f_splat (graphene_simd4f_get_z ((s))); \ - })) - -# define graphene_simd4f_splat_w(s) \ - (__extension__ ({ \ - graphene_simd4f_splat (graphene_simd4f_get_w ((s))); \ - })) - # define graphene_simd4f_reciprocal(s) \ (__extension__ ({ \ graphene_simd4f_t __est = vrecpeq_f32 ((s)); \ @@ -1047,27 +1046,6 @@ typedef float32x2_t graphene_simd2f_t; (graphene_simd4f_t) vmulq_f32 (vrecpsq_f32 (__est, (s)), __est); \ })) -# define graphene_simd4f_add(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) vaddq_f32 ((a), (b)); \ - })) - -# define graphene_simd4f_sub(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) vsubq_f32 ((a), (b)); \ - })) - -# define graphene_simd4f_mul(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) vmulq_f32 ((a), (b)); \ - })) - -# define graphene_simd4f_div(a,b) \ - (__extension__ ({ \ - graphene_simd4f_t __rec = graphene_simd4f_reciprocal ((b)); \ - (graphene_simd4f_t) vmulq_f32 ((a), __rec); \ - })) - # define _simd4f_rsqrt_iter(v,estimate) \ (__extension__ ({ \ const graphene_simd4f_t __est1 = vmulq_f32 ((estimate), (v)); \ @@ -1106,11 +1084,6 @@ typedef float32x2_t graphene_simd2f_t; (graphene_simd4f_t) vandq_s32 ((int32x4_t) __s3, __mask); \ })) -# define graphene_simd4f_dot3(a,b) \ - (__extension__ ({ \ - graphene_simd4f_splat (graphene_simd4f_dot3_scalar (a, b)); \ - })) - # define graphene_simd4f_dot3_scalar(a,b) \ (__extension__ ({ \ const graphene_simd4f_t __m = graphene_simd4f_mul (a, b); \ @@ -1118,16 +1091,6 @@ typedef float32x2_t graphene_simd2f_t; (float) vget_lane_f32 (vadd_f32 (__s1, vget_high_f32 (__m)), 0); \ })) -# define graphene_simd4f_min(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) vminq_f32 ((a), (b)); \ - })) - -# define graphene_simd4f_max(a,b) \ - (__extension__ ({ \ - (graphene_simd4f_t) vmaxq_f32 (a, b); \ - })) - # define graphene_simd4f_shuffle_wxyz(v) \ (__extension__ ({ \ graphene_simd4f_union_t __u = { (v) }; \ @@ -1278,10 +1241,6 @@ _simd4f_init (float x, float y, float z, float w) return vld1q_f32 (__v); } -# define graphene_simd4f_init_zero() vdupq_n_f32 (0.f) - -# define graphene_simd4f_init_4f(v) vld1q_f32 (v) - # define graphene_simd4f_init_3f(v) graphene_simd4f_init (v[0], v[1], v[2], 0.f) # define graphene_simd4f_init_2f(v) _simd4f_init_2f(v) @@ -1295,8 +1254,6 @@ _simd4f_init_2f (const float *v) return vcombine_f32 (__low, __high); } -# define graphene_simd4f_dup_4f(s,v) vst1q_f32 ((float32_t *) (v), (s)) - # define graphene_simd4f_dup_3f(s,v) _simd4f_dup_3f(s,v) static inline void _simd4f_dup_3f (const graphene_simd4f_t s, @@ -1308,20 +1265,6 @@ void _simd4f_dup_3f (const graphene_simd4f_t s, vst1q_lane_f32 (__v, (s), 2); } -# define graphene_simd4f_dup_2f(s,v) vst1_f32 (v, vget_low_f32 (s)) - -# define graphene_simd4f_get(s,i) vgetq_lane_f32 ((s), (i)) - -# define graphene_simd4f_splat(v) vdupq_n_f32 ((v)) - -# define graphene_simd4f_splat_x(s) graphene_simd4f_splat (graphene_simd4f_get_x ((s))) - -# define graphene_simd4f_splat_y(s) graphene_simd4f_splat (graphene_simd4f_get_y ((s))) - -# define graphene_simd4f_splat_z(s) graphene_simd4f_splat (graphene_simd4f_get_z ((s))) - -# define graphene_simd4f_splat_w(s) graphene_simd4f_splat (graphene_simd4f_get_w ((s))) - # define graphene_simd4f_reciprocal(s) _simd4f_reciprocal(s) static inline graphene_simd4f_t _simd4f_reciprocal (const graphene_simd4f_t s) @@ -1331,14 +1274,6 @@ _simd4f_reciprocal (const graphene_simd4f_t s) return vmulq_f32 (vrecpsq_f32 (__est, (s)), __est); } -# define graphene_simd4f_add(a,b) vaddq_f32 ((a), (b)) - -# define graphene_simd4f_sub(a,b) vsubq_f32 ((a), (b)) - -# define graphene_simd4f_mul(a,b) vmulq_f32 ((a), (b)) - -# define graphene_simd4f_div(a,b) vmulq_f32 (a, graphene_simd4f_reciprocal (b)) - static inline graphene_simd4f_t _simd4f_rsqrt_iter (const graphene_simd4f_t v, const graphene_simd4f_t estimate) @@ -1386,8 +1321,6 @@ _simd4f_cross3 (const graphene_simd4f_t a, return vandq_s32 (__s3, __mask); } -# define graphene_simd4f_dot3(a,b) graphene_simd4f_splat (graphene_simd4f_dot3_scalar (a, b)) - # define graphene_simd4f_dot3_scalar(a,b) _simd4f_dot3_scalar(a,b) static inline float _simd4f_dot3_scalar (const graphene_simd4f_t a, @@ -1398,10 +1331,6 @@ _simd4f_dot3_scalar (const graphene_simd4f_t a, return vget_lane_f32 (vadd_f32 (__s1, vget_high_f32 (__m)), 0); } -# define graphene_simd4f_min(a,b) vminq_f32 ((a), (b)) - -# define graphene_simd4f_max(a,b) vmaxq_f32 (a, b) - # define graphene_simd4f_shuffle_wxyz(v) _simd4f_shuffle_wxyz(v) static inline graphene_simd4f_t _simd4f_shuffle_wxyz (const graphene_simd4f_t v) From 5a027236805595aac697432ed281d44bc73d3bb4 Mon Sep 17 00:00:00 2001 From: Chun-wei Fan Date: Tue, 10 May 2022 11:13:51 +0800 Subject: [PATCH 4/9] graphene-macros.h: Add macro to initialize SIMD data arrays These macros can be used to abstract initializing SIMD data arrays for the different compilers that we support, especially as we already require C99 support for building and using Graphene. --- include/graphene-macros.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/graphene-macros.h b/include/graphene-macros.h index 33956a4c..09ea6a2c 100644 --- a/include/graphene-macros.h +++ b/include/graphene-macros.h @@ -115,9 +115,11 @@ typedef int bool; #if defined (__GNUC__) || defined (__clang__) #define GRAPHENE_ONELINER(expr) (__extension__ ({expr;})) #define GRAPHENE_ONELINER_WITH_RTYPE(rtype,expr) (__extension__ ({(rtype) expr;})) +#define GRAPHENE_ONELINER_4ARG_ARRAY_WITH_RTYPE(rtype,v0,v1,v2,v3) (__extension__ ({(rtype){v0, v1, v2, v3};})) #elif defined (_MSC_VER) #define GRAPHENE_ONELINER(expr) expr #define GRAPHENE_ONELINER_WITH_RTYPE(rtype,expr) GRAPHENE_ONELINER (expr) +#define GRAPHENE_ONELINER_4ARG_ARRAY_WITH_RTYPE(rtype,v0,v1,v2,v3) (rtype){v0, v1, v2, v3} #else # error Please define macros suitable for your compiler #endif From 4fb484b05e7208e478f60e5c7475520aecc0676e Mon Sep 17 00:00:00 2001 From: Chun-wei Fan Date: Tue, 10 May 2022 11:16:13 +0800 Subject: [PATCH 5/9] graphene-simd4f.h: Simplify initializing SSE SIMD data arrays Use the macros that we just added to initialize the graphene_simd4f_t arrays with the floats that we pass into graphene_simd4f_init*() as applicable. This especially simplifies the code for Visual Studio since we already require C99 support for building and using Graphene, and we can reduce some code duplication. --- include/graphene-simd4f.h | 40 +++++++++------------------------------ 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h index 832e0d7a..346b2aef 100644 --- a/include/graphene-simd4f.h +++ b/include/graphene-simd4f.h @@ -184,12 +184,21 @@ typedef union { float f[4]; } graphene_simd4f_union_t; +#define graphene_simd4f_init(x,y,z,w) \ + GRAPHENE_ONELINER_4ARG_ARRAY_WITH_RTYPE(graphene_simd4f_t, x, y, z, w) + # define graphene_simd4f_init_zero() \ GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_setzero_ps()) # define graphene_simd4f_init_4f(v) \ GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_loadu_ps (v)) +#define graphene_simd4f_init_3f(v) \ + GRAPHENE_ONELINER_4ARG_ARRAY_WITH_RTYPE(graphene_simd4f_t, v[0], v[1], v[2], 0.f) + +#define graphene_simd4f_init_2f(v) \ + GRAPHENE_ONELINER_4ARG_ARRAY_WITH_RTYPE(graphene_simd4f_t, v[0], v[1], 0.f, 0.f) + # define graphene_simd4f_dup_4f(s,v) \ GRAPHENE_ONELINER (_mm_storeu_ps ((v), (s))) @@ -268,22 +277,6 @@ typedef union { # if defined(__GNUC__) /* Use GCC statement __extension__ to inline all these functions */ - -# define graphene_simd4f_init(x,y,z,w) \ - (__extension__ ({ \ - (graphene_simd4f_t) { (x), (y), (z), (w) }; \ - })) - -# define graphene_simd4f_init_3f(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) { (v)[0], (v)[1], (v)[2], 0.f }; \ - })) - -# define graphene_simd4f_init_2f(v) \ - (__extension__ ({ \ - (graphene_simd4f_t) { (v)[0], (v)[1], 0.f, 0.f }; \ - })) - # define graphene_simd4f_get(s,i) \ (__extension__ ({ \ graphene_simd4f_union_t __u = { (s) }; \ @@ -424,21 +417,6 @@ typedef GRAPHENE_ALIGN16 union { /* Use static inline to inline all these functions */ -#define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w) - -static inline graphene_simd4f_t -_simd4f_init (float x, float y, float z, float w) -{ - graphene_simd4f_t __s = { x, y, z, w }; - return __s; -} - -#define graphene_simd4f_init_3f(v) \ - graphene_simd4f_init (v[0], v[1], v[2], 0.f) - -#define graphene_simd4f_init_2f(v) \ - graphene_simd4f_init (v[0], v[1], 0.f, 0.f) - #define graphene_simd4f_get(s,i) _simd4f_get_xyzw(s, i) #define graphene_simd4f_get_x(s) _simd4f_get_xyzw(s, 0) #define graphene_simd4f_get_y(s) _simd4f_get_xyzw(s, 1) From b0442e1b4750651c1f056cc93f8993f102f31b6c Mon Sep 17 00:00:00 2001 From: Chun-wei Fan Date: Wed, 11 May 2022 10:28:39 +0800 Subject: [PATCH 6/9] graphene-simd4f.h: Typedef graphene_simd4f_uif_t only once We don't need identical typedefs separate for GCC/CLang and Visual Studio. Put them in one place for all cases. --- include/graphene-simd4f.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h index 346b2aef..11a0ec98 100644 --- a/include/graphene-simd4f.h +++ b/include/graphene-simd4f.h @@ -184,6 +184,11 @@ typedef union { float f[4]; } graphene_simd4f_union_t; +typedef GRAPHENE_ALIGN16 union { + unsigned int ui[4]; + float f[4]; +} graphene_simd4f_uif_t; + #define graphene_simd4f_init(x,y,z,w) \ GRAPHENE_ONELINER_4ARG_ARRAY_WITH_RTYPE(graphene_simd4f_t, x, y, z, w) @@ -338,11 +343,6 @@ typedef union { __res; \ })) -typedef GRAPHENE_ALIGN16 union { - unsigned int ui[4]; - float f[4]; -} graphene_simd4f_uif_t; - # define graphene_simd4f_flip_sign_0101(v) \ (__extension__ ({ \ const graphene_simd4f_uif_t __pnpn = { { \ @@ -509,11 +509,6 @@ _simd4f_dot3_scalar (const graphene_simd4f_t a, return __res; } -typedef GRAPHENE_ALIGN16 union { - unsigned int ui[4]; - float f[4]; -} graphene_simd4f_uif_t; - #define graphene_simd4f_flip_sign_0101(v) _simd4f_flip_sign_0101(v) static inline graphene_simd4f_t From 87ed5e7bed93f6c61a9c53498ea119b875b92b64 Mon Sep 17 00:00:00 2001 From: Chun-wei Fan Date: Wed, 11 May 2022 10:37:06 +0800 Subject: [PATCH 7/9] graphene-simd4f.h: Consolidate graphene_simd4f_get_*() for SSE We can define them instead to call the respective graphene_simd4f_get() accordingly instead. --- include/graphene-simd4f.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h index 11a0ec98..1bd8b4f9 100644 --- a/include/graphene-simd4f.h +++ b/include/graphene-simd4f.h @@ -213,6 +213,12 @@ typedef GRAPHENE_ALIGN16 union { # define graphene_simd4f_dup_2f(s,v) \ GRAPHENE_ONELINER (memcpy ((v), &(s), sizeof (float) * 2)) + +# define graphene_simd4f_get_x(s) graphene_simd4f_get (s, 0) +# define graphene_simd4f_get_y(s) graphene_simd4f_get (s, 1) +# define graphene_simd4f_get_z(s) graphene_simd4f_get (s, 2) +# define graphene_simd4f_get_w(s) graphene_simd4f_get (s, 3) + # define graphene_simd4f_splat(v) \ GRAPHENE_ONELINER_WITH_RTYPE (graphene_simd4f_t, _mm_set1_ps ((v))) @@ -288,11 +294,6 @@ typedef GRAPHENE_ALIGN16 union { (float) __u.f[(i)]; \ })) -# define graphene_simd4f_get_x(s) graphene_simd4f_get (s, 0) -# define graphene_simd4f_get_y(s) graphene_simd4f_get (s, 1) -# define graphene_simd4f_get_z(s) graphene_simd4f_get (s, 2) -# define graphene_simd4f_get_w(s) graphene_simd4f_get (s, 3) - # define graphene_simd4f_reciprocal(v) \ (__extension__ ({ \ const graphene_simd4f_t __zero = graphene_simd4f_init (0.0f, 0.0f, 0.0f, 0.0f); \ @@ -418,10 +419,6 @@ typedef GRAPHENE_ALIGN16 union { /* Use static inline to inline all these functions */ #define graphene_simd4f_get(s,i) _simd4f_get_xyzw(s, i) -#define graphene_simd4f_get_x(s) _simd4f_get_xyzw(s, 0) -#define graphene_simd4f_get_y(s) _simd4f_get_xyzw(s, 1) -#define graphene_simd4f_get_z(s) _simd4f_get_xyzw(s, 2) -#define graphene_simd4f_get_w(s) _simd4f_get_xyzw(s, 3) static inline float _simd4f_get_xyzw (graphene_simd4f_t s, int mode) From 14e30be71f2c5a254498ab5b8ca24fc4e5c9bf25 Mon Sep 17 00:00:00 2001 From: Chun-wei Fan Date: Wed, 11 May 2022 10:44:30 +0800 Subject: [PATCH 8/9] graphene-simd4f.h: Prefix MSVC implementation of SSE SIMD calls ...with graphene_msvc_ instead of just _. This attempts to make things clearer to people. --- include/graphene-simd4f.h | 77 +++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h index 1bd8b4f9..9f69dc17 100644 --- a/include/graphene-simd4f.h +++ b/include/graphene-simd4f.h @@ -418,10 +418,10 @@ typedef GRAPHENE_ALIGN16 union { /* Use static inline to inline all these functions */ -#define graphene_simd4f_get(s,i) _simd4f_get_xyzw(s, i) +#define graphene_simd4f_get(s,i) graphene_msvc_simd4f_get_xyzw(s, i) static inline float -_simd4f_get_xyzw (graphene_simd4f_t s, int mode) +graphene_msvc_simd4f_get_xyzw (graphene_simd4f_t s, int mode) { /* mode: get_x=0 get_y=1 @@ -433,10 +433,10 @@ _simd4f_get_xyzw (graphene_simd4f_t s, int mode) return u.f[mode]; } -#define graphene_simd4f_reciprocal(v) _simd4f_reciprocal(v) +#define graphene_simd4f_reciprocal(v) graphene_msvc_simd4f_reciprocal(v) static inline graphene_simd4f_t -_simd4f_reciprocal(const graphene_simd4f_t v) +graphene_msvc_simd4f_reciprocal(const graphene_simd4f_t v) { const graphene_simd4f_t __zero = graphene_simd4f_init (0.0f, 0.0f, 0.0f, 0.0f); const graphene_simd4f_t __two = graphene_simd4f_init (2.0f, 2.0f, 2.0f, 2.0f); @@ -447,10 +447,10 @@ _simd4f_reciprocal(const graphene_simd4f_t v) return graphene_simd4f_mul (__s, graphene_simd4f_sub (__two, __m)); } -#define graphene_simd4f_rsqrt(v) _simd4f_rsqrt(v) +#define graphene_simd4f_rsqrt(v) graphene_msvc_simd4f_rsqrt(v) static inline graphene_simd4f_t -_simd4f_rsqrt(const graphene_simd4f_t v) +graphene_msvc_simd4f_rsqrt(const graphene_simd4f_t v) { const graphene_simd4f_t __half = graphene_simd4f_init (0.5f, 0.5f, 0.5f, 0.5f); const graphene_simd4f_t __three = graphene_simd4f_init (3.0f, 3.0f, 3.0f, 3.0f); @@ -460,12 +460,11 @@ _simd4f_rsqrt(const graphene_simd4f_t v) graphene_simd4f_mul (__s, graphene_simd4f_mul (v, __s)))); } -#define graphene_simd4f_cross3(a,b) \ - _simd4f_cross3(a,b) +#define graphene_simd4f_cross3(a,b) graphene_msvc_simd4f_cross3(a,b) static inline graphene_simd4f_t -_simd4f_cross3 (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cross3 (const graphene_simd4f_t a, + const graphene_simd4f_t b) { const graphene_simd4f_t __a_yzx = _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 0, 2, 1)); const graphene_simd4f_t __a_zxy = _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 1, 0, 2)); @@ -476,12 +475,11 @@ _simd4f_cross3 (const graphene_simd4f_t a, } #if !defined(GRAPHENE_USE_SSE4_1) -#define graphene_simd4f_dot3(a,b) \ - _simd4f_dot3(a,b) +#define graphene_simd4f_dot3(a,b) graphene_msvc_simd4f_dot3(a,b) static inline graphene_simd4f_t -_simd4f_dot3 (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_dot3 (const graphene_simd4f_t a, + const graphene_simd4f_t b) { GRAPHENE_ALIGN16 const unsigned int __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; const graphene_simd4f_t __mask = _mm_load_ps ((const float *) __mask_bits); @@ -494,22 +492,21 @@ _simd4f_dot3 (const graphene_simd4f_t a, } #endif -#define graphene_simd4f_dot3_scalar(a,b) \ - _simd4f_dot3_scalar(a,b) +#define graphene_simd4f_dot3_scalar(a,b) graphene_msvc_simd4f_dot3_scalar(a,b) static inline float -_simd4f_dot3_scalar (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_dot3_scalar (const graphene_simd4f_t a, + const graphene_simd4f_t b) { float __res; _mm_store_ss (&__res, graphene_simd4f_dot3 (a, b)); return __res; } -#define graphene_simd4f_flip_sign_0101(v) _simd4f_flip_sign_0101(v) +#define graphene_simd4f_flip_sign_0101(v) graphene_msvc_simd4f_flip_sign_0101(v) static inline graphene_simd4f_t -_simd4f_flip_sign_0101 (const graphene_simd4f_t v) +graphene_msvc_simd4f_flip_sign_0101 (const graphene_simd4f_t v) { const graphene_simd4f_uif_t __pnpn = { { 0x00000000, @@ -521,10 +518,10 @@ _simd4f_flip_sign_0101 (const graphene_simd4f_t v) return _mm_xor_ps (v, _mm_load_ps (__pnpn.f)); } -#define graphene_simd4f_flip_sign_1010(v) _simd4f_flip_sign_1010(v) +#define graphene_simd4f_flip_sign_1010(v) graphene_msvc_simd4f_flip_sign_1010(v) static inline graphene_simd4f_t -_simd4f_flip_sign_1010(const graphene_simd4f_t v) +graphene_msvc_simd4f_flip_sign_1010(const graphene_simd4f_t v) { const graphene_simd4f_uif_t __npnp = { { 0x80000000, @@ -536,70 +533,70 @@ _simd4f_flip_sign_1010(const graphene_simd4f_t v) return _mm_xor_ps (v, _mm_load_ps (__npnp.f)); } -#define graphene_simd4f_cmp_eq(a,b) _simd4f_cmp_eq(a,b) +#define graphene_simd4f_cmp_eq(a,b) graphene_msvc_simd4f_cmp_eq(a,b) static inline bool -_simd4f_cmp_eq (const graphene_simd4f_t a, +graphene_msvc_simd4f_cmp_eq (const graphene_simd4f_t a, const graphene_simd4f_t b) { __m128i __res = _mm_castps_si128 (_mm_cmpneq_ps (a, b)); return (_mm_movemask_epi8 (__res) == 0); } -#define graphene_simd4f_cmp_neq(a,b) _simd4f_cmp_neq(a,b) +#define graphene_simd4f_cmp_neq(a,b) graphene_msvc_simd4f_cmp_neq(a,b) static inline bool -_simd4f_cmp_neq (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cmp_neq (const graphene_simd4f_t a, + const graphene_simd4f_t b) { __m128i __res = _mm_castps_si128 (_mm_cmpneq_ps (a, b)); return (_mm_movemask_epi8 (__res) != 0); } -#define graphene_simd4f_cmp_lt(a,b) _simd4f_cmp_lt(a,b) +#define graphene_simd4f_cmp_lt(a,b) graphene_msvc_simd4f_cmp_lt(a,b) static inline bool -_simd4f_cmp_lt (const graphene_simd4f_t a, +graphene_msvc_simd4f_cmp_lt (const graphene_simd4f_t a, const graphene_simd4f_t b) { __m128i __res = _mm_castps_si128 (_mm_cmplt_ps (a, b)); return (_mm_movemask_epi8 (__res) == 0xffff); } -#define graphene_simd4f_cmp_le(a,b) _simd4f_cmp_le(a,b) +#define graphene_simd4f_cmp_le(a,b) graphene_msvc_simd4f_cmp_le(a,b) static inline bool -_simd4f_cmp_le (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cmp_le (const graphene_simd4f_t a, + const graphene_simd4f_t b) { __m128i __res = _mm_castps_si128 (_mm_cmple_ps (a, b)); return (_mm_movemask_epi8 (__res) == 0xffff); } -#define graphene_simd4f_cmp_ge(a,b) _simd4f_cmp_ge(a,b) +#define graphene_simd4f_cmp_ge(a,b) graphene_msvc_simd4f_cmp_ge(a,b) static inline bool -_simd4f_cmp_ge (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cmp_ge (const graphene_simd4f_t a, + const graphene_simd4f_t b) { __m128i __res = _mm_castps_si128 (_mm_cmpge_ps (a, b)); return (_mm_movemask_epi8 (__res) == 0xffff); } -#define graphene_simd4f_cmp_gt(a,b) _simd4f_cmp_gt(a,b) +#define graphene_simd4f_cmp_gt(a,b) graphene_msvc_simd4f_cmp_gt(a,b) static inline bool -_simd4f_cmp_gt (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cmp_gt (const graphene_simd4f_t a, + const graphene_simd4f_t b) { __m128i __res = _mm_castps_si128 (_mm_cmpgt_ps (a, b)); return (_mm_movemask_epi8 (__res) == 0xffff); } -#define graphene_simd4f_neg(s) _simd4f_neg(s) +#define graphene_simd4f_neg(s) graphene_msvc_simd4f_neg(s) static inline graphene_simd4f_t -_simd4f_neg (const graphene_simd4f_t s) +graphene_msvc_simd4f_neg (const graphene_simd4f_t s) { const graphene_simd4f_uif_t __mask = { { 0x80000000, From 1a7f1e38e83f8747405db943237a8f7197c3fa83 Mon Sep 17 00:00:00 2001 From: Chun-wei Fan Date: Wed, 11 May 2022 10:55:48 +0800 Subject: [PATCH 9/9] graphene-simd4f.h: Prefix MSVC implementation of ARM NEON SIMD calls ...with graphene_msvc_ instead of just _. This attempts to make things clearer to people. --- include/graphene-simd4f.h | 132 +++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h index 9f69dc17..f3257c89 100644 --- a/include/graphene-simd4f.h +++ b/include/graphene-simd4f.h @@ -1200,9 +1200,9 @@ typedef float32x2_t graphene_simd2f_t; #elif defined _MSC_VER /* Visual Studio ARM */ -# define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w) +# define graphene_simd4f_init(x,y,z,w) graphene_msvc_simd4f_init(x,y,z,w) static inline graphene_simd4f_t -_simd4f_init (float x, float y, float z, float w) +graphene_msvc_simd4f_init (float x, float y, float z, float w) { const float32_t __v[4] = { (x), (y), (z), (w) }; return vld1q_f32 (__v); @@ -1210,9 +1210,9 @@ _simd4f_init (float x, float y, float z, float w) # define graphene_simd4f_init_3f(v) graphene_simd4f_init (v[0], v[1], v[2], 0.f) -# define graphene_simd4f_init_2f(v) _simd4f_init_2f(v) +# define graphene_simd4f_init_2f(v) graphene_msvc_simd4f_init_2f(v) static inline graphene_simd4f_t -_simd4f_init_2f (const float *v) +graphene_msvc_simd4f_init_2f (const float *v) { const float32_t *__v32 = (const float32_t *) (v); const graphene_simd2f_t __low = vld1_f32 (__v32); @@ -1221,10 +1221,10 @@ _simd4f_init_2f (const float *v) return vcombine_f32 (__low, __high); } -# define graphene_simd4f_dup_3f(s,v) _simd4f_dup_3f(s,v) -static inline -void _simd4f_dup_3f (const graphene_simd4f_t s, - float *v) +# define graphene_simd4f_dup_3f(s,v) graphene_msvc_simd4f_dup_3f(s,v) +static inline void +graphene_msvc_simd4f_dup_3f (const graphene_simd4f_t s, + float *v) { float *__v = (v); vst1q_lane_f32 (__v++, (s), 0); @@ -1232,9 +1232,9 @@ void _simd4f_dup_3f (const graphene_simd4f_t s, vst1q_lane_f32 (__v, (s), 2); } -# define graphene_simd4f_reciprocal(s) _simd4f_reciprocal(s) +# define graphene_simd4f_reciprocal(s) graphene_msvc_simd4f_reciprocal(s) static inline graphene_simd4f_t -_simd4f_reciprocal (const graphene_simd4f_t s) +graphene_msvc_simd4f_reciprocal (const graphene_simd4f_t s) { graphene_simd4f_t __est = vrecpeq_f32 ((s)); __est = vmulq_f32 (vrecpsq_f32 (__est, (s)), __est); @@ -1242,21 +1242,21 @@ _simd4f_reciprocal (const graphene_simd4f_t s) } static inline graphene_simd4f_t -_simd4f_rsqrt_iter (const graphene_simd4f_t v, - const graphene_simd4f_t estimate) +graphene_msvc_simd4f_rsqrt_iter (const graphene_simd4f_t v, + const graphene_simd4f_t estimate) { const graphene_simd4f_t __est1 = vmulq_f32 ((estimate), (v)); return vmulq_f32 ((estimate), vrsqrtsq_f32 (__est1, (estimate))); } -# define graphene_simd4f_rsqrt(s) _simd4f_rsqrt(s) +# define graphene_simd4f_rsqrt(s) graphene_msvc_simd4f_rsqrt(s) static inline graphene_simd4f_t -_simd4f_rsqrt (const graphene_simd4f_t s) +graphene_msvc_simd4f_rsqrt (const graphene_simd4f_t s) { graphene_simd4f_t __estimate = vrsqrteq_f32 ((s)); - __estimate = _simd4f_rsqrt_iter ((s), __estimate); - __estimate = _simd4f_rsqrt_iter ((s), __estimate); - return _simd4f_rsqrt_iter ((s), __estimate); + __estimate = graphene_msvc_simd4f_rsqrt_iter ((s), __estimate); + __estimate = graphene_msvc_simd4f_rsqrt_iter ((s), __estimate); + return graphene_msvc_simd4f_rsqrt_iter ((s), __estimate); } # define graphene_simd4f_sqrt(s) _simd4f_sqrt(s) @@ -1269,10 +1269,10 @@ _simd4f_sqrt (const graphene_simd4f_t s) return vreinterpretq_f32_u32 (vandq_u32 (vtstq_u32 (__tmp, __tmp), vreinterpretq_u32_f32 (__rrsq))); } -# define graphene_simd4f_cross3(a,b) _simd4f_cross3(a,b) +# define graphene_simd4f_cross3(a,b) graphene_msvc_simd4f_cross3(a,b) static inline graphene_simd4f_t -_simd4f_cross3 (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cross3 (const graphene_simd4f_t a, + const graphene_simd4f_t b) { const uint32_t __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; const int32x4_t __mask = vld1q_s32 ((const int32_t *) __mask_bits); @@ -1288,79 +1288,79 @@ _simd4f_cross3 (const graphene_simd4f_t a, return vandq_s32 (__s3, __mask); } -# define graphene_simd4f_dot3_scalar(a,b) _simd4f_dot3_scalar(a,b) +# define graphene_simd4f_dot3_scalar(a,b) graphene_msvc_simd4f_dot3_scalar(a,b) static inline float -_simd4f_dot3_scalar (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_dot3_scalar (const graphene_simd4f_t a, + const graphene_simd4f_t b) { const graphene_simd4f_t __m = graphene_simd4f_mul (a, b); const graphene_simd2f_t __s1 = vpadd_f32 (vget_low_f32 (__m), vget_low_f32 (__m)); return vget_lane_f32 (vadd_f32 (__s1, vget_high_f32 (__m)), 0); } -# define graphene_simd4f_shuffle_wxyz(v) _simd4f_shuffle_wxyz(v) +# define graphene_simd4f_shuffle_wxyz(v) graphene_msvc_simd4f_shuffle_wxyz(v) static inline graphene_simd4f_t -_simd4f_shuffle_wxyz (const graphene_simd4f_t v) +graphene_msvc_simd4f_shuffle_wxyz (const graphene_simd4f_t v) { graphene_simd4f_union_t __u = { (v) }; return graphene_simd4f_init (__u.f[3], __u.f[0], __u.f[1], __u.f[2]); } -# define graphene_simd4f_shuffle_zwxy(v) _simd4f_shuffle_zwxy(v) +# define graphene_simd4f_shuffle_zwxy(v) graphene_msvc_simd4f_shuffle_zwxy(v) static inline graphene_simd4f_t -_simd4f_shuffle_zwxy (const graphene_simd4f_t v) +graphene_msvc_simd4f_shuffle_zwxy (const graphene_simd4f_t v) { graphene_simd4f_union_t __u = { (v) }; return graphene_simd4f_init (__u.f[2], __u.f[3], __u.f[0], __u.f[1]); } -# define graphene_simd4f_shuffle_yzwx(v) _simd4f_shuffle_yzwx(v) +# define graphene_simd4f_shuffle_yzwx(v) graphene_msvc_simd4f_shuffle_yzwx(v) static inline graphene_simd4f_t -_simd4f_shuffle_yzwx (const graphene_simd4f_t v) +graphene_msvc_simd4f_shuffle_yzwx (const graphene_simd4f_t v) { graphene_simd4f_union_t __u = { (v) }; return graphene_simd4f_init (__u.f[1], __u.f[2], __u.f[3], __u.f[0]); } -# define graphene_simd4f_zero_w(v) _simd4f_zero_w(v) +# define graphene_simd4f_zero_w(v) graphene_msvc_simd4f_zero_w(v) static inline graphene_simd4f_t -_simd4f_zero_w (const graphene_simd4f_t v) +graphene_msvc_simd4f_zero_w (const graphene_simd4f_t v) { graphene_simd4f_union_t __u = { (v) }; return graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], 0.f); } -# define graphene_simd4f_zero_zw(v) _simd4f_zero_zw(v) +# define graphene_simd4f_zero_zw(v) graphene_msvc_simd4f_zero_zw(v) static inline graphene_simd4f_t -_simd4f_zero_zw (const graphene_simd4f_t v) +graphene_msvc_simd4f_zero_zw (const graphene_simd4f_t v) { graphene_simd4f_union_t __u = { (v) }; return graphene_simd4f_init (__u.f[0], __u.f[1], 0.f, 0.f); } -# define graphene_simd4f_merge_w(s,v) _simd4f_merge_w(s,v) +# define graphene_simd4f_merge_w(s,v) graphene_msvc_simd4f_merge_w(s,v) static inline graphene_simd4f_t -_simd4f_merge_w (const graphene_simd4f_t s, - float v) +graphene_msvc_simd4f_merge_w (const graphene_simd4f_t s, + float v) { graphene_simd4f_union_t __u = { (s) }; return graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], (v)); } -# define graphene_simd4f_merge_high(a,b) _simd4f_merge_high(a,b) +# define graphene_simd4f_merge_high(a,b) graphene_msvc_simd4f_merge_high(a,b) static inline graphene_simd4f_t -_simd4f_merge_high (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_merge_high (const graphene_simd4f_t a, + const graphene_simd4f_t b) { graphene_simd4f_union_t __u_a = { (a) }; graphene_simd4f_union_t __u_b = { (b) }; return graphene_simd4f_init (__u_a.f[2], __u_a.f[3], __u_b.f[2], __u_b.f[3]); } -# define graphene_simd4f_merge_low(a,b) _simd4f_merge_low(a,b) +# define graphene_simd4f_merge_low(a,b) graphene_msvc_simd4f_merge_low(a,b) static inline graphene_simd4f_t -_simd4f_merge_low (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_merge_low (const graphene_simd4f_t a, + const graphene_simd4f_t b) { graphene_simd4f_union_t __u_a = { (a) }; graphene_simd4f_union_t __u_b = { (b) }; @@ -1368,9 +1368,9 @@ _simd4f_merge_low (const graphene_simd4f_t a, } -# define graphene_simd4f_flip_sign_0101(s) _simd4f_flip_sign_0101(s) +# define graphene_simd4f_flip_sign_0101(s) graphene_msvc_simd4f_flip_sign_0101(s) static inline graphene_simd4f_t -_simd4f_flip_sign_0101 (const graphene_simd4f_t s) +graphene_msvc_simd4f_flip_sign_0101 (const graphene_simd4f_t s) { const unsigned int __upnpn[4] = { 0x00000000, @@ -1382,9 +1382,9 @@ _simd4f_flip_sign_0101 (const graphene_simd4f_t s) return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __pnpn)); } -# define graphene_simd4f_flip_sign_1010(s) _simd4f_flip_sign_1010(s) +# define graphene_simd4f_flip_sign_1010(s) graphene_msvc_simd4f_flip_sign_1010(s) static inline graphene_simd4f_t -_simd4f_flip_sign_1010 (const graphene_simd4f_t s) +graphene_msvc_simd4f_flip_sign_1010 (const graphene_simd4f_t s) { const unsigned int __unpnp[4] = { 0x80000000, @@ -1397,10 +1397,10 @@ _simd4f_flip_sign_1010 (const graphene_simd4f_t s) return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __npnp)); } -# define graphene_simd4f_cmp_eq(a,b) _simd4f_cmp_eq(a,b) +# define graphene_simd4f_cmp_eq(a,b) graphene_msvc_simd4f_cmp_eq(a,b) static inline bool -_simd4f_cmp_eq (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cmp_eq (const graphene_simd4f_t a, + const graphene_simd4f_t b) { const uint32x4_t __mask = vceqq_f32 ((a), (b)); return (vgetq_lane_u32 (__mask, 0) != 0 && @@ -1409,10 +1409,10 @@ _simd4f_cmp_eq (const graphene_simd4f_t a, vgetq_lane_u32 (__mask, 3) != 0); } -# define graphene_simd4f_cmp_neq(a,b) _simd4f_cmp_neq(a,b) +# define graphene_simd4f_cmp_neq(a,b) graphene_msvc_simd4f_cmp_neq(a,b) static inline bool -_simd4f_cmp_neq (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cmp_neq (const graphene_simd4f_t a, + const graphene_simd4f_t b) { const uint32x4_t __mask = vceqq_f32 ((a), (b)); return (vgetq_lane_u32 (__mask, 0) == 0 || @@ -1421,10 +1421,10 @@ _simd4f_cmp_neq (const graphene_simd4f_t a, vgetq_lane_u32 (__mask, 3) == 0); } -# define graphene_simd4f_cmp_lt(a,b) _simd4f_cmp_lt(a,b) +# define graphene_simd4f_cmp_lt(a,b) graphene_msvc_simd4f_cmp_lt(a,b) static inline bool -_simd4f_cmp_lt (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cmp_lt (const graphene_simd4f_t a, + const graphene_simd4f_t b) { const uint32x4_t __mask = vcltq_f32 ((a), (b)); return (vgetq_lane_u32 (__mask, 0) != 0 && @@ -1433,10 +1433,10 @@ _simd4f_cmp_lt (const graphene_simd4f_t a, vgetq_lane_u32 (__mask, 3) != 0); } -# define graphene_simd4f_cmp_le(a,b) _simd4f_cmp_le(a,b) +# define graphene_simd4f_cmp_le(a,b) graphene_msvc_simd4f_cmp_le(a,b) static inline bool -_simd4f_cmp_le (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cmp_le (const graphene_simd4f_t a, + const graphene_simd4f_t b) { const uint32x4_t __mask = vcleq_f32 ((a), (b)); return (vgetq_lane_u32 (__mask, 0) != 0 && @@ -1445,10 +1445,10 @@ _simd4f_cmp_le (const graphene_simd4f_t a, vgetq_lane_u32 (__mask, 3) != 0); } -# define graphene_simd4f_cmp_ge(a,b) _simd4f_cmp_ge(a,b) +# define graphene_simd4f_cmp_ge(a,b) graphene_msvc_simd4f_cmp_ge(a,b) static inline bool -_simd4f_cmp_ge (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cmp_ge (const graphene_simd4f_t a, + const graphene_simd4f_t b) { const uint32x4_t __mask = vcgeq_f32 ((a), (b)); return (vgetq_lane_u32 (__mask, 0) != 0 && @@ -1457,10 +1457,10 @@ _simd4f_cmp_ge (const graphene_simd4f_t a, vgetq_lane_u32 (__mask, 3) != 0); } -# define graphene_simd4f_cmp_gt(a,b) _simd4f_cmp_gt(a,b) +# define graphene_simd4f_cmp_gt(a,b) graphene_msvc_simd4f_cmp_gt(a,b) static inline bool -_simd4f_cmp_gt (const graphene_simd4f_t a, - const graphene_simd4f_t b) +graphene_msvc_simd4f_cmp_gt (const graphene_simd4f_t a, + const graphene_simd4f_t b) { const uint32x4_t __mask = vcgtq_f32 ((a), (b)); return (vgetq_lane_u32 (__mask, 0) != 0 && @@ -1469,9 +1469,9 @@ _simd4f_cmp_gt (const graphene_simd4f_t a, vgetq_lane_u32 (__mask, 3) != 0); } -# define graphene_simd4f_neg(s) _simd4f_neg(s) +# define graphene_simd4f_neg(s) graphene_msvc_simd4f_neg(s) static inline graphene_simd4f_t -_simd4f_neg (const graphene_simd4f_t s) +graphene_msvc_simd4f_neg (const graphene_simd4f_t s) { const unsigned int __umask[4] = { 0x80000000,