From 663b06be078a4580e5c7e4ed52de55cbc119e3ea Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Wed, 13 Oct 2021 07:04:32 +0100 Subject: [PATCH] Enable compiling arm/neon with MSVC for windows on arm64 --- include/xsimd/arch/xsimd_neon.hpp | 498 +++++++++---------- include/xsimd/arch/xsimd_neon64.hpp | 40 +- include/xsimd/arch/xsimd_neon_dispatcher.hpp | 119 +++++ include/xsimd/config/xsimd_config.hpp | 3 + include/xsimd/types/xsimd_batch.hpp | 8 +- include/xsimd/types/xsimd_neon_register.hpp | 31 ++ 6 files changed, 417 insertions(+), 282 deletions(-) create mode 100644 include/xsimd/arch/xsimd_neon_dispatcher.hpp diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index e3b02018a..5e56c414a 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -19,52 +19,53 @@ #include "../types/xsimd_neon_register.hpp" #include "../types/xsimd_utils.hpp" +#include "xsimd_neon_dispatcher.hpp" // Wrap intrinsics so we can pass them as function pointers // - OP: intrinsics name prefix, e.g., vorrq // - RT: type traits to deduce intrinsics return types -#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ - namespace wrap { \ - inline RT OP##_u8 (uint8x16_t a, uint8x16_t b) { return ::OP##_u8 (a, b); } \ - inline RT OP##_s8 (int8x16_t a, int8x16_t b) { return ::OP##_s8 (a, b); } \ - inline RT OP##_u16(uint16x8_t a, uint16x8_t b) { return ::OP##_u16(a, b); } \ - inline RT OP##_s16(int16x8_t a, int16x8_t b) { return ::OP##_s16(a, b); } \ - inline RT OP##_u32(uint32x4_t a, uint32x4_t b) { return ::OP##_u32(a, b); } \ - inline RT OP##_s32(int32x4_t a, int32x4_t b) { return ::OP##_s32(a, b); } \ +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap { \ + inline RT _##OP##_u8 (uint8x16_t a, uint8x16_t b) { return ::OP##_u8 (a, b); } \ + inline RT _##OP##_s8 (int8x16_t a, int8x16_t b) { return ::OP##_s8 (a, b); } \ + inline RT _##OP##_u16(uint16x8_t a, uint16x8_t b) { return ::OP##_u16(a, b); } \ + inline RT _##OP##_s16(int16x8_t a, int16x8_t b) { return ::OP##_s16(a, b); } \ + inline RT _##OP##_u32(uint32x4_t a, uint32x4_t b) { return ::OP##_u32(a, b); } \ + inline RT _##OP##_s32(int32x4_t a, int32x4_t b) { return ::OP##_s32(a, b); } \ } -#define WRAP_BINARY_INT(OP, RT) \ - WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ - namespace wrap { \ - inline RT OP##_u64(uint64x2_t a, uint64x2_t b) { return ::OP##_u64(a, b); } \ - inline RT OP##_s64(int64x2_t a, int64x2_t b) { return ::OP##_s64(a, b); } \ +#define WRAP_BINARY_INT(OP, RT) \ + WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap { \ + inline RT _##OP##_u64(uint64x2_t a, uint64x2_t b) { return ::OP##_u64(a, b); } \ + inline RT _##OP##_s64(int64x2_t a, int64x2_t b) { return ::OP##_s64(a, b); } \ } -#define WRAP_BINARY_FLOAT(OP, RT) \ - namespace wrap { \ - inline RT OP##_f32(float32x4_t a, float32x4_t b) { return ::OP##_f32(a, b); } \ +#define WRAP_BINARY_FLOAT(OP, RT) \ + namespace wrap { \ + inline RT _##OP##_f32(float32x4_t a, float32x4_t b) { return ::OP##_f32(a, b); } \ } -#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ - namespace wrap { \ - inline uint8x16_t OP##_u8 (uint8x16_t a) { return ::OP##_u8 (a); } \ - inline int8x16_t OP##_s8 (int8x16_t a) { return ::OP##_s8 (a); } \ - inline uint16x8_t OP##_u16(uint16x8_t a) { return ::OP##_u16(a); } \ - inline int16x8_t OP##_s16(int16x8_t a) { return ::OP##_s16(a); } \ - inline uint32x4_t OP##_u32(uint32x4_t a) { return ::OP##_u32(a); } \ - inline int32x4_t OP##_s32(int32x4_t a) { return ::OP##_s32(a); } \ +#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ + 
namespace wrap { \ + inline uint8x16_t _##OP##_u8 (uint8x16_t a) { return ::OP##_u8 (a); } \ + inline int8x16_t _##OP##_s8 (int8x16_t a) { return ::OP##_s8 (a); } \ + inline uint16x8_t _##OP##_u16(uint16x8_t a) { return ::OP##_u16(a); } \ + inline int16x8_t _##OP##_s16(int16x8_t a) { return ::OP##_s16(a); } \ + inline uint32x4_t _##OP##_u32(uint32x4_t a) { return ::OP##_u32(a); } \ + inline int32x4_t _##OP##_s32(int32x4_t a) { return ::OP##_s32(a); } \ } -#define WRAP_UNARY_INT(OP) \ - WRAP_UNARY_INT_EXCLUDING_64(OP) \ - namespace wrap { \ - inline uint64x2_t OP##_u64(uint64x2_t a) { return ::OP##_u64(a); } \ - inline int64x2_t OP##_s64(int64x2_t a) { return ::OP##_s64(a); } \ +#define WRAP_UNARY_INT(OP) \ + WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap { \ + inline uint64x2_t _##OP##_u64(uint64x2_t a) { return ::OP##_u64(a); } \ + inline int64x2_t _##OP##_s64(int64x2_t a) { return ::OP##_s64(a); } \ } -#define WRAP_UNARY_FLOAT(OP) \ - namespace wrap { \ - inline float32x4_t OP##_f32(float32x4_t a) { return ::OP##_f32(a); } \ +#define WRAP_UNARY_FLOAT(OP) \ + namespace wrap { \ + inline float32x4_t _##OP##_f32(float32x4_t a) { return ::OP##_f32(a); } \ } // Dummy identity caster to ease coding @@ -155,6 +156,8 @@ namespace xsimd using type = uint8x16_t; }; +// MSVC uses same underlying type for all vector variants which would cause C++ function overload ambiguity +#if !defined(_WIN32) || (defined(__clang__)) template <> struct comp_return_type_impl { @@ -202,6 +205,7 @@ namespace xsimd { using type = uint32x4_t; }; +#endif template using comp_return_type = typename comp_return_type_impl::type; @@ -314,7 +318,7 @@ namespace xsimd template = 0> batch set(batch const&, requires_arch, Args... args) { - return xsimd::types::detail::neon_vector_type{args...}; + return INITIALIZER_LIST_TO_NEON_VECTOR(xsimd::types::detail::neon_vector_type, {args...}); } template = 0> @@ -322,7 +326,7 @@ namespace xsimd { using register_type = typename batch_bool::register_type; using unsigned_type = as_unsigned_integer_t; - return register_type{static_cast(args ? -1LL : 0LL)...}; + return INITIALIZER_LIST_TO_NEON_VECTOR(register_type , {static_cast(args ? -1LL : 0LL)...}); } template @@ -336,7 +340,7 @@ namespace xsimd { using register_type = typename batch_bool::register_type; using unsigned_type = as_unsigned_integer_t; - return register_type{static_cast(args ? -1LL : 0LL)...}; + return INITIALIZER_LIST_TO_NEON_VECTOR(register_type, {static_cast(args ? 
-1LL : 0LL)...}); } /************* @@ -346,13 +350,13 @@ namespace xsimd template = 0> batch from_bool(batch_bool const& arg, requires_arch) { - return vandq_u8(arg, vdupq_n_u8(1)); + return vandq_u8(arg.data, vdupq_n_u8(1)); } template = 0> batch from_bool(batch_bool const& arg, requires_arch) { - return vandq_s8(reinterpret_cast(arg.data), vdupq_n_s8(1)); + return vandq_s8(REINTERPRET_CAST(int8x16_t, arg.data), vdupq_n_s8(1)); } template = 0> @@ -364,7 +368,7 @@ namespace xsimd template = 0> batch from_bool(batch_bool const& arg, requires_arch) { - return vandq_s16(reinterpret_cast(arg.data), vdupq_n_s16(1)); + return vandq_s16(REINTERPRET_CAST(int16x8_t, arg.data), vdupq_n_s16(1)); } template = 0> @@ -376,7 +380,7 @@ namespace xsimd template = 0> batch from_bool(batch_bool const& arg, requires_arch) { - return vandq_s32(reinterpret_cast(arg.data), vdupq_n_s32(1)); + return vandq_s32(REINTERPRET_CAST(int32x4_t, arg.data), vdupq_n_s32(1)); } template = 0> @@ -388,7 +392,7 @@ namespace xsimd template = 0> batch from_bool(batch_bool const& arg, requires_arch) { - return vandq_s64(reinterpret_cast(arg.data), vdupq_n_s64(1)); + return vandq_s64(REINTERPRET_CAST(int64x2_t, arg.data), vdupq_n_s64(1)); } template @@ -630,13 +634,11 @@ namespace xsimd batch add(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16, - wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64, - wrap::vaddq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vaddq_u8, wrap::_vaddq_s8, wrap::_vaddq_u16, wrap::_vaddq_s16, + wrap::_vaddq_u32, wrap::_vaddq_s32, wrap::_vaddq_u64, wrap::_vaddq_s64, + wrap::_vaddq_f32, T, register_type(lhs), register_type(rhs), result); + return result; } /******** @@ -649,13 +651,11 @@ namespace xsimd batch sadd(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16, - wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64, - wrap::vaddq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vqaddq_u8, wrap::_vqaddq_s8, wrap::_vqaddq_u16, wrap::_vqaddq_s16, + wrap::_vqaddq_u32, wrap::_vqaddq_s32, wrap::_vqaddq_u64, wrap::_vqaddq_s64, + wrap::_vaddq_f32, T, register_type(lhs), register_type(rhs), result); + return result; } /******* @@ -669,13 +669,11 @@ namespace xsimd batch sub(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16, - wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64, - wrap::vsubq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vsubq_u8, wrap::_vsubq_s8, wrap::_vsubq_u16, wrap::_vsubq_s16, + wrap::_vsubq_u32, wrap::_vsubq_s32, wrap::_vsubq_u64, wrap::_vsubq_s64, + wrap::_vsubq_f32, T, register_type(lhs), register_type(rhs), result); + return result; } /******** @@ -688,13 +686,11 @@ namespace xsimd batch ssub(batch const& 
lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16, - wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64, - wrap::vsubq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vqsubq_u8, wrap::_vqsubq_s8, wrap::_vqsubq_u16, wrap::_vqsubq_s16, + wrap::_vqsubq_u32, wrap::_vqsubq_s32, wrap::_vqsubq_u64, wrap::_vqsubq_s64, + wrap::_vsubq_f32, T, register_type(lhs), register_type(rhs), result); + return result; } @@ -709,12 +705,11 @@ namespace xsimd batch mul(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16, - wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vmulq_u8, wrap::_vmulq_s8, wrap::_vmulq_u16, wrap::_vmulq_s16, + wrap::_vmulq_u32, wrap::_vmulq_s32, wrap::_vmulq_f32, T, + register_type(lhs), register_type(rhs), result); + return result; } /******* @@ -763,24 +758,27 @@ namespace xsimd batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16, - wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vceqq_u8, wrap::_vceqq_s8, wrap::_vceqq_u16, wrap::_vceqq_s16, + wrap::_vceqq_u32, wrap::_vceqq_s32, wrap::_vceqq_f32, T, + register_type(lhs), register_type(rhs), result); + return result; } template = 0> batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) { using register_type = typename batch_bool::register_type; - using dispatcher_type = detail::neon_comp_dispatcher_impl::binary; - const dispatcher_type dispatcher = - { - std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + switch(sizeof(T)){ + case 1: + return wrap::_vceqq_u8(register_type(lhs), register_type(rhs)); + case 2: + return wrap::_vceqq_u16(register_type(lhs), register_type(rhs)); + case 4: + return wrap::_vceqq_u32(register_type(lhs), register_type(rhs)); + default: + assert(false && "invalid size"); return {}; + } } template = 0> @@ -806,12 +804,11 @@ namespace xsimd batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16, - wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vcltq_u8, wrap::_vcltq_s8, wrap::_vcltq_u16, wrap::_vcltq_s16, + wrap::_vcltq_u32, wrap::_vcltq_s32, wrap::_vcltq_f32, + T, register_type(lhs), register_type(rhs), result); + return result; } template = 0> @@ -831,12 +828,11 @@ 
11 @@ namespace xsimd
        batch_bool le(batch const& lhs, batch const& rhs, requires_arch)
        {
            using register_type = typename batch::register_type;
-           const detail::excluding_int64_comp_dispatcher::binary dispatcher =
-           {
-               std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16,
-                               wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32)
-           };
-           return dispatcher.apply(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vcleq_u8, wrap::_vcleq_s8, wrap::_vcleq_u16, wrap::_vcleq_s16,
+                                             wrap::_vcleq_u32, wrap::_vcleq_s32, wrap::_vcleq_f32,
+                                             T, register_type(lhs), register_type(rhs), result);
+           return result;
        }

        template = 0>
@@ -856,12 +852,11 @@ namespace xsimd
        batch_bool gt(batch const& lhs, batch const& rhs, requires_arch)
        {
            using register_type = typename batch::register_type;
-           const detail::excluding_int64_comp_dispatcher::binary dispatcher =
-           {
-               std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16,
-                               wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32)
-           };
-           return dispatcher.apply(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vcgtq_u8, wrap::_vcgtq_s8, wrap::_vcgtq_u16, wrap::_vcgtq_s16,
+                                             wrap::_vcgtq_u32, wrap::_vcgtq_s32, wrap::_vcgtq_f32,
+                                             T, register_type(lhs), register_type(rhs), result);
+           return result;
        }

        template = 0>
@@ -878,15 +873,14 @@ namespace xsimd
    WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type)

        template = 0>
        batch_bool ge(batch const& lhs, batch const& rhs, requires_arch)
        {
            using register_type = typename batch::register_type;
-           const detail::excluding_int64_comp_dispatcher::binary dispatcher =
-           {
-               std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16,
-                               wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32)
-           };
-           return dispatcher.apply(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vcgeq_u8, wrap::_vcgeq_s8, wrap::_vcgeq_u16, wrap::_vcgeq_s16,
+                                             wrap::_vcgeq_u32, wrap::_vcgeq_s32, wrap::_vcgeq_f32, T,
+                                             register_type(lhs), register_type(rhs), result);
+           return result;
        }

        template = 0>
@@ -908,32 +902,28 @@ namespace xsimd
                return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
                                                       vreinterpretq_u32_f32(rhs)));
            }
-
-           template
-           V bitwise_and_neon(V const& lhs, V const& rhs)
-           {
-               const neon_dispatcher::binary dispatcher =
-               {
-                   std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
-                                   wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
-                                   bitwise_and_f32)
-               };
-               return dispatcher.apply(lhs, rhs);
-           }
        }

        template = 0>
        batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch)
        {
            using register_type = typename batch::register_type;
-           return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY(wrap::_vandq_u8, wrap::_vandq_s8, wrap::_vandq_u16, wrap::_vandq_s16,
+                                  wrap::_vandq_u32, wrap::_vandq_s32, wrap::_vandq_u64, wrap::_vandq_s64,
+                                  detail::bitwise_and_f32, T, register_type(lhs), register_type(rhs), result);
+           return result;
        }

        template = 0>
        batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch)
        {
            using register_type = typename batch_bool::register_type;
-           return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY(wrap::_vandq_u8, wrap::_vandq_s8, wrap::_vandq_u16, wrap::_vandq_s16,
+                                  wrap::_vandq_u32, wrap::_vandq_s32, wrap::_vandq_u64, wrap::_vandq_s64,
+                                  detail::bitwise_and_f32, T, register_type(lhs), register_type(rhs), result);
+           return result;
        }
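The rewrites above replace xsimd's overload-based dispatch (a tuple of wrap:: overloads selected by argument type) with the NEON_DISPATCHER_* macros keyed on the scalar type T. The reason is the one stated in the comp_return_type guard near the top of the patch: MSVC's arm64 NEON header defines every 128-bit vector type as the same __n128 union, so overloads on those types collapse into redefinitions. A minimal self-contained sketch of that collapse follows; all names here are illustrative stand-ins, not MSVC's or xsimd's real symbols.

#include <type_traits>

// Model of the MSVC arm64 situation: one union behind every "vector type".
union fake_n128 { unsigned char u8[16]; signed char s8[16]; };
typedef fake_n128 fake_uint8x16_t; // both names alias the same type,
typedef fake_n128 fake_int8x16_t;  // just as uint8x16_t/int8x16_t do on MSVC

static_assert(std::is_same<fake_uint8x16_t, fake_int8x16_t>::value,
              "the aliases are one type, so f(fake_uint8x16_t) and "
              "f(fake_int8x16_t) would declare the same overload twice");

Dispatching on the scalar type T sidesteps the problem entirely, because T stays distinct (uint8_t vs. int8_t) even when the register types do not.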

        /**************
@@ -949,32 +939,28 @@ namespace xsimd
                return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
                                                       vreinterpretq_u32_f32(rhs)));
            }
-
-           template
-           V bitwise_or_neon(V const& lhs, V const& rhs)
-           {
-               const neon_dispatcher::binary dispatcher =
-               {
-                   std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
-                                   wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64,
-                                   bitwise_or_f32)
-               };
-               return dispatcher.apply(lhs, rhs);
-           }
        }

        template = 0>
        batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch)
        {
            using register_type = typename batch::register_type;
-           return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY(wrap::_vorrq_u8, wrap::_vorrq_s8, wrap::_vorrq_u16, wrap::_vorrq_s16,
+                                  wrap::_vorrq_u32, wrap::_vorrq_s32, wrap::_vorrq_u64, wrap::_vorrq_s64,
+                                  detail::bitwise_or_f32, T, register_type(lhs), register_type(rhs), result);
+           return result;
        }

        template = 0>
        batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch)
        {
            using register_type = typename batch_bool::register_type;
-           return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY(wrap::_vorrq_u8, wrap::_vorrq_s8, wrap::_vorrq_u16, wrap::_vorrq_s16,
+                                  wrap::_vorrq_u32, wrap::_vorrq_s32, wrap::_vorrq_u64, wrap::_vorrq_s64,
+                                  detail::bitwise_or_f32, T, register_type(lhs), register_type(rhs), result);
+           return result;
        }

        /***************
@@ -991,31 +977,38 @@ namespace xsimd
                                                       vreinterpretq_u32_f32(rhs)));
            }

-           template
-           V bitwise_xor_neon(V const& lhs, V const& rhs)
-           {
-               const neon_dispatcher::binary dispatcher =
-               {
-                   std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
-                                   wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64,
-                                   bitwise_xor_f32)
-               };
-               return dispatcher.apply(lhs, rhs);
+           template
+           batch bitwise_xor_neon(batch const& lhs, batch const& rhs)
+           {
+               using register_type = typename batch::register_type;
+               register_type result;
+               NEON_DISPATCHER_BINARY(wrap::_veorq_u8, wrap::_veorq_s8, wrap::_veorq_u16, wrap::_veorq_s16,
+                                      wrap::_veorq_u32, wrap::_veorq_s32, wrap::_veorq_u64, wrap::_veorq_s64,
+                                      detail::bitwise_xor_f32, T, register_type(lhs), register_type(rhs), result);
+               return result;
            }
        }

        template = 0>
        batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch)
        {
            using register_type = typename batch::register_type;
-           return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY(wrap::_veorq_u8, wrap::_veorq_s8, wrap::_veorq_u16, wrap::_veorq_s16,
+                                  wrap::_veorq_u32, wrap::_veorq_s32, wrap::_veorq_u64, wrap::_veorq_s64,
+                                  detail::bitwise_xor_f32, T, register_type(lhs), register_type(rhs), result);
+           return result;
        }

        template = 0>
        batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch)
        {
            using register_type = typename batch_bool::register_type;
-           return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY(wrap::_veorq_u8, wrap::_veorq_s8, wrap::_veorq_u16, wrap::_veorq_s16,
+                                  wrap::_veorq_u32, wrap::_veorq_s32, wrap::_veorq_u64, wrap::_veorq_s64,
+                                  detail::bitwise_xor_f32, T, register_type(lhs), register_type(rhs), result);
+           return result;
        }
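A side note on the bitwise_not_u64 and bitwise_not_s64 slots passed to NEON_DISPATCHER_UNARY in the next hunk: NEON has no 64-bit vmvnq intrinsic, so xsimd's existing helpers route the complement through 32-bit lanes. A sketch of that pattern, matching the shape of the helpers already defined in xsimd_neon.hpp:

#include <arm_neon.h>

// Bitwise NOT does not care about lane width, so reinterpreting to 32-bit
// lanes, inverting, and reinterpreting back is free at runtime.
inline uint64x2_t bitwise_not_u64(uint64x2_t arg)
{
    return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
}

inline int64x2_t bitwise_not_s64(int64x2_t arg)
{
    return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
}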

        /*******
@@ -1051,17 +1044,16 @@ namespace xsimd
                return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
            }

-           template
-           V bitwise_not_neon(V const& arg)
-           {
-               const neon_dispatcher::unary dispatcher =
-               {
-                   std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
-                                   wrap::vmvnq_u32, wrap::vmvnq_s32,
-                                   bitwise_not_u64, bitwise_not_s64,
-                                   bitwise_not_f32)
-               };
-               return dispatcher.apply(arg);
+           template
+           batch bitwise_not_neon(batch const& arg)
+           {
+               using register_type = typename batch::register_type;
+               register_type result;
+               NEON_DISPATCHER_UNARY(wrap::_vmvnq_u8, wrap::_vmvnq_s8, wrap::_vmvnq_u16, wrap::_vmvnq_s16,
+                                     wrap::_vmvnq_u32, wrap::_vmvnq_s32,
+                                     bitwise_not_u64, bitwise_not_s64,
+                                     bitwise_not_f32, T, register_type(arg), result);
+               return result;
            }
        }

@@ -1069,14 +1061,24 @@ namespace xsimd
        batch bitwise_not(batch const& arg, requires_arch)
        {
            using register_type = typename batch::register_type;
-           return detail::bitwise_not_neon(register_type(arg));
+           register_type result;
+           NEON_DISPATCHER_UNARY(wrap::_vmvnq_u8, wrap::_vmvnq_s8, wrap::_vmvnq_u16, wrap::_vmvnq_s16,
+                                 wrap::_vmvnq_u32, wrap::_vmvnq_s32,
+                                 detail::bitwise_not_u64, detail::bitwise_not_s64,
+                                 detail::bitwise_not_f32, T, register_type(arg), result);
+           return result;
        }

        template = 0>
        batch_bool bitwise_not(batch_bool const& arg, requires_arch)
        {
            using register_type = typename batch_bool::register_type;
-           return detail::bitwise_not_neon(register_type(arg));
+           register_type result;
+           NEON_DISPATCHER_UNARY(wrap::_vmvnq_u8, wrap::_vmvnq_s8, wrap::_vmvnq_u16, wrap::_vmvnq_s16,
+                                 wrap::_vmvnq_u32, wrap::_vmvnq_s32,
+                                 detail::bitwise_not_u64, detail::bitwise_not_s64,
+                                 detail::bitwise_not_f32, T, register_type(arg), result);
+           return result;
        }

        /******************
@@ -1091,32 +1093,28 @@ namespace xsimd
            {
                return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
            }
-
-           template
-           V bitwise_andnot_neon(V const& lhs, V const& rhs)
-           {
-               const detail::neon_dispatcher::binary dispatcher =
-               {
-                   std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
-                                   wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64,
-                                   bitwise_andnot_f32)
-               };
-               return dispatcher.apply(lhs, rhs);
-           }
        }

        template = 0>
        batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch)
        {
            using register_type = typename batch::register_type;
-           return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY(wrap::_vbicq_u8, wrap::_vbicq_s8, wrap::_vbicq_u16, wrap::_vbicq_s16,
+                                  wrap::_vbicq_u32, wrap::_vbicq_s32, wrap::_vbicq_u64, wrap::_vbicq_s64,
+                                  detail::bitwise_andnot_f32, T, register_type(lhs), register_type(rhs), result);
+           return result;
        }

        template = 0>
        batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch)
        {
            using register_type = typename batch_bool::register_type;
-           return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
+           register_type result;
+           NEON_DISPATCHER_BINARY(wrap::_vbicq_u8, wrap::_vbicq_s8, wrap::_vbicq_u16, wrap::_vbicq_s16,
+                                  wrap::_vbicq_u32, wrap::_vbicq_s32, wrap::_vbicq_u64, wrap::_vbicq_s64,
+                                  detail::bitwise_andnot_f32, T, register_type(lhs), register_type(rhs), result);
+           return result;
        }

        /*******
@@ -1130,12 +1128,11 @@ namespace xsimd
        batch min(batch const& lhs, batch const& rhs, requires_arch)
        {
            using register_type
= typename batch::register_type; - const detail::excluding_int64_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16, - wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vminq_u8, wrap::_vminq_s8, wrap::_vminq_u16, wrap::_vminq_s16, + wrap::_vminq_u32, wrap::_vminq_s32, wrap::_vminq_f32, T, + register_type(lhs), register_type(rhs), result); + return result; } template = 0> @@ -1155,12 +1152,11 @@ namespace xsimd batch max(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16, - wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vmaxq_u8, wrap::_vmaxq_s8, wrap::_vmaxq_u16, wrap::_vmaxq_s16, + wrap::_vmaxq_u32, wrap::_vmaxq_s32, wrap::_vmaxq_f32, T, + register_type(lhs), register_type(rhs), result); + return result; } template = 0> @@ -1174,9 +1170,9 @@ namespace xsimd *******/ namespace wrap { - inline int8x16_t vabsq_s8 (int8x16_t a) { return ::vabsq_s8 (a); } - inline int16x8_t vabsq_s16(int16x8_t a) { return ::vabsq_s16(a); } - inline int32x4_t vabsq_s32(int32x4_t a) { return ::vabsq_s32(a); } + inline int8x16_t _vabsq_s8 (int8x16_t a) { return vabsq_s8 (a); } + inline int16x8_t _vabsq_s16(int16x8_t a) { return vabsq_s16(a); } + inline int32x4_t _vabsq_s32(int32x4_t a) { return vabsq_s32(a); } } WRAP_UNARY_FLOAT(vabsq) @@ -1202,12 +1198,11 @@ namespace xsimd batch abs(batch const& arg, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::unary dispatcher = - { - std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16, - detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32) - }; - return dispatcher.apply(register_type(arg)); + register_type result; + NEON_DISPATCHER_UNARY_EXCLUDE_64(detail::abs_u8, wrap::_vabsq_s8, detail::abs_u16, wrap::_vabsq_s16, + detail::abs_u32, wrap::_vabsq_s32, wrap::_vabsq_f32, T, register_type(arg), + result); + return result; } /******** @@ -1349,15 +1344,15 @@ namespace xsimd **********/ namespace wrap { - inline uint8x16_t vbslq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { return ::vbslq_u8 (a, b, c); } - inline int8x16_t vbslq_s8 (uint8x16_t a, int8x16_t b, int8x16_t c) { return ::vbslq_s8 (a, b, c); } - inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return ::vbslq_u16(a, b, c); } - inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) { return ::vbslq_s16(a, b, c); } - inline uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return ::vbslq_u32(a, b, c); } - inline int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) { return ::vbslq_s32(a, b, c); } - inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return ::vbslq_u64(a, b, c); } - inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) { return ::vbslq_s64(a, b, c); } - inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) { return ::vbslq_f32(a, b, c); } + inline uint8x16_t _vbslq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { return ::vbslq_u8 (a, b, c); } + inline 
int8x16_t _vbslq_s8 (uint8x16_t a, int8x16_t b, int8x16_t c) { return ::vbslq_s8 (a, b, c); } + inline uint16x8_t _vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return ::vbslq_u16(a, b, c); } + inline int16x8_t _vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) { return ::vbslq_s16(a, b, c); } + inline uint32x4_t _vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return ::vbslq_u32(a, b, c); } + inline int32x4_t _vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) { return ::vbslq_s32(a, b, c); } + inline uint64x2_t _vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return ::vbslq_u64(a, b, c); } + inline int64x2_t _vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) { return ::vbslq_s64(a, b, c); } + inline float32x4_t _vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) { return ::vbslq_f32(a, b, c); } } namespace detail @@ -1389,13 +1384,12 @@ namespace xsimd { using bool_register_type = typename batch_bool::register_type; using register_type = typename batch::register_type; - const detail::neon_select_dispatcher dispatcher = - { - std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16, - wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64, - wrap::vbslq_f32) - }; - return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b)); + register_type result; + NEON_DISPATCHER_SELECT(wrap::_vbslq_u8, wrap::_vbslq_s8, wrap::_vbslq_u16, wrap::_vbslq_s16, + wrap::_vbslq_u32, wrap::_vbslq_s32, wrap::_vbslq_u64, wrap::_vbslq_s64, + wrap::_vbslq_f32, T, bool_register_type(cond), register_type(a), + register_type(b), result); + return result; } template = 0> @@ -2130,15 +2124,15 @@ namespace xsimd #define WRAP_CAST(SUFFIX, TYPE) \ namespace wrap { \ - inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) { return ::vreinterpretq_##SUFFIX##_u8 (a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) { return ::vreinterpretq_##SUFFIX##_s8 (a); } \ - inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) { return ::vreinterpretq_##SUFFIX##_u16(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) { return ::vreinterpretq_##SUFFIX##_s16(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) { return ::vreinterpretq_##SUFFIX##_u32(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) { return ::vreinterpretq_##SUFFIX##_s32(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) { return ::vreinterpretq_##SUFFIX##_u64(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) { return ::vreinterpretq_##SUFFIX##_s64(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) { return ::vreinterpretq_##SUFFIX##_f32(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_u8(uint8x16_t a) { return vreinterpretq_##SUFFIX##_u8 (a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_s8(int8x16_t a) { return vreinterpretq_##SUFFIX##_s8 (a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_u16(uint16x8_t a) { return vreinterpretq_##SUFFIX##_u16(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_s16(int16x8_t a) { return vreinterpretq_##SUFFIX##_s16(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_u32(uint32x4_t a) { return vreinterpretq_##SUFFIX##_u32(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_s32(int32x4_t a) { return vreinterpretq_##SUFFIX##_s32(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_u64(uint64x2_t a) { return vreinterpretq_##SUFFIX##_u64(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_s64(int64x2_t a) { return vreinterpretq_##SUFFIX##_s64(a); } \ + inline TYPE 
_vreinterpretq_##SUFFIX##_f32(float32x4_t a) { return vreinterpretq_##SUFFIX##_f32(a); } \ } WRAP_CAST(u8, uint8x16_t) @@ -2212,33 +2206,33 @@ namespace xsimd { const detail::neon_bitwise_caster caster = { std::make_tuple( - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16, - wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64, - wrap::vreinterpretq_u8_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16, - wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64, - wrap::vreinterpretq_s8_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16, - wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64, - wrap::vreinterpretq_u16_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16, - wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64, - wrap::vreinterpretq_s16_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16, - wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64, - wrap::vreinterpretq_u32_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16, - wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64, - wrap::vreinterpretq_s32_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16, - wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64, - wrap::vreinterpretq_u64_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16, - wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64, - wrap::vreinterpretq_s64_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16, - wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64, - wrap::vreinterpretq_f32_f32)) + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_u8_u8, wrap::_vreinterpretq_u8_s8, wrap::_vreinterpretq_u8_u16, wrap::_vreinterpretq_u8_s16, + wrap::_vreinterpretq_u8_u32, wrap::_vreinterpretq_u8_s32, wrap::_vreinterpretq_u8_u64, wrap::_vreinterpretq_u8_s64, + wrap::_vreinterpretq_u8_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_s8_u8, wrap::_vreinterpretq_s8_s8, wrap::_vreinterpretq_s8_u16, wrap::_vreinterpretq_s8_s16, + wrap::_vreinterpretq_s8_u32, wrap::_vreinterpretq_s8_s32, wrap::_vreinterpretq_s8_u64, wrap::_vreinterpretq_s8_s64, + wrap::_vreinterpretq_s8_f32), + 
detail::make_bitwise_caster_impl(wrap::_vreinterpretq_u16_u8, wrap::_vreinterpretq_u16_s8, wrap::_vreinterpretq_u16_u16, wrap::_vreinterpretq_u16_s16, + wrap::_vreinterpretq_u16_u32, wrap::_vreinterpretq_u16_s32, wrap::_vreinterpretq_u16_u64, wrap::_vreinterpretq_u16_s64, + wrap::_vreinterpretq_u16_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_s16_u8, wrap::_vreinterpretq_s16_s8, wrap::_vreinterpretq_s16_u16, wrap::_vreinterpretq_s16_s16, + wrap::_vreinterpretq_s16_u32, wrap::_vreinterpretq_s16_s32, wrap::_vreinterpretq_s16_u64, wrap::_vreinterpretq_s16_s64, + wrap::_vreinterpretq_s16_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_u32_u8, wrap::_vreinterpretq_u32_s8, wrap::_vreinterpretq_u32_u16, wrap::_vreinterpretq_u32_s16, + wrap::_vreinterpretq_u32_u32, wrap::_vreinterpretq_u32_s32, wrap::_vreinterpretq_u32_u64, wrap::_vreinterpretq_u32_s64, + wrap::_vreinterpretq_u32_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_s32_u8, wrap::_vreinterpretq_s32_s8, wrap::_vreinterpretq_s32_u16, wrap::_vreinterpretq_s32_s16, + wrap::_vreinterpretq_s32_u32, wrap::_vreinterpretq_s32_s32, wrap::_vreinterpretq_s32_u64, wrap::_vreinterpretq_s32_s64, + wrap::_vreinterpretq_s32_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_u64_u8, wrap::_vreinterpretq_u64_s8, wrap::_vreinterpretq_u64_u16, wrap::_vreinterpretq_u64_s16, + wrap::_vreinterpretq_u64_u32, wrap::_vreinterpretq_u64_s32, wrap::_vreinterpretq_u64_u64, wrap::_vreinterpretq_u64_s64, + wrap::_vreinterpretq_u64_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_s64_u8, wrap::_vreinterpretq_s64_s8, wrap::_vreinterpretq_s64_u16, wrap::_vreinterpretq_s64_s16, + wrap::_vreinterpretq_s64_u32, wrap::_vreinterpretq_s64_s32, wrap::_vreinterpretq_s64_u64, wrap::_vreinterpretq_s64_s64, + wrap::_vreinterpretq_s64_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_f32_u8, wrap::_vreinterpretq_f32_s8, wrap::_vreinterpretq_f32_u16, wrap::_vreinterpretq_f32_s16, + wrap::_vreinterpretq_f32_u32, wrap::_vreinterpretq_f32_s32, wrap::_vreinterpretq_f32_u64, wrap::_vreinterpretq_f32_s64, + wrap::_vreinterpretq_f32_f32)) }; using src_register_type = typename batch::register_type; using dst_register_type = typename batch::register_type; diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp index f63254adf..a958c9c29 100644 --- a/include/xsimd/arch/xsimd_neon64.hpp +++ b/include/xsimd/arch/xsimd_neon64.hpp @@ -18,6 +18,7 @@ #include "../types/xsimd_neon64_register.hpp" #include "../types/xsimd_utils.hpp" +#include "xsimd_neon_dispatcher.hpp" namespace xsimd { @@ -779,8 +780,8 @@ namespace xsimd #define WRAP_CAST(SUFFIX, TYPE) \ namespace wrap { \ - inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) { return ::vreinterpretq_f64_##SUFFIX(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) { return ::vreinterpretq_##SUFFIX##_f64(a); } \ + inline float64x2_t _vreinterpretq_f64_##SUFFIX(TYPE a) { return vreinterpretq_f64_##SUFFIX(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_f64(float64x2_t a) { return vreinterpretq_##SUFFIX##_f64(a); } \ } WRAP_CAST(u8, uint8x16_t) @@ -798,19 +799,13 @@ namespace xsimd template batch bitwise_cast(batch const& arg, batch const&, requires_arch) { - using caster_type = detail::bitwise_caster_impl; - const caster_type caster = { - std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16, - wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, 
wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64, - wrap::vreinterpretq_f64_f32) - }; using register_type = typename batch::register_type; - return caster.apply(register_type(arg)); + register_type result; + NEON_DISPATCHER_UNARY(wrap::_vreinterpretq_f64_u8, wrap::_vreinterpretq_f64_s8, wrap::_vreinterpretq_f64_u16, + wrap::_vreinterpretq_f64_s16, wrap::_vreinterpretq_f64_u32, wrap::_vreinterpretq_f64_s32, + wrap::_vreinterpretq_f64_u64, wrap::_vreinterpretq_f64_s64, wrap::_vreinterpretq_f64_f32, + T, register_type(arg), result); + return result; } namespace detail @@ -834,20 +829,13 @@ namespace xsimd template batch bitwise_cast(batch const& arg, batch const&, requires_arch) { - using caster_type = detail::bitwise_caster_neon64; - const caster_type caster = { - std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64, - wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64, - wrap::vreinterpretq_f32_f64) - }; using src_register_type = typename batch::register_type; using dst_register_type = typename batch::register_type; - return caster.apply(src_register_type(arg)); + src_register_type result; + NEON_DISPATCHER_UNARY(wrap::_vreinterpretq_u8_f64, wrap::_vreinterpretq_s8_f64, wrap::_vreinterpretq_u16_f64, wrap::_vreinterpretq_s16_f64, + wrap::_vreinterpretq_u32_f64, wrap::_vreinterpretq_s32_f64, wrap::_vreinterpretq_u64_f64, wrap::_vreinterpretq_s64_f64, + wrap::_vreinterpretq_f32_f64, R, src_register_type(arg), result); + return dst_register_type(result); } template diff --git a/include/xsimd/arch/xsimd_neon_dispatcher.hpp b/include/xsimd/arch/xsimd_neon_dispatcher.hpp new file mode 100644 index 000000000..cd9cf4e55 --- /dev/null +++ b/include/xsimd/arch/xsimd_neon_dispatcher.hpp @@ -0,0 +1,119 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
*
+****************************************************************************/
+
+// A few macros to select the NEON intrinsic function matching the scalar type
+#define NEON_DISPATCHER_BINARY(U8, S8, U16, S16, U32, S32, U64, S64, F32, type, arg1, arg2, result)\
+    if (std::is_same<type, uint8_t>::value) {\
+        result = U8(arg1, arg2);\
+    } else if(std::is_same<type, int8_t>::value) {\
+        result = S8(arg1, arg2);\
+    } else if(std::is_same<type, uint16_t>::value) {\
+        result = U16(arg1, arg2);\
+    } else if(std::is_same<type, int16_t>::value) {\
+        result = S16(arg1, arg2);\
+    } else if(std::is_same<type, uint32_t>::value) {\
+        result = U32(arg1, arg2);\
+    } else if(std::is_same<type, int32_t>::value) {\
+        result = S32(arg1, arg2);\
+    } else if(std::is_same<type, uint64_t>::value) {\
+        result = U64(arg1, arg2);\
+    } else if(std::is_same<type, int64_t>::value) {\
+        result = S64(arg1, arg2);\
+    } else if(std::is_same<type, float>::value) {\
+        result = F32(arg1, arg2);\
+    } else {\
+        assert(false && "unsupported type");\
+    }
+
+#define NEON_DISPATCHER_BINARY_EXCLUDE_64(U8, S8, U16, S16, U32, S32, F32, type, arg1, arg2, result)\
+    if (std::is_same<type, uint8_t>::value) {\
+        result = U8(arg1, arg2);\
+    } else if(std::is_same<type, int8_t>::value) {\
+        result = S8(arg1, arg2);\
+    } else if(std::is_same<type, uint16_t>::value) {\
+        result = U16(arg1, arg2);\
+    } else if(std::is_same<type, int16_t>::value) {\
+        result = S16(arg1, arg2);\
+    } else if(std::is_same<type, uint32_t>::value) {\
+        result = U32(arg1, arg2);\
+    } else if(std::is_same<type, int32_t>::value) {\
+        result = S32(arg1, arg2);\
+    } else if(std::is_same<type, float>::value) {\
+        result = F32(arg1, arg2);\
+    } else {\
+        assert(false && "unsupported type");\
+    }
+
+#define NEON_DISPATCHER_UNARY(U8, S8, U16, S16, U32, S32, U64, S64, F32, type, arg, result)\
+    if (std::is_same<type, uint8_t>::value) {\
+        result = U8(arg);\
+    } else if(std::is_same<type, int8_t>::value) {\
+        result = S8(arg);\
+    } else if(std::is_same<type, uint16_t>::value) {\
+        result = U16(arg);\
+    } else if(std::is_same<type, int16_t>::value) {\
+        result = S16(arg);\
+    } else if(std::is_same<type, uint32_t>::value) {\
+        result = U32(arg);\
+    } else if(std::is_same<type, int32_t>::value) {\
+        result = S32(arg);\
+    } else if(std::is_same<type, uint64_t>::value) {\
+        result = U64(arg);\
+    } else if(std::is_same<type, int64_t>::value) {\
+        result = S64(arg);\
+    } else if(std::is_same<type, float>::value) {\
+        result = F32(arg);\
+    } else {\
+        assert(false && "unsupported type");\
+    }
+
+#define NEON_DISPATCHER_UNARY_EXCLUDE_64(U8, S8, U16, S16, U32, S32, F32, type, arg, result)\
+    if (std::is_same<type, uint8_t>::value) {\
+        result = U8(arg);\
+    } else if(std::is_same<type, int8_t>::value) {\
+        result = S8(arg);\
+    } else if(std::is_same<type, uint16_t>::value) {\
+        result = U16(arg);\
+    } else if(std::is_same<type, int16_t>::value) {\
+        result = S16(arg);\
+    } else if(std::is_same<type, uint32_t>::value) {\
+        result = U32(arg);\
+    } else if(std::is_same<type, int32_t>::value) {\
+        result = S32(arg);\
+    } else if(std::is_same<type, float>::value) {\
+        result = F32(arg);\
+    } else {\
+        assert(false && "unsupported type");\
+    }
+
+#define NEON_DISPATCHER_SELECT(U8, S8, U16, S16, U32, S32, U64, S64, F32, type, cond, arg1, arg2, result)\
+    if (std::is_same<type, uint8_t>::value) {\
+        result = U8(cond, arg1, arg2);\
+    } else if(std::is_same<type, int8_t>::value) {\
+        result = S8(cond, arg1, arg2);\
+    } else if(std::is_same<type, uint16_t>::value) {\
+        result = U16(cond, arg1, arg2);\
+    } else if(std::is_same<type, int16_t>::value) {\
+        result = S16(cond, arg1, arg2);\
+    } else if(std::is_same<type, uint32_t>::value) {\
+        result = U32(cond, arg1, arg2);\
+    } else if(std::is_same<type, int32_t>::value) {\
+        result = S32(cond, arg1, arg2);\
+    } else if(std::is_same<type, uint64_t>::value) {\
+        result = U64(cond, arg1, arg2);\
+    } else if(std::is_same<type, int64_t>::value) {\
+        result = S64(cond, arg1, arg2);\
+    } else if(std::is_same<type, float>::value) {\
+        result = F32(cond, arg1, arg2);\
+    } else {\
+        assert(false && "unsupported type");\
+    }
+
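Each dispatcher macro is a chain of std::is_same<type, X>::value tests, and those are compile-time constants: for any given T the optimizer folds the chain down to the single matching intrinsic call, so in practice no runtime branching remains and the assert is unreachable for supported types. A reduced, self-contained analogue of the pattern, with names that are illustrative only and not part of xsimd:

#include <cassert>
#include <cstdint>
#include <type_traits>

// Two-type version of NEON_DISPATCHER_BINARY, with plain ints standing in
// for the NEON register type; every branch must type-check, only one is taken.
#define DEMO_DISPATCHER_BINARY(U8, S8, type, arg1, arg2, result)\
    if (std::is_same<type, uint8_t>::value) {\
        result = U8(arg1, arg2);\
    } else if(std::is_same<type, int8_t>::value) {\
        result = S8(arg1, arg2);\
    } else {\
        assert(false && "unsupported type");\
    }

inline int add_u8(int a, int b) { return (a + b) & 0xFF; } // intrinsic stand-ins
inline int add_s8(int a, int b) { return a + b; }

template <class T>
int demo_add(int lhs, int rhs)
{
    int result = 0;
    DEMO_DISPATCHER_BINARY(add_u8, add_s8, T, lhs, rhs, result); // folds to one call
    return result;
}

Here demo_add<uint8_t>(200, 100) reduces to the add_u8 branch at compile time; the same folding is what keeps the real kernels, which pass a local register_type result into the macro and return it, as cheap as the overload-based dispatch they replace.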
diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp
index a65726365..fac02d068 100644
--- a/include/xsimd/config/xsimd_config.hpp
+++ b/include/xsimd/config/xsimd_config.hpp
@@ -213,6 +213,9 @@
 #else
 #define XSIMD_WITH_NEON64 0
 #endif
+#elif defined(_MSC_VER) && defined(_M_ARM64)
+    #define XSIMD_WITH_NEON 1
+    #define XSIMD_WITH_NEON64 1
 #else
 #define XSIMD_WITH_NEON 0
 #define XSIMD_WITH_NEON64 0
diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp
index f0481de87..7366100c2 100644
--- a/include/xsimd/types/xsimd_batch.hpp
+++ b/include/xsimd/types/xsimd_batch.hpp
@@ -183,7 +183,7 @@ namespace xsimd
    private:
        template
-       batch(T const* data, detail::index_sequence);
+       batch(T const* data, xsimd::detail::index_sequence);

        batch logical_and(batch const& other) const;
        batch logical_or(batch const& other) const;
@@ -242,13 +242,13 @@ namespace xsimd
    private:
        template
-       batch_bool(bool const* data, detail::index_sequence);
+       batch_bool(bool const* data, xsimd::detail::index_sequence);

        template
-       static register_type make_register(detail::index_sequence, U u, V... v);
+       static register_type make_register(xsimd::detail::index_sequence, U u, V... v);

        template
-       static register_type make_register(detail::index_sequence<>, V... v);
+       static register_type make_register(xsimd::detail::index_sequence<>, V... v);
    };

    template
diff --git a/include/xsimd/types/xsimd_neon_register.hpp b/include/xsimd/types/xsimd_neon_register.hpp
index 43a7db442..6e522f5b4 100644
--- a/include/xsimd/types/xsimd_neon_register.hpp
+++ b/include/xsimd/types/xsimd_neon_register.hpp
@@ -150,6 +150,37 @@ namespace xsimd
        : detail::neon_bool_simd_register
    {
    };
+
+   // A few macros and a helper function to support MSVC
+   #if defined(_MSC_VER) && !defined(__clang__)
+   #define INITIALIZER_LIST_TO_NEON_VECTOR(T, args) (neon_vector_initializer_constructor<T>(args))
+   // Convert an initializer list to a NEON vector type
+   // Note: MSVC does not provide an initializer_list constructor for the NEON vector types.
+   template <class S, class T>
+   S neon_vector_initializer_constructor(std::initializer_list<T> data){
+       S target;
+       if (std::is_signed<T>::value) {
+           switch(data.size()) {
+               case 16: std::copy(data.begin(), data.end(), target.n128_i8); break;
+               case 8: std::copy(data.begin(), data.end(), target.n128_i16); break;
+               case 4: std::copy(data.begin(), data.end(), target.n128_i32); break;
+               case 2: std::copy(data.begin(), data.end(), target.n128_i64); break;
+           }
+       } else {
+           switch(data.size()) {
+               case 16: std::copy(data.begin(), data.end(), target.n128_u8); break;
+               case 8: std::copy(data.begin(), data.end(), target.n128_u16); break;
+               case 4: std::copy(data.begin(), data.end(), target.n128_u32); break;
+               case 2: std::copy(data.begin(), data.end(), target.n128_u64); break;
+           }
+       }
+       return target;
+   }
+   #define REINTERPRET_CAST(T, R) (R)
+   #else
+   #define INITIALIZER_LIST_TO_NEON_VECTOR(T, args) (T args)
+   #define REINTERPRET_CAST(T, R) reinterpret_cast<T>(R)
+   #endif
 }

 #endif
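For reference, a self-contained analogue of the MSVC initializer path above: the element count of the list picks which union member to fill, mirroring how neon_vector_initializer_constructor maps 16, 8, 4, or 2 elements onto the 8/16/32/64-bit views of one 128-bit register. The union below is hypothetical; the real one is MSVC's __n128 from its arm64 NEON header.

#include <algorithm>
#include <cstdint>
#include <initializer_list>

union demo_n128 // stand-in for MSVC's __n128
{
    uint8_t  n128_u8[16];
    uint16_t n128_u16[8];
    uint32_t n128_u32[4];
    uint64_t n128_u64[2];
};

template <class S, class T>
S demo_initializer_constructor(std::initializer_list<T> data)
{
    S target{};
    switch (data.size()) // element count selects the lane width, as in the patch
    {
        case 16: std::copy(data.begin(), data.end(), target.n128_u8);  break;
        case 8:  std::copy(data.begin(), data.end(), target.n128_u16); break;
        case 4:  std::copy(data.begin(), data.end(), target.n128_u32); break;
        case 2:  std::copy(data.begin(), data.end(), target.n128_u64); break;
    }
    return target;
}

// Usage: demo_n128 r = demo_initializer_constructor<demo_n128>({1u, 2u, 3u, 4u});

On GCC and Clang the INITIALIZER_LIST_TO_NEON_VECTOR macro instead collapses to the brace form (T args), i.e. uint16x8_t{...}, because those compilers expose the NEON types as distinct builtins that accept list-initialization directly.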