diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index f89d461..e60bbd8 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -7,22 +7,48 @@ on:
 
 jobs:
   build_feature:
-    name: Build Tests
+    name: Test ${{ matrix.name }}
     runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        os: [ ubuntu-latest, macos-latest, windows-latest ]
         include:
           - os: ubuntu-latest
             name: linux
+            runTest: true
             testExe: build/sst-basic-blocks-test
+
           - os: macos-latest
-            name: mac
+            name: mac-x86
+            runTest: true
             testExe: build/sst-basic-blocks-test
+            cmakeArgs: -DCMAKE_OSX_ARCHITECTURES=x86_64
+
+          - os: macos-latest
+            name: mac-arm
+            cmakeArgs: -DCMAKE_OSX_ARCHITECTURES=arm64
+
+          - os: macos-latest
+            name: mac-arm-nonative
+            cmakeArgs: -DCMAKE_OSX_ARCHITECTURES=arm64 -DSST_BASIC_BLOCKS_SIMD_OMIT_NATIVE_ALIASES=TRUE
+
           - os: windows-latest
-            name: win
+            name: win-x86
+            runTest: true
             testExe: build/Release/sst-basic-blocks-test.exe
 
+          - os: windows-latest
+            name: win-arm64
+            cmakeArgs: -G"Visual Studio 17 2022" -A arm64 -DCMAKE_SYSTEM_VERSION=10
+
+          - os: windows-latest
+            name: win-arm64ec
+            cmakeArgs: -G"Visual Studio 17 2022" -A arm64ec -DCMAKE_SYSTEM_VERSION=10
+
+          - os: windows-latest
+            name: win-arm64-non-native
+            cmakeArgs: -G"Visual Studio 17 2022" -A arm64 -DCMAKE_SYSTEM_VERSION=10 -DSST_BASIC_BLOCKS_SIMD_OMIT_NATIVE_ALIASES=TRUE
+
     steps:
 
       - name: Checkout code
@@ -32,10 +58,13 @@ jobs:
 
+      # matrix.cmakeArgs supplies the per-entry generator/architecture flags; it expands to nothing for entries that do not set it.
       - name: Build Smoke test
         run: |
-          cmake -S . -B ./build -DCMAKE_BUILD_TYPE=Release -DSST_BASIC_BLOCKS_BUILD_TESTS=TRUE -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
+          cmake -S . -B ./build -DCMAKE_BUILD_TYPE=Release ${{ matrix.cmakeArgs }} -DSST_BASIC_BLOCKS_BUILD_TESTS=TRUE
           cmake --build ./build --config Release
 
       - name: Run Smoke Test
+        # Matrix entries that omit runTest expand to an empty (falsy) value, so this step only runs where runTest is true.
+        if: ${{ matrix.runTest }}
         run: |
           ls ${{ matrix.testExe }}
           ${{ matrix.testExe }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6814ab6..a8a0b60 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,16 @@ set(CMAKE_CXX_STANDARD 17)
 
 add_library(${PROJECT_NAME} INTERFACE)
 target_include_directories(${PROJECT_NAME} INTERFACE include)
-target_compile_definitions(${PROJECT_NAME} INTERFACE _USE_MATH_DEFINES=1)
+
+if (${SST_BASIC_BLOCKS_SIMD_OMIT_NATIVE_ALIASES})
+    message(STATUS "CMake Omitting Native Aliases")
+    target_compile_definitions(${PROJECT_NAME} INTERFACE SST_SIMD_OMIT_NATIVE_ALIASES=1)
+endif()
+
+if (WIN32)
+    target_compile_definitions(${PROJECT_NAME} INTERFACE _USE_MATH_DEFINES=1)
+    target_compile_definitions(${PROJECT_NAME} INTERFACE NOMINMAX)
+endif()
 
 if (${SST_BASIC_BLOCKS_BUILD_TESTS})
     include(cmake/CPM.cmake)
@@ -52,4 +61,11 @@ if (${SST_BASIC_BLOCKS_BUILD_TESTS})
         message(STATUS "Keeping Catch exception handling on for more modern macOS")
     endif()
 
+else()
+    if (NOT TARGET simde)
+        message(WARNING "SST Basic Blocks requires access to the 'simde' target from "
+                "https://github.com/simde-everywhere/simde. This build will only work on x86_64 architecture.")
+        # ${PROJECT_NAME} is an INTERFACE library, so the definition must be given INTERFACE scope.
+        target_compile_definitions(${PROJECT_NAME} INTERFACE SIMDE_UNAVAILABLE=1)
+    endif()
 endif ()
diff --git a/include/sst/basic-blocks/dsp/BlockInterpolators.h b/include/sst/basic-blocks/dsp/BlockInterpolators.h
index 622e18a..e3f7427 100644
--- a/include/sst/basic-blocks/dsp/BlockInterpolators.h
+++ b/include/sst/basic-blocks/dsp/BlockInterpolators.h
@@ -28,6 +28,7 @@
 #define INCLUDE_SST_BASIC_BLOCKS_DSP_BLOCKINTERPOLATORS_H
 
 #include <cassert>
+#include "sst/basic-blocks/simd/setup.h"
 
 namespace sst::basic_blocks::dsp
 {
@@ -79,9 +80,9 @@ template <int maxBlockSize, bool first_run_checks = true> struct alignas(16) lip
 {
   private:
     // put these at the top to preserve alignment
-    __m128 line[maxBlockSize >> 2];
-    __m128 zeroUpByQuarters;
-    __m128 one, zero;
+    SIMD_M128 line[maxBlockSize >> 2];
+    SIMD_M128 zeroUpByQuarters;
+    SIMD_M128 one, zero;
 
   public:
     static constexpr int maxRegisters{maxBlockSize >> 2};
@@ -99,9 +100,9 @@ template <int maxBlockSize, bool first_run_checks = true> struct alignas(16) lip
     lipol_sse()
     {
         float zbq alignas(16)[4]{0.25f, 0.5f, 0.75f, 1.00f};
-        zeroUpByQuarters = _mm_load_ps(zbq);
-        one = _mm_set1_ps(1.f);
-        zero = _mm_setzero_ps();
+        zeroUpByQuarters = SIMD_MM(load_ps)(zbq);
+        one = SIMD_MM(set1_ps)(1.f);
+        zero = SIMD_MM(setzero_ps)();
     }
     void set_target(float f)
     {
@@ -152,9 +153,9 @@ template <int maxBlockSize, bool first_run_checks = true> struct alignas(16) lip
         assert(bsQuad == -1 || bsQuad == numRegisters);
         for (int i = 0; i < numRegisters; ++i)
         {
-            auto iv = _mm_load_ps(in + (i << 2));
-            auto ov = _mm_mul_ps(iv, line[i]);
-            _mm_store_ps(out + (i << 2), ov);
+            auto iv = SIMD_MM(load_ps)(in + (i << 2));
+            auto ov = SIMD_MM(mul_ps)(iv, line[i]);
+            SIMD_MM(store_ps)(out + (i << 2), ov);
         }
     }
 
@@ -163,9 +164,9 @@ template <int maxBlockSize, bool first_run_checks = true> struct alignas(16) lip
         assert(bsQuad == -1 || bsQuad == numRegisters);
         for (int i = 0; i < numRegisters; ++i)
         {
-            auto iv = _mm_load_ps(in + (i << 2));
-            auto ov = _mm_mul_ps(iv, line[i]);
-            _mm_store_ps(in + (i << 2), ov);
+            auto iv = SIMD_MM(load_ps)(in + (i << 2));
+            auto ov = SIMD_MM(mul_ps)(iv, line[i]);
+            SIMD_MM(store_ps)(in + (i << 2), ov);
         }
     }
 
@@ -190,11 +191,11 @@ template <int maxBlockSize, bool first_run_checks = true> struct alignas(16) lip
         assert(bsQuad == -1 || bsQuad == numRegisters);
         for (int i = 0; i < numRegisters; ++i)
         {
-            auto iv = _mm_load_ps(src + (i << 2));
-            auto dv = _mm_load_ps(dst + (i << 2));
-            auto ov = _mm_mul_ps(iv, line[i]);
-            auto mv = _mm_add_ps(ov, dv);
-            _mm_store_ps(dst + (i << 2), mv);
+            auto iv = SIMD_MM(load_ps)(src + (i << 2));
+            auto dv = SIMD_MM(load_ps)(dst + (i << 2));
+            auto ov = SIMD_MM(mul_ps)(iv, line[i]);
+            auto mv = SIMD_MM(add_ps)(ov, dv);
+            SIMD_MM(store_ps)(dst + (i << 2), mv);
         }
     }
     void MAC_2_blocks_to(float *__restrict src1, float *__restrict src2, float *__restrict dst1,
@@ -211,12 +212,12 @@ template <int maxBlockSize, bool first_run_checks = true> struct alignas(16) lip
     {
         for (int i = 0; i < numRegisters; ++i)
         {
-            auto a = _mm_load_ps(inA + (i << 2));
-            auto b = _mm_load_ps(inB + (i << 2));
-            auto sa = _mm_mul_ps(a, _mm_sub_ps(one, line[i]));
-            auto sb = _mm_mul_ps(b, line[i]);
-            auto r = _mm_add_ps(sa, sb);
-            _mm_store_ps(out + (i << 2), r);
+            auto a = SIMD_MM(load_ps)(inA + (i << 2));
+            auto b = SIMD_MM(load_ps)(inB + (i << 2));
+            auto sa = SIMD_MM(mul_ps)(a, SIMD_MM(sub_ps)(one, line[i]));
+            auto sb = SIMD_MM(mul_ps)(b, line[i]);
+            auto r = SIMD_MM(add_ps)(sa, sb);
+            SIMD_MM(store_ps)(out + (i << 2), r);
         }
     }
 
@@ -238,12 +239,12 @@ template <int maxBlockSize, bool first_run_checks = true> struct alignas(16) lip
     {
         for (int i = 0; i < numRegisters; ++i)
         {
-            auto a = _mm_load_ps(inAOut + (i << 2));
-            auto b = _mm_load_ps(inB + (i << 2));
-            auto sa = _mm_mul_ps(a, _mm_sub_ps(one, line[i]));
-            auto sb = _mm_mul_ps(b, line[i]);
-            auto r = _mm_add_ps(sa, sb);
-            _mm_store_ps(inAOut + (i << 2), r);
+            auto a = SIMD_MM(load_ps)(inAOut + (i << 2));
+            auto b = SIMD_MM(load_ps)(inB + (i << 2));
+            auto sa = SIMD_MM(mul_ps)(a, SIMD_MM(sub_ps)(one, line[i]));
+            auto sb = SIMD_MM(mul_ps)(b, line[i]);
+            auto r = SIMD_MM(add_ps)(sa, sb);
+            SIMD_MM(store_ps)(inAOut + (i << 2), r);
         }
     }
 
@@ -261,7 +262,7 @@ template <int maxBlockSize, bool first_run_checks = true> struct alignas(16) lip
         assert(bsQuad == -1 || bsQuad == numRegisters);
         for (int i = 0; i < numRegisters; ++i)
         {
-            _mm_store_ps(out + (i << 2), line[i]);
+            SIMD_MM(store_ps)(out + (i << 2), line[i]);
         }
     }
 
@@ -279,14 +280,16 @@ template <int maxBlockSize, bool first_run_checks = true> struct alignas(16) lip
 
         for (int i = 0; i < numRegisters; ++i)
         {
-            auto a = _mm_max_ps(zero, line[i]);
-            auto b = _mm_min_ps(zero, line[i]);
-            auto l = _mm_load_ps(L + (i << 2));
-            auto r = _mm_load_ps(R + (i << 2));
-            auto tl = _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(one, a), l), _mm_mul_ps(b, r));
-            auto tr = _mm_add_ps(_mm_mul_ps(a, l), _mm_mul_ps(_mm_add_ps(one, b), r));
-            _mm_store_ps(dL + (i << 2), tl);
-            _mm_store_ps(dR + (i << 2), tr);
+            auto a = SIMD_MM(max_ps)(zero, line[i]);
+            auto b = SIMD_MM(min_ps)(zero, line[i]);
+            auto l = SIMD_MM(load_ps)(L + (i << 2));
+            auto r = SIMD_MM(load_ps)(R + (i << 2));
+            auto tl =
+                SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(one, a), l), SIMD_MM(mul_ps)(b, r));
+            auto tr =
+                SIMD_MM(add_ps)(SIMD_MM(mul_ps)(a, l), SIMD_MM(mul_ps)(SIMD_MM(add_ps)(one, b), r));
+            SIMD_MM(store_ps)(dL + (i << 2), tl);
+            SIMD_MM(store_ps)(dR + (i << 2), tr);
         }
     }
 
@@ -304,13 +307,13 @@ template <int maxBlockSize, bool first_run_checks = true> struct alignas(16) lip
   private:
     void updateLine()
     {
-        auto cs = _mm_set1_ps(current);
-        auto dy0 = _mm_set1_ps((target - current) * registerSizeInv);
-        auto dy = _mm_mul_ps(dy0, zeroUpByQuarters);
+        auto cs = SIMD_MM(set1_ps)(current);
+        auto dy0 = SIMD_MM(set1_ps)((target - current) * registerSizeInv);
+        auto dy = SIMD_MM(mul_ps)(dy0, zeroUpByQuarters);
         for (int i = 0; i < numRegisters; ++i)
         {
-            line[i] = _mm_add_ps(cs, dy);
-            dy = _mm_add_ps(dy, dy0);
+            line[i] = SIMD_MM(add_ps)(cs, dy);
+            dy = SIMD_MM(add_ps)(dy, dy0);
         }
         current = target;
     }
diff --git a/include/sst/basic-blocks/dsp/Clippers.h b/include/sst/basic-blocks/dsp/Clippers.h
index b5bf4a9..04c048c 100644
--- a/include/sst/basic-blocks/dsp/Clippers.h
+++ b/include/sst/basic-blocks/dsp/Clippers.h
@@ -27,24 +27,26 @@
 #ifndef INCLUDE_SST_BASIC_BLOCKS_DSP_CLIPPERS_H
 #define INCLUDE_SST_BASIC_BLOCKS_DSP_CLIPPERS_H
 
+#include "sst/basic-blocks/simd/setup.h"
+
 namespace sst::basic_blocks::dsp
 {
 
 /**
  * y = x - (4/27)*x^3,  x in [-1.5 .. 1.5], +/-1 otherwise
  */
-inline __m128 softclip_ps(__m128 in)
+inline SIMD_M128 softclip_ps(SIMD_M128 in)
 {
-    const __m128 a = _mm_set1_ps(-4.f / 27.f);
+    const auto a = SIMD_MM(set1_ps)(-4.f / 27.f);
 
-    const __m128 x_min = _mm_set1_ps(-1.5f);
-    const __m128 x_max = _mm_set1_ps(1.5f);
+    const auto x_min = SIMD_MM(set1_ps)(-1.5f);
+    const auto x_max = SIMD_MM(set1_ps)(1.5f);
 
-    __m128 x = _mm_max_ps(_mm_min_ps(in, x_max), x_min);
-    __m128 xx = _mm_mul_ps(x, x);
-    __m128 t = _mm_mul_ps(x, a);
-    t = _mm_mul_ps(t, xx);
-    t = _mm_add_ps(t, x);
+    auto x = SIMD_MM(max_ps)(SIMD_MM(min_ps)(in, x_max), x_min);
+    auto xx = SIMD_MM(mul_ps)(x, x);
+    auto t = SIMD_MM(mul_ps)(x, a);
+    t = SIMD_MM(mul_ps)(t, xx);
+    t = SIMD_MM(add_ps)(t, x);
 
     return t;
 }
@@ -52,7 +54,7 @@ inline __m128 softclip_ps(__m128 in)
 /**
  * y = x - (4/27/8^3)*x^3,  x in [-12 .. 12], +/-12 otherwise
  */
-inline __m128 softclip8_ps(__m128 in)
+inline SIMD_M128 softclip8_ps(SIMD_M128 in)
 {
     /*
      * This constant is - 4/27 / 8^3 so it "scales" the
@@ -61,44 +63,44 @@ inline __m128 softclip8_ps(__m128 in)
      * But this is only used in one spot - in LPMOOGquad - so
      * we will just leave it for now
      */
-    const __m128 a = _mm_set1_ps(-0.00028935185185f);
+    const auto a = SIMD_MM(set1_ps)(-0.00028935185185f);
 
-    const __m128 x_min = _mm_set1_ps(-12.f);
-    const __m128 x_max = _mm_set1_ps(12.f);
+    const auto x_min = SIMD_MM(set1_ps)(-12.f);
+    const auto x_max = SIMD_MM(set1_ps)(12.f);
 
-    __m128 x = _mm_max_ps(_mm_min_ps(in, x_max), x_min);
-    __m128 xx = _mm_mul_ps(x, x);
-    __m128 t = _mm_mul_ps(x, a);
-    t = _mm_mul_ps(t, xx);
-    t = _mm_add_ps(t, x);
+    auto x = SIMD_MM(max_ps)(SIMD_MM(min_ps)(in, x_max), x_min);
+    auto xx = SIMD_MM(mul_ps)(x, x);
+    auto t = SIMD_MM(mul_ps)(x, a);
+    t = SIMD_MM(mul_ps)(t, xx);
+    t = SIMD_MM(add_ps)(t, x);
     return t;
 }
 
-inline __m128 tanh7_ps(__m128 v)
+inline SIMD_M128 tanh7_ps(SIMD_M128 v)
 {
-    const __m128 upper_bound = _mm_set1_ps(1.139f);
-    const __m128 lower_bound = _mm_set1_ps(-1.139f);
-    auto x = _mm_max_ps(v, lower_bound);
-    x = _mm_min_ps(x, upper_bound);
-
-    const __m128 a = _mm_set1_ps(-1.f / 3.f);
-    const __m128 b = _mm_set1_ps(2.f / 15.f);
-    const __m128 c = _mm_set1_ps(-17.f / 315.f);
-    const __m128 one = _mm_set1_ps(1.f);
-    __m128 xx = _mm_mul_ps(x, x);
-    __m128 y = _mm_add_ps(one, _mm_mul_ps(a, xx));
-    __m128 x4 = _mm_mul_ps(xx, xx);
-    y = _mm_add_ps(y, _mm_mul_ps(b, x4));
-    x4 = _mm_mul_ps(x4, xx);
-    y = _mm_add_ps(y, _mm_mul_ps(c, x4));
-    return _mm_mul_ps(y, x);
+    const auto upper_bound = SIMD_MM(set1_ps)(1.139f);
+    const auto lower_bound = SIMD_MM(set1_ps)(-1.139f);
+    auto x = SIMD_MM(max_ps)(v, lower_bound);
+    x = SIMD_MM(min_ps)(x, upper_bound);
+
+    const auto a = SIMD_MM(set1_ps)(-1.f / 3.f);
+    const auto b = SIMD_MM(set1_ps)(2.f / 15.f);
+    const auto c = SIMD_MM(set1_ps)(-17.f / 315.f);
+    const auto one = SIMD_MM(set1_ps)(1.f);
+    auto xx = SIMD_MM(mul_ps)(x, x);
+    auto y = SIMD_MM(add_ps)(one, SIMD_MM(mul_ps)(a, xx));
+    auto x4 = SIMD_MM(mul_ps)(xx, xx);
+    y = SIMD_MM(add_ps)(y, SIMD_MM(mul_ps)(b, x4));
+    x4 = SIMD_MM(mul_ps)(x4, xx);
+    y = SIMD_MM(add_ps)(y, SIMD_MM(mul_ps)(c, x4));
+    return SIMD_MM(mul_ps)(y, x);
 }
 
 template <size_t blockSize> void softclip_block(float *__restrict x)
 {
     for (unsigned int i = 0; i < blockSize; i += 4)
     {
-        _mm_store_ps(x + i, softclip_ps(_mm_load_ps(x + i)));
+        SIMD_MM(store_ps)(x + i, softclip_ps(SIMD_MM(load_ps)(x + i)));
     }
 }
 
@@ -106,29 +108,31 @@ template <size_t blockSize> void tanh7_block(float *__restrict x)
 {
     for (unsigned int i = 0; i < blockSize; i += 4)
     {
-        _mm_store_ps(x + i, tanh7_ps(_mm_load_ps(x + i)));
+        SIMD_MM(store_ps)(x + i, tanh7_ps(SIMD_MM(load_ps)(x + i)));
     }
 }
 
 template <size_t blockSize> void hardclip_block(float *x)
 {
     static_assert(!(blockSize & (blockSize - 1)) && blockSize >= 4);
-    const __m128 x_min = _mm_set1_ps(-1.0f);
-    const __m128 x_max = _mm_set1_ps(1.0f);
+    const auto x_min = SIMD_MM(set1_ps)(-1.0f);
+    const auto x_max = SIMD_MM(set1_ps)(1.0f);
     for (unsigned int i = 0; i < blockSize; i += 4)
     {
-        _mm_store_ps(x + i, _mm_max_ps(_mm_min_ps(_mm_load_ps(x + i), x_max), x_min));
+        SIMD_MM(store_ps)
+        (x + i, SIMD_MM(max_ps)(SIMD_MM(min_ps)(SIMD_MM(load_ps)(x + i), x_max), x_min));
     }
 }
 
 template <size_t blockSize> void hardclip_block8(float *x)
 {
     static_assert(!(blockSize & (blockSize - 1)) && blockSize >= 4);
-    const __m128 x_min = _mm_set1_ps(-8.0f);
-    const __m128 x_max = _mm_set1_ps(8.0f);
+    const auto x_min = SIMD_MM(set1_ps)(-8.0f);
+    const auto x_max = SIMD_MM(set1_ps)(8.0f);
     for (unsigned int i = 0; i < blockSize; i += 4)
     {
-        _mm_store_ps(x + i, _mm_max_ps(_mm_min_ps(_mm_load_ps(x + i), x_max), x_min));
+        SIMD_MM(store_ps)
+        (x + i, SIMD_MM(max_ps)(SIMD_MM(min_ps)(SIMD_MM(load_ps)(x + i), x_max), x_min));
     }
 }
 } // namespace sst::basic_blocks::dsp
diff --git a/include/sst/basic-blocks/dsp/CorrelatedNoise.h b/include/sst/basic-blocks/dsp/CorrelatedNoise.h
index 382a825..d952767 100644
--- a/include/sst/basic-blocks/dsp/CorrelatedNoise.h
+++ b/include/sst/basic-blocks/dsp/CorrelatedNoise.h
@@ -29,6 +29,7 @@
 
 #include <functional>
 #include <cmath>
+#include "sst/basic-blocks/simd/setup.h"
 
 namespace sst::basic_blocks::dsp
 {
@@ -48,8 +49,8 @@ inline float correlated_noise_o2mk2_supplied_value(float &lastval, float &lastva
         wf = -wfabs;
     float m = 1.f - wfabs;
     // float m = 1.f/sqrt(1.f-wfabs);
-    auto m1 = _mm_rsqrt_ss(_mm_load_ss(&m));
-    _mm_store_ss(&m, m1);
+    auto m1 = SIMD_MM(rsqrt_ss)(SIMD_MM(load_ss)(&m));
+    SIMD_MM(store_ss)(&m, m1);
     // if (wf>0.f) m *= 1 + wf*8;
 
     float rand11 = bipolarUniformRandValue;
diff --git a/include/sst/basic-blocks/dsp/FastMath.h b/include/sst/basic-blocks/dsp/FastMath.h
index 1db63e4..2ab6f5b 100644
--- a/include/sst/basic-blocks/dsp/FastMath.h
+++ b/include/sst/basic-blocks/dsp/FastMath.h
@@ -28,6 +28,7 @@
 #define INCLUDE_SST_BASIC_BLOCKS_DSP_FASTMATH_H
 
 #include <cmath>
+#include "sst/basic-blocks/simd/setup.h"
 
 /*
 ** Fast Math Approximations to various Functions
@@ -58,13 +59,13 @@ inline float fastsin(float x) noexcept
     return numerator / denominator;
 }
 
-inline __m128 fastsinSSE(__m128 x) noexcept
+inline SIMD_M128 fastsinSSE(SIMD_M128 x) noexcept
 {
-#define M(a, b) _mm_mul_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
-#define S(a, b) _mm_sub_ps(a, b)
-#define F(a) _mm_set_ps1(a)
-#define C(x) __m128 m##x = F((float)x)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
+#define S(a, b) SIMD_MM(sub_ps)(a, b)
+#define F(a) SIMD_MM(set_ps1)(a)
+#define C(x) auto m##x = F((float)x)
 
     /*
     auto numerator = -x * (-(float)11511339840 +
@@ -92,7 +93,7 @@ inline __m128 fastsinSSE(__m128 x) noexcept
 #undef A
 #undef S
 #undef F
-    return _mm_div_ps(num, den);
+    return SIMD_MM(div_ps)(num, den);
 }
 
 // JUCE6 Pade approximation of cos valid from -PI to PI with max error of 1e-5 and average error of
@@ -105,13 +106,13 @@ inline float fastcos(float x) noexcept
     return numerator / denominator;
 }
 
-inline __m128 fastcosSSE(__m128 x) noexcept
+inline SIMD_M128 fastcosSSE(SIMD_M128 x) noexcept
 {
-#define M(a, b) _mm_mul_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
-#define S(a, b) _mm_sub_ps(a, b)
-#define F(a) _mm_set_ps1(a)
-#define C(x) __m128 m##x = F((float)x)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
+#define S(a, b) SIMD_MM(sub_ps)(a, b)
+#define F(a) SIMD_MM(set_ps1)(a)
+#define C(x) auto m##x = F((float)x)
 
     // auto x2 = x * x;
     auto x2 = M(x, x);
@@ -134,7 +135,7 @@ inline __m128 fastcosSSE(__m128 x) noexcept
 #undef A
 #undef S
 #undef F
-    return _mm_div_ps(num, den);
+    return SIMD_MM(div_ps)(num, den);
 }
 
 /*
@@ -156,20 +157,20 @@ inline float clampToPiRange(float x)
     return p - M_PI;
 }
 
-inline __m128 clampToPiRangeSSE(__m128 x)
+inline SIMD_M128 clampToPiRangeSSE(SIMD_M128 x)
 {
-    const auto mpi = _mm_set1_ps(M_PI);
-    const auto m2pi = _mm_set1_ps(2.0 * M_PI);
-    const auto oo2p = _mm_set1_ps(1.0 / (2.0 * M_PI));
-    const auto mz = _mm_setzero_ps();
-
-    auto y = _mm_add_ps(x, mpi);
-    auto yip = _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_mul_ps(y, oo2p)));
-    auto p = _mm_sub_ps(y, _mm_mul_ps(m2pi, yip));
-    auto off = _mm_and_ps(_mm_cmplt_ps(p, mz), m2pi);
-    p = _mm_add_ps(p, off);
-
-    return _mm_sub_ps(p, mpi);
+    const auto mpi = SIMD_MM(set1_ps)(M_PI);
+    const auto m2pi = SIMD_MM(set1_ps)(2.0 * M_PI);
+    const auto oo2p = SIMD_MM(set1_ps)(1.0 / (2.0 * M_PI));
+    const auto mz = SIMD_MM(setzero_ps)();
+
+    auto y = SIMD_MM(add_ps)(x, mpi);
+    auto yip = SIMD_MM(cvtepi32_ps)(SIMD_MM(cvttps_epi32)(SIMD_MM(mul_ps)(y, oo2p)));
+    auto p = SIMD_MM(sub_ps)(y, SIMD_MM(mul_ps)(m2pi, yip));
+    auto off = SIMD_MM(and_ps)(SIMD_MM(cmplt_ps)(p, mz), m2pi);
+    p = SIMD_MM(add_ps)(p, off);
+
+    return SIMD_MM(sub_ps)(p, mpi);
 }
 
 /*
@@ -192,14 +193,14 @@ inline float fasttan(float x) noexcept
     return numerator / denominator;
 }
 
-inline __m128 fasttanhSSE(__m128 x)
+inline SIMD_M128 fasttanhSSE(SIMD_M128 x)
 {
-    const __m128 m135135 = _mm_set_ps1(135135), m17325 = _mm_set_ps1(17325),
-                 m378 = _mm_set_ps1(378), m62370 = _mm_set_ps1(62370), m3150 = _mm_set_ps1(3150),
-                 m28 = _mm_set_ps1(28);
+    const auto m135135 = SIMD_MM(set_ps1)(135135), m17325 = SIMD_MM(set_ps1)(17325),
+               m378 = SIMD_MM(set_ps1)(378), m62370 = SIMD_MM(set_ps1)(62370),
+               m3150 = SIMD_MM(set_ps1)(3150), m28 = SIMD_MM(set_ps1)(28);
 
-#define M(a, b) _mm_mul_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
 
     auto x2 = M(x, x);
     auto num = M(x, A(m135135, M(x2, A(m17325, M(x2, A(m378, x2))))));
@@ -208,12 +209,12 @@ inline __m128 fasttanhSSE(__m128 x)
 #undef M
 #undef A
 
-    return _mm_div_ps(num, den);
+    return SIMD_MM(div_ps)(num, den);
 }
 
-inline __m128 fasttanhSSEclamped(__m128 x)
+inline SIMD_M128 fasttanhSSEclamped(SIMD_M128 x)
 {
-    auto xc = _mm_min_ps(_mm_set_ps1(5), _mm_max_ps(_mm_set_ps1(-5), x));
+    auto xc = SIMD_MM(min_ps)(SIMD_MM(set_ps1)(5), SIMD_MM(max_ps)(SIMD_MM(set_ps1)(-5), x));
     return fasttanhSSE(xc);
 }
 
@@ -227,19 +228,19 @@ inline float fastexp(float x) noexcept
     return numerator / denominator;
 }
 
-inline __m128 fastexpSSE(__m128 x) noexcept
+inline SIMD_M128 fastexpSSE(SIMD_M128 x) noexcept
 {
-#define M(a, b) _mm_mul_ps(a, b)
-#define A(a, b) _mm_add_ps(a, b)
-#define F(a) _mm_set_ps1(a)
+#define M(a, b) SIMD_MM(mul_ps)(a, b)
+#define A(a, b) SIMD_MM(add_ps)(a, b)
+#define F(a) SIMD_MM(set_ps1)(a)
 
-    const __m128 m1680 = F(1680), m840 = F(840), mneg840 = F(-840), m180 = F(180), m20 = F(20),
-                 mneg20 = F(-20);
+    const auto m1680 = F(1680), m840 = F(840), mneg840 = F(-840), m180 = F(180), m20 = F(20),
+               mneg20 = F(-20);
 
     auto num = A(m1680, M(x, A(m840, M(x, A(m180, M(x, A(m20, x)))))));
     auto den = A(m1680, M(x, A(mneg840, M(x, A(m180, M(x, A(mneg20, x)))))));
 
-    return _mm_div_ps(num, den);
+    return SIMD_MM(div_ps)(num, den);
 
 #undef M
 #undef A
diff --git a/include/sst/basic-blocks/dsp/HilbertTransform.h b/include/sst/basic-blocks/dsp/HilbertTransform.h
index 5ad5ea2..d6a346a 100644
--- a/include/sst/basic-blocks/dsp/HilbertTransform.h
+++ b/include/sst/basic-blocks/dsp/HilbertTransform.h
@@ -59,6 +59,8 @@
 #include <utility>
 #include <complex>
 
+#include "sst/basic-blocks/simd/setup.h"
+
 namespace sst::basic_blocks::dsp
 {
 struct HilbertTransformMonoFloat
@@ -171,18 +173,18 @@ struct HilbertTransformStereoSSE
      */
     struct BQ
     {
-        __m128 a1{1}, a2{0}, b0{1}, b1{0}, b2{0}, reg0{0}, reg1{0};
+        SIMD_M128 a1{1}, a2{0}, b0{1}, b1{0}, b2{0}, reg0{0}, reg1{0};
         inline void reset()
         {
-            reg0 = _mm_setzero_ps();
-            reg1 = _mm_setzero_ps();
+            reg0 = SIMD_MM(setzero_ps)();
+            reg1 = SIMD_MM(setzero_ps)();
         }
-        inline void setOne(int idx, __m128 &on, float f)
+        inline void setOne(int idx, SIMD_M128 &on, float f)
         {
             float r alignas(16)[4];
-            _mm_store_ps(r, on);
+            SIMD_MM(store_ps)(r, on);
             r[idx] = f;
-            on = _mm_load_ps(r);
+            on = SIMD_MM(load_ps)(r);
         }
         inline void setCoefs(int idx, float _a1, float _a2, float _b0, float _b1, float _b2)
         {
@@ -193,11 +195,12 @@ struct HilbertTransformStereoSSE
             setOne(idx, b2, _b2);
         }
 
-        inline __m128 step(__m128 input)
+        inline SIMD_M128 step(SIMD_M128 input)
         {
-            auto op = _mm_add_ps(_mm_mul_ps(input, b0), reg0);
-            reg0 = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(input, b1), _mm_mul_ps(a1, op)), reg1);
-            reg1 = _mm_sub_ps(_mm_mul_ps(input, b2), _mm_mul_ps(a2, op));
+            auto op = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(input, b0), reg0);
+            reg0 = SIMD_MM(add_ps)(
+                SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(input, b1), SIMD_MM(mul_ps)(a1, op)), reg1);
+            reg1 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(input, b2), SIMD_MM(mul_ps)(a2, op));
 
             return op;
         }
@@ -260,10 +263,10 @@ struct HilbertTransformStereoSSE
     }
 
     // Returns reL, imL, reR, imR
-    __m128 stepStereo(float L, float R)
+    SIMD_M128 stepStereo(float L, float R)
     {
         float r alignas(16)[4]{L, L, R, R};
-        auto in = _mm_load_ps(r);
+        auto in = SIMD_MM(load_ps)(r);
         for (int i = 0; i < 3; ++i)
         {
             in = allpassSSE[i].step(in);
@@ -275,7 +278,7 @@ struct HilbertTransformStereoSSE
     {
         auto v = stepStereo(L, R);
         float r alignas(16)[4];
-        _mm_store_ps(r, v);
+        SIMD_MM(store_ps)(r, v);
         return {{r[0], r[1]}, {r[2], r[3]}};
     }
 
@@ -283,7 +286,7 @@ struct HilbertTransformStereoSSE
     {
         auto v = stepStereo(L, R);
         float r alignas(16)[4];
-        _mm_store_ps(r, v);
+        SIMD_MM(store_ps)(r, v);
         return {{r[0], r[1]}, {r[2], r[3]}};
     }
 };
diff --git a/include/sst/basic-blocks/dsp/LanczosResampler.h b/include/sst/basic-blocks/dsp/LanczosResampler.h
index e248be5..9c131c9 100644
--- a/include/sst/basic-blocks/dsp/LanczosResampler.h
+++ b/include/sst/basic-blocks/dsp/LanczosResampler.h
@@ -50,6 +50,7 @@
 #include <utility>
 #include <cmath>
 #include <cstring>
+#include "sst/basic-blocks/simd/setup.h"
 #include "sst/basic-blocks/mechanics/simd-ops.h"
 
 namespace sst::basic_blocks::dsp
@@ -168,24 +169,24 @@ template <int blockSize> struct LanczosResampler
         int tidx = (int)(off0byto);
         double fidx = (off0byto - tidx);
 
-        auto fl = _mm_set1_ps((float)fidx);
-        auto f0 = _mm_load_ps(&lanczosTable[tidx][0]);
-        auto df0 = _mm_load_ps(&lanczosTableDX[tidx][0]);
+        auto fl = SIMD_MM(set1_ps)((float)fidx);
+        auto f0 = SIMD_MM(load_ps)(&lanczosTable[tidx][0]);
+        auto df0 = SIMD_MM(load_ps)(&lanczosTableDX[tidx][0]);
 
-        f0 = _mm_add_ps(f0, _mm_mul_ps(df0, fl));
+        f0 = SIMD_MM(add_ps)(f0, SIMD_MM(mul_ps)(df0, fl));
 
-        auto f1 = _mm_load_ps(&lanczosTable[tidx][4]);
-        auto df1 = _mm_load_ps(&lanczosTableDX[tidx][4]);
-        f1 = _mm_add_ps(f1, _mm_mul_ps(df1, fl));
+        auto f1 = SIMD_MM(load_ps)(&lanczosTable[tidx][4]);
+        auto df1 = SIMD_MM(load_ps)(&lanczosTableDX[tidx][4]);
+        f1 = SIMD_MM(add_ps)(f1, SIMD_MM(mul_ps)(df1, fl));
 
-        auto d0 = _mm_loadu_ps(&input[0][idx0 - A]);
-        auto d1 = _mm_loadu_ps(&input[0][idx0]);
-        auto rv = _mm_add_ps(_mm_mul_ps(f0, d0), _mm_mul_ps(f1, d1));
+        auto d0 = SIMD_MM(loadu_ps)(&input[0][idx0 - A]);
+        auto d1 = SIMD_MM(loadu_ps)(&input[0][idx0]);
+        auto rv = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f0, d0), SIMD_MM(mul_ps)(f1, d1));
         L = mechanics::sum_ps_to_float(rv);
 
-        d0 = _mm_loadu_ps(&input[1][idx0 - A]);
-        d1 = _mm_loadu_ps(&input[1][idx0]);
-        rv = _mm_add_ps(_mm_mul_ps(f0, d0), _mm_mul_ps(f1, d1));
+        d0 = SIMD_MM(loadu_ps)(&input[1][idx0 - A]);
+        d1 = SIMD_MM(loadu_ps)(&input[1][idx0]);
+        rv = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f0, d0), SIMD_MM(mul_ps)(f1, d1));
         R = mechanics::sum_ps_to_float(rv);
     }
 
diff --git a/include/sst/basic-blocks/dsp/SSESincDelayLine.h b/include/sst/basic-blocks/dsp/SSESincDelayLine.h
index 5220d56..8a53cff 100644
--- a/include/sst/basic-blocks/dsp/SSESincDelayLine.h
+++ b/include/sst/basic-blocks/dsp/SSESincDelayLine.h
@@ -27,6 +27,7 @@
 #ifndef INCLUDE_SST_BASIC_BLOCKS_DSP_SSESINCDELAYLINE_H
 #define INCLUDE_SST_BASIC_BLOCKS_DSP_SSESINCDELAYLINE_H
 
+#include "sst/basic-blocks/simd/setup.h"
 #include "sst/basic-blocks/mechanics/simd-ops.h"
 #include "sst/basic-blocks/tables/SincTableProvider.h"
 
@@ -80,20 +81,20 @@ struct SSESincDelayLine
         int readPtr = (wp - iDelay - (stp::FIRipol_N >> 1)) & (COMB_SIZE - 1);
 
         // And so now do what we do in COMBSSE2Quad
-        __m128 a = _mm_loadu_ps(&buffer[readPtr]);
-        __m128 b = _mm_loadu_ps(&sinctable[sincTableOffset]);
-        __m128 o = _mm_mul_ps(a, b);
+        auto a = SIMD_MM(loadu_ps)(&buffer[readPtr]);
+        auto b = SIMD_MM(loadu_ps)(&sinctable[sincTableOffset]);
+        auto o = SIMD_MM(mul_ps)(a, b);
 
-        a = _mm_loadu_ps(&buffer[readPtr + 4]);
-        b = _mm_loadu_ps(&sinctable[sincTableOffset + 4]);
-        o = _mm_add_ps(o, _mm_mul_ps(a, b));
+        a = SIMD_MM(loadu_ps)(&buffer[readPtr + 4]);
+        b = SIMD_MM(loadu_ps)(&sinctable[sincTableOffset + 4]);
+        o = SIMD_MM(add_ps)(o, SIMD_MM(mul_ps)(a, b));
 
-        a = _mm_loadu_ps(&buffer[readPtr + 8]);
-        b = _mm_loadu_ps(&sinctable[sincTableOffset + 8]);
-        o = _mm_add_ps(o, _mm_mul_ps(a, b));
+        a = SIMD_MM(loadu_ps)(&buffer[readPtr + 8]);
+        b = SIMD_MM(loadu_ps)(&sinctable[sincTableOffset + 8]);
+        o = SIMD_MM(add_ps)(o, SIMD_MM(mul_ps)(a, b));
 
         float res;
-        _mm_store_ss(&res, sst::basic_blocks::mechanics::sum_ps_to_ss(o));
+        SIMD_MM(store_ss)(&res, sst::basic_blocks::mechanics::sum_ps_to_ss(o));
 
         return res;
     }
diff --git a/include/sst/basic-blocks/mechanics/simd-ops.h b/include/sst/basic-blocks/mechanics/simd-ops.h
index 5227b0d..a9cc743 100644
--- a/include/sst/basic-blocks/mechanics/simd-ops.h
+++ b/include/sst/basic-blocks/mechanics/simd-ops.h
@@ -23,24 +23,27 @@
  * All source in sst-basic-blocks available at
  * https://github.com/surge-synthesizer/sst-basic-blocks
  */
+
 #ifndef INCLUDE_SST_BASIC_BLOCKS_MECHANICS_SIMD_OPS_H
 #define INCLUDE_SST_BASIC_BLOCKS_MECHANICS_SIMD_OPS_H
 
+#include "sst/basic-blocks/simd/setup.h"
+
 namespace sst::basic_blocks::mechanics
 {
-inline __m128 sum_ps_to_ss(__m128 x)
+inline SIMD_M128 sum_ps_to_ss(SIMD_M128 x)
 {
     // FIXME: With SSE 3 this can be a dual hadd
-    __m128 a = _mm_add_ps(x, _mm_movehl_ps(x, x));
-    return _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 1)));
+    auto a = SIMD_MM(add_ps)(x, SIMD_MM(movehl_ps)(x, x));
+    return SIMD_MM(add_ss)(a, SIMD_MM(shuffle_ps)(a, a, SIMD_MM_SHUFFLE(0, 0, 0, 1)));
 }
 
-inline float sum_ps_to_float(__m128 x)
+inline float sum_ps_to_float(SIMD_M128 x)
 {
     // MSVC can ambiguously resolve this while it still lives in surge vt_dsp alas
-    __m128 r = sst::basic_blocks::mechanics::sum_ps_to_ss(x);
+    auto r = sst::basic_blocks::mechanics::sum_ps_to_ss(x);
     float f;
-    _mm_store_ss(&f, r);
+    SIMD_MM(store_ss)(&f, r);
     return f;
 }
 
@@ -53,14 +56,14 @@ inline float i2f_binary_cast(int i)
 }
 } // namespace detail
 
-const __m128 m128_mask_signbit = _mm_set1_ps(detail::i2f_binary_cast(0x80000000));
-const __m128 m128_mask_absval = _mm_set1_ps(detail::i2f_binary_cast(0x7fffffff));
+const auto m128_mask_signbit = SIMD_MM(set1_ps)(detail::i2f_binary_cast(0x80000000));
+const auto m128_mask_absval = SIMD_MM(set1_ps)(detail::i2f_binary_cast(0x7fffffff));
 
-inline __m128 abs_ps(__m128 x) { return _mm_and_ps(x, m128_mask_absval); }
+inline SIMD_M128 abs_ps(SIMD_M128 x) { return SIMD_MM(and_ps)(x, m128_mask_absval); }
 
 inline float rcp(float x)
 {
-    _mm_store_ss(&x, _mm_rcp_ss(_mm_load_ss(&x)));
+    SIMD_MM(store_ss)(&x, SIMD_MM(rcp_ss)(SIMD_MM(load_ss)(&x)));
     return x;
 }
 
diff --git a/include/sst/basic-blocks/modulators/ADSREnvelope.h b/include/sst/basic-blocks/modulators/ADSREnvelope.h
index b61d54e..f13f65c 100644
--- a/include/sst/basic-blocks/modulators/ADSREnvelope.h
+++ b/include/sst/basic-blocks/modulators/ADSREnvelope.h
@@ -30,6 +30,7 @@
 #include <cmath>
 #include <cassert>
 #include <algorithm>
+#include "sst/basic-blocks/simd/setup.h"
 #include "DiscreteStagesEnvelope.h"
 
 namespace sst::basic_blocks::modulators
@@ -209,8 +210,8 @@ struct ADSREnvelope : DiscreteStagesEnvelope<BLOCK_SIZE, RangeProvider>
         const float v_cc = 1.01f;
         float v_gate = gateActive ? v_cc : 0.f;
 
-        // discharge = _mm_and_ps(_mm_or_ps(_mm_cmpgt_ss(v_c1_delayed, one), discharge),
-        // v_gate);
+        // discharge = SIMD_MM(and_ps)(SIMD_MM(or_ps)(SIMD_MM(cmpgt_ss)(v_c1_delayed, one),
+        // discharge), v_gate);
         discharge = ((v_c1_delayed >= 1) || discharge) && gateActive;
         v_c1_delayed = v_c1;
 
diff --git a/include/sst/basic-blocks/simd/setup.h b/include/sst/basic-blocks/simd/setup.h
new file mode 100644
index 0000000..b11f773
--- /dev/null
+++ b/include/sst/basic-blocks/simd/setup.h
@@ -0,0 +1,108 @@
+/*
+ * sst-basic-blocks - an open source library of core audio utilities
+ * built by Surge Synth Team.
+ *
+ * Provides a collection of tools useful on the audio thread for blocks,
+ * modulation, etc... or useful for adapting code to multiple environments.
+ *
+ * Copyright 2023, various authors, as described in the GitHub
+ * transaction log. Parts of this code are derived from similar
+ * functions original in Surge or ShortCircuit.
+ *
+ * sst-basic-blocks is released under the GNU General Public Licence v3
+ * or later (GPL-3.0-or-later). The license is found in the "LICENSE"
+ * file in the root of this repository, or at
+ * https://www.gnu.org/licenses/gpl-3.0.en.html.
+ *
+ * A very small number of explicitly chosen header files can also be
+ * used in an MIT/BSD context. Please see the README.md file in this
+ * repo or the comments in the individual files. Only headers with an
+ * explicit mention that they are dual licensed may be copied and reused
+ * outside the GPL3 terms.
+ *
+ * All source in sst-basic-blocks available at
+ * https://github.com/surge-synthesizer/sst-basic-blocks
+ */
+
+#ifndef INCLUDE_SST_BASIC_BLOCKS_SIMD_SETUP_H
+#define INCLUDE_SST_BASIC_BLOCKS_SIMD_SETUP_H
+
+/**
+ * \page the sst/basic-blocks/simd/setup.h header provides a set of macros and defines
+ * for simd inclusion which we can use across all of our properties. It conditionally
+ * sets up https://github.com/simd-everywhere/simde in various ways, and defines some
+ * macros which, especially with the introduction of windows arm64ec builds (which define
+ * the sse intrinsics, but define them as emulation points), allow us to actually get
+ * native code on all platforms.
+ *
+ * This header is controlled by a few important macros
+ * SST_SIMD_OMIT_NATIVE_ALIASES - on ARM etc platforms do not enable the simde native aliases
+ *
+ * This header also defines a few important macros you can use
+ *
+ * Programming macros:
+ * SIMD_M128  -> can be used as an alias for __m128 in all code in all settings
+ * SIMD_MM(x) -> creates the appropriate function. Replace _mm_set1_ps(2.f) with
+ * SIMD_MM(set1_ps)(2.f)
+ *
+ * To convert pre-existing _mm code you can use scripts/fix_simd.pl on a directory
+ *
+ * Conditions:
+ * SST_SIMD_NATIVE_X86  - you are on an x86 / sse2 hardware platform
+ * SST_SIMD_ARM64EC - you are in microsoft arm64 emulation compatible mode
+ * SST_SIMD_ARM64 - you are on an arm64 platform without emulation
+ */
+
+#if (defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) ||                                  \
+     (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+#define SST_SIMD_NATIVE_X86
+#endif
+
+#if defined(_M_ARM64EC)
+#define SST_SIMD_ARM64EC
+#endif
+
+#if defined(__aarch64__) || defined(__arm64) || defined(__arm64__) || defined(_M_ARM64) ||         \
+    defined(_M_ARM64EC)
+#define SST_SIMD_ARM64
+#endif
+
+/*
+ * Include the appropriate intrinsic header
+ */
+#ifdef SST_SIMD_ARM64EC
+#include <intrin.h>
+#endif
+
+#ifdef SST_SIMD_NATIVE_X86
+#include <emmintrin.h>
+#endif
+
+/*
+ * Include SIMDE
+ */
+#ifndef SIMDE_UNAVAILABLE
+#ifdef SST_SIMD_ARM64EC
+#include <cmath>
+#endif
+
+#ifndef SST_SIMD_NATIVE_X86
+#ifndef SST_SIMD_OMIT_NATIVE_ALIASES
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#endif
+#endif
+
+#include "simde/x86/sse2.h"
+#endif
+
+#if defined(SST_SIMD_NATIVE_X86) || defined(SIMDE_UNAVAILABLE)
+#define SIMD_MM(x) _mm_##x
+#define SIMD_M128 __m128
+#define SIMD_MM_SHUFFLE _MM_SHUFFLE
+#else
+#define SIMD_MM(x) simde_mm_##x
+#define SIMD_M128 simde__m128
+#define SIMD_MM_SHUFFLE SIMDE_MM_SHUFFLE
+#endif
+
+#endif // INCLUDE_SST_BASIC_BLOCKS_SIMD_SETUP_H
diff --git a/scripts/fix_simd.pl b/scripts/fix_simd.pl
new file mode 100755
index 0000000..292dd4e
--- /dev/null
+++ b/scripts/fix_simd.pl
@@ -0,0 +1,65 @@
+#!/usr/bin/perl
+
+# fix_simd.pl: rewrite raw SSE spellings to the portable SIMD_* macros from
+# sst/basic-blocks/simd/setup.h.
+#
+# Usage: scripts/fix_simd.pl <directory>
+#
+# Recursively rewrites every .h and .cpp file under <directory> in place.
+
+use strict;
+use warnings;
+
+use File::Find;
+use File::Basename;
+use File::Copy qw(move);
+
+die "Usage: $0 <directory>\n" unless @ARGV;
+
+print("Running find on $ARGV[0]\n");
+
+find(
+    {
+        wanted => \&findfiles,
+    },
+    $ARGV[0]
+);
+
+
+# File::Find callback: rewrites the current file if it is a .h or .cpp.
+# File::Find has already chdir()ed into the file's directory, so $_ is the
+# bare file name while $File::Find::name is the path used for reporting.
+sub findfiles
+{
+    my $f = $File::Find::name;
+    my $src = $_;
+
+    # The dot is escaped so that only real .h / .cpp suffixes match.
+    return unless ($f =~ m/\.h$/ or $f =~ m/\.cpp$/);
+
+    print $f . "\n";
+
+    # Fail loudly: continuing after a failed open would replace the source
+    # file with an empty one.
+    open(my $in, '<', $src) or die "Cannot read $f: $!\n";
+    open(my $out, '>', "$src.bak") or die "Cannot write $f.bak: $!\n";
+    while (my $line = <$in>)
+    {
+        # Replacement one. __m128 foo = bar -> auto foo = bar
+        $line =~ s/\b__m128 (\S+) =/auto $1 =/g;
+
+        # Replacement two. Any other __m128 goes to SIMD_M128
+        $line =~ s/\b__m128 /SIMD_M128 /g;
+
+        # Replacement three. Any _mm_foo_bar goes to SIMD_MM(foo_bar)
+        $line =~ s/\b_mm_(\w+)\(/SIMD_MM($1)(/g;
+
+        # The \b anchors keep already converted names (simde_mm_*,
+        # SIMD_MM_SHUFFLE, simde__m128) untouched, so reruns are harmless.
+        $line =~ s/\b_MM_SHUFFLE/SIMD_MM_SHUFFLE/g;
+        print $out $line;
+    }
+    close($out) or die "Cannot finish writing $f.bak: $!\n";
+    close($in);
+    move("$src.bak", $src) or die "Cannot replace $f: $!\n";
+}
\ No newline at end of file
diff --git a/tests/block_tests.cpp b/tests/block_tests.cpp
index 66faaa3..b1b6041 100644
--- a/tests/block_tests.cpp
+++ b/tests/block_tests.cpp
@@ -25,7 +25,7 @@
  */
 
 #include "catch2.hpp"
-#include "smoke_test_sse.h"
+#include "sst/basic-blocks/simd/setup.h"
 
 #include "sst/basic-blocks/mechanics/block-ops.h"
 
diff --git a/tests/dsp_tests.cpp b/tests/dsp_tests.cpp
index 71d766e..5e39446 100644
--- a/tests/dsp_tests.cpp
+++ b/tests/dsp_tests.cpp
@@ -25,7 +25,7 @@
  */
 
 #include "catch2.hpp"
-#include "smoke_test_sse.h"
+#include "sst/basic-blocks/simd/setup.h"
 #include <cmath>
 #include <array>
 #include <iostream>
@@ -431,13 +431,13 @@ TEST_CASE("Check FastMath Functions", "[dsp]")
         for (float x = -4.9; x < 4.9; x += 0.02)
         {
             INFO("Testing unclamped at " << x);
-            auto q = _mm_set_ps1(x);
+            auto q = SIMD_MM(set_ps1)(x);
             auto r = sst::basic_blocks::dsp::fasttanhSSE(q);
             auto rn = tanh(x);
             auto rd = sst::basic_blocks::dsp::fasttanh(x);
             union
             {
-                __m128 v;
+                SIMD_M128 v;
                 float a[4];
             } U;
             U.v = r;
@@ -448,12 +448,12 @@ TEST_CASE("Check FastMath Functions", "[dsp]")
         for (float x = -10; x < 10; x += 0.02)
         {
             INFO("Testing clamped at " << x);
-            auto q = _mm_set_ps1(x);
+            auto q = SIMD_MM(set_ps1)(x);
             auto r = sst::basic_blocks::dsp::fasttanhSSEclamped(q);
             auto cn = tanh(x);
             union
             {
-                __m128 v;
+                SIMD_M128 v;
                 float a[4];
             } U;
             U.v = r;
@@ -478,13 +478,13 @@ TEST_CASE("Check FastMath Functions", "[dsp]")
         for (float x = -3.9; x < 2.9; x += 0.02)
         {
             INFO("Testing fastexp at " << x);
-            auto q = _mm_set_ps1(x);
+            auto q = SIMD_MM(set_ps1)(x);
             auto r = sst::basic_blocks::dsp::fastexpSSE(q);
             auto rn = exp(x);
             auto rd = sst::basic_blocks::dsp::fastexp(x);
             union
             {
-                __m128 v;
+                SIMD_M128 v;
                 float a[4];
             } U;
             U.v = r;
@@ -507,13 +507,13 @@ TEST_CASE("Check FastMath Functions", "[dsp]")
         for (float x = -3.14; x < 3.14; x += 0.02)
         {
             INFO("Testing unclamped at " << x);
-            auto q = _mm_set_ps1(x);
+            auto q = SIMD_MM(set_ps1)(x);
             auto r = sst::basic_blocks::dsp::fastsinSSE(q);
             auto rn = sin(x);
             auto rd = sst::basic_blocks::dsp::fastsin(x);
             union
             {
-                __m128 v;
+                SIMD_M128 v;
                 float a[4];
             } U;
             U.v = r;
@@ -528,13 +528,13 @@ TEST_CASE("Check FastMath Functions", "[dsp]")
         for (float x = -3.14; x < 3.14; x += 0.02)
         {
             INFO("Testing unclamped at " << x);
-            auto q = _mm_set_ps1(x);
+            auto q = SIMD_MM(set_ps1)(x);
             auto r = sst::basic_blocks::dsp::fastcosSSE(q);
             auto rn = cos(x);
             auto rd = sst::basic_blocks::dsp::fastcos(x);
             union
             {
-                __m128 v;
+                SIMD_M128 v;
                 float a[4];
             } U;
             U.v = r;
@@ -548,12 +548,12 @@ TEST_CASE("Check FastMath Functions", "[dsp]")
     {
         for (float f = -800.7; f < 816.4; f += 0.245)
         {
-            auto fs = _mm_set_ps1(f);
+            auto fs = SIMD_MM(set_ps1)(f);
 
             auto q = sst::basic_blocks::dsp::clampToPiRangeSSE(fs);
             union
             {
-                __m128 v;
+                SIMD_M128 v;
                 float a[4];
             } U;
             U.v = q;
@@ -573,9 +573,9 @@ TEST_CASE("SoftClip", "[dsp]")
     r[2] = 0.6;
     r[3] = 1.7;
 
-    auto v = _mm_load_ps(r);
+    auto v = SIMD_MM(load_ps)(r);
     auto c = sst::basic_blocks::dsp::softclip_ps(v);
-    _mm_store_ps(r, c);
+    SIMD_MM(store_ps)(r, c);
     REQUIRE(r[0] == Approx(-1.0).margin(0.0001));
     REQUIRE(r[1] == Approx(-0.8 - 4.f / 27.f * pow(-0.8, 3)).margin(0.0001));
     REQUIRE(r[2] == Approx(0.6 - 4.f / 27.f * pow(0.6, 3)).margin(0.0001));
diff --git a/tests/modulator_tests.cpp b/tests/modulator_tests.cpp
index 53672fb..37401ea 100644
--- a/tests/modulator_tests.cpp
+++ b/tests/modulator_tests.cpp
@@ -25,7 +25,7 @@
  */
 
 #include "catch2.hpp"
-#include "smoke_test_sse.h"
+#include "sst/basic-blocks/simd/setup.h"
 #include "sst/basic-blocks/modulators/FXModControl.h"
 
 namespace smod = sst::basic_blocks::modulators;
diff --git a/tests/param_tests.cpp b/tests/param_tests.cpp
index ca06dd3..cc910fc 100644
--- a/tests/param_tests.cpp
+++ b/tests/param_tests.cpp
@@ -25,7 +25,7 @@
  */
 
 #include "catch2.hpp"
-#include "smoke_test_sse.h"
+#include "sst/basic-blocks/simd/setup.h"
 #include <cmath>
 #include <iostream>
 #include <type_traits>
diff --git a/tests/run_envelopes.cpp b/tests/run_envelopes.cpp
index fb6f16b..4d42d74 100644
--- a/tests/run_envelopes.cpp
+++ b/tests/run_envelopes.cpp
@@ -29,7 +29,7 @@
 #include <iostream>
 #include <iomanip>
 #include <chrono>
-#include "smoke_test_sse.h"
+#include "sst/basic-blocks/simd/setup.h"
 
 #include "sst/basic-blocks/dsp/CorrelatedNoise.h"
 #include "sst/basic-blocks/dsp/Interpolators.h"
diff --git a/tests/simd_tests.cpp b/tests/simd_tests.cpp
index 673874a..6d846a4 100644
--- a/tests/simd_tests.cpp
+++ b/tests/simd_tests.cpp
@@ -25,24 +25,24 @@
  */
 
 #include "catch2.hpp"
-#include "smoke_test_sse.h"
 
+#include "sst/basic-blocks/simd/setup.h"
 #include "sst/basic-blocks/mechanics/simd-ops.h"
 
 #include <iostream>
 
 TEST_CASE("abs_ps", "[simd]")
 {
-    auto p1 = _mm_set1_ps(13.2);
-    auto p2 = _mm_set1_ps(-142.3);
+    auto p1 = SIMD_MM(set1_ps)(13.2);
+    auto p2 = SIMD_MM(set1_ps)(-142.3);
     auto ap1 = sst::basic_blocks::mechanics::abs_ps(p1);
     auto ap2 = sst::basic_blocks::mechanics::abs_ps(p2);
 
     float res alignas(16)[4];
-    _mm_store_ps(res, ap1);
+    SIMD_MM(store_ps)(res, ap1);
     REQUIRE(res[0] == Approx(13.2).margin(0.0001));
 
-    _mm_store_ps(res, ap2);
+    SIMD_MM(store_ps)(res, ap2);
     REQUIRE(res[0] == Approx(142.3).margin(0.00001));
 }
 
@@ -53,6 +53,6 @@ TEST_CASE("Sums", "[simd]")
     res[1] = 0.2;
     res[2] = 0.3;
     res[3] = 0.4;
-    auto val = _mm_load_ps(res);
+    auto val = SIMD_MM(load_ps)(res);
     REQUIRE(sst::basic_blocks::mechanics::sum_ps_to_float(val) == Approx(1.0).margin(0.00001));
 }
\ No newline at end of file
diff --git a/tests/smoke_test_sse.h b/tests/smoke_test_sse.h
deleted file mode 100644
index b008097..0000000
--- a/tests/smoke_test_sse.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * sst-basic-blocks - an open source library of core audio utilities
- * built by Surge Synth Team.
- *
- * Provides a collection of tools useful on the audio thread for blocks,
- * modulation, etc... or useful for adapting code to multiple environments.
- *
- * Copyright 2023, various authors, as described in the GitHub
- * transaction log. Parts of this code are derived from similar
- * functions original in Surge or ShortCircuit.
- *
- * sst-basic-blocks is released under the GNU General Public Licence v3
- * or later (GPL-3.0-or-later). The license is found in the "LICENSE"
- * file in the root of this repository, or at
- * https://www.gnu.org/licenses/gpl-3.0.en.html.
- *
- * A very small number of explicitly chosen header files can also be
- * used in an MIT/BSD context. Please see the README.md file in this
- * repo or the comments in the individual files. Only headers with an
- * explicit mention that they are dual licensed may be copied and reused
- * outside the GPL3 terms.
- *
- * All source in sst-basic-blocks available at
- * https://github.com/surge-synthesizer/sst-basic-blocks
- */
-
-#ifndef SST_BASIC_BLOCK_TESTS_SMOKE_TEST_SSE_H
-#define SST_BASIC_BLOCK_TESTS_SMOKE_TEST_SSE_H
-
-#if defined(__arm64__)
-#define SIMDE_ENABLE_NATIVE_ALIASES
-#include "simde/x86/sse2.h"
-#else
-#include <emmintrin.h>
-#endif
-
-#endif // SHORTCIRCUITXT_SMOKE_TEST_SSE_H
diff --git a/tests/smoketest.cpp b/tests/smoketest.cpp
index 34366f1..73d9403 100644
--- a/tests/smoketest.cpp
+++ b/tests/smoketest.cpp
@@ -30,8 +30,7 @@
 #include "catch2.hpp"
 
 #include <iostream>
-#include "smoke_test_sse.h"
-
+#include "sst/basic-blocks/simd/setup.h"
 #include "sst/basic-blocks/dsp/CorrelatedNoise.h"
 #include "sst/basic-blocks/dsp/Interpolators.h"
 #include "sst/basic-blocks/dsp/BlockInterpolators.h"
diff --git a/tests/table_tests.cpp b/tests/table_tests.cpp
index 9d13ea3..f3cb7c2 100644
--- a/tests/table_tests.cpp
+++ b/tests/table_tests.cpp
@@ -25,7 +25,7 @@
  */
 
 #include "catch2.hpp"
-#include "smoke_test_sse.h"
+#include "sst/basic-blocks/simd/setup.h"
 
 #include "sst/basic-blocks/tables/DbToLinearProvider.h"
 #include "sst/basic-blocks/tables/EqualTuningProvider.h"