diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index f89d461..e60bbd8 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -7,22 +7,48 @@ on: jobs: build_feature: - name: Build Tests + name: Test ${{ matrix.name }} runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: - os: [ ubuntu-latest, macos-latest, windows-latest ] include: - os: ubuntu-latest name: linux + runTest: true testExe: build/sst-basic-blocks-test + - os: macos-latest - name: mac + name: mac-x86 + runTest: true testExe: build/sst-basic-blocks-test + cmakeArgs: -DCMAKE_OSX_ARCHITECTURES=x86_64 + + - os: macos-latest + name: mac-arm + cmakeArgs: -DCMAKE_OSX_ARCHITECTURES=arm64 + + - os: macos-latest + name: mac-arm-nonative + cmakeArgs: -DCMAKE_OSX_ARCHITECTURES=arm64 -DSST_BASIC_BLOCKS_SIMD_OMIT_NATIVE_ALIASES=TRUE + - os: windows-latest - name: win + name: win-x86 + runTest: true testExe: build/Release/sst-basic-blocks-test.exe + - os: windows-latest + name: win-arm64 + cmakeArgs: -G"Visual Studio 17 2022" -A arm64 -DCMAKE_SYSTEM_VERSION=10 + + - os: windows-latest + name: win-arm64ec + cmakeArgs: -G"Visual Studio 17 2022" -A arm64ec -DCMAKE_SYSTEM_VERSION=10 + + - os: windows-latest + name: win-arm64-non-native + cmakeArgs: -G"Visual Studio 17 2022" -A arm64 -DCMAKE_SYSTEM_VERSION=10 -DSST_BASIC_BLOCKS_SIMD_OMIT_NATIVE_ALIASES=TRUE + steps: - name: Checkout code @@ -32,10 +58,11 @@ jobs: - name: Build Smoke test run: | - cmake -S . -B ./build -DCMAKE_BUILD_TYPE=Release -DSST_BASIC_BLOCKS_BUILD_TESTS=TRUE -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" + cmake -S . 
-B ./build -DCMAKE_BUILD_TYPE=Release ${{ matrix.cmakeArgs }} -DSST_BASIC_BLOCKS_BUILD_TESTS=TRUE cmake --build ./build --config Release - name: Run Smoke Test + if: ${{ matrix.runTest }} run: | ls ${{ matrix.testExe }} ${{ matrix.testExe }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 6814ab6..a8a0b60 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,16 @@ set(CMAKE_CXX_STANDARD 17) add_library(${PROJECT_NAME} INTERFACE) target_include_directories(${PROJECT_NAME} INTERFACE include) -target_compile_definitions(${PROJECT_NAME} INTERFACE _USE_MATH_DEFINES=1) + +if (${SST_BASIC_BLOCKS_SIMD_OMIT_NATIVE_ALIASES}) + message(STATUS "CMake Omitting Native Aliases") + target_compile_definitions(${PROJECT_NAME} INTERFACE SST_SIMD_OMIT_NATIVE_ALIASES=1) +endif() + +if (WIN32) + target_compile_definitions(${PROJECT_NAME} INTERFACE _USE_MATH_DEFINES=1) + target_compile_definitions(${PROJECT_NAME} INTERFACE NOMINMAX) +endif() if (${SST_BASIC_BLOCKS_BUILD_TESTS}) include(cmake/CPM.cmake) @@ -52,4 +61,10 @@ if (${SST_BASIC_BLOCKS_BUILD_TESTS}) message(STATUS "Keeping Catch exception handling on for more modern macOS") endif() +else() + if (NOT TARGET simde) + message(WARNING "SST Basic Blocks requires access to the 'simde' target from " + "https://github.com/simd-everywhere/simde. 
This build will only work on x86_64 architecture.") + target_compile_definitions(${PROJECT_NAME} INTERFACE SIMDE_UNAVAILABLE=1) + endif() endif () diff --git a/include/sst/basic-blocks/dsp/BlockInterpolators.h b/include/sst/basic-blocks/dsp/BlockInterpolators.h index 622e18a..e3f7427 100644 --- a/include/sst/basic-blocks/dsp/BlockInterpolators.h +++ b/include/sst/basic-blocks/dsp/BlockInterpolators.h @@ -28,6 +28,7 @@ #define INCLUDE_SST_BASIC_BLOCKS_DSP_BLOCKINTERPOLATORS_H #include +#include "sst/basic-blocks/simd/setup.h" namespace sst::basic_blocks::dsp { @@ -79,9 +80,9 @@ template struct alignas(16) lip { private: // put these at the top to preserve alignment - __m128 line[maxBlockSize >> 2]; - __m128 zeroUpByQuarters; - __m128 one, zero; + SIMD_M128 line[maxBlockSize >> 2]; + SIMD_M128 zeroUpByQuarters; + SIMD_M128 one, zero; public: static constexpr int maxRegisters{maxBlockSize >> 2}; @@ -99,9 +100,9 @@ template struct alignas(16) lip lipol_sse() { float zbq alignas(16)[4]{0.25f, 0.5f, 0.75f, 1.00f}; - zeroUpByQuarters = _mm_load_ps(zbq); - one = _mm_set1_ps(1.f); - zero = _mm_setzero_ps(); + zeroUpByQuarters = SIMD_MM(load_ps)(zbq); + one = SIMD_MM(set1_ps)(1.f); + zero = SIMD_MM(setzero_ps)(); } void set_target(float f) { @@ -152,9 +153,9 @@ template struct alignas(16) lip assert(bsQuad == -1 || bsQuad == numRegisters); for (int i = 0; i < numRegisters; ++i) { - auto iv = _mm_load_ps(in + (i << 2)); - auto ov = _mm_mul_ps(iv, line[i]); - _mm_store_ps(out + (i << 2), ov); + auto iv = SIMD_MM(load_ps)(in + (i << 2)); + auto ov = SIMD_MM(mul_ps)(iv, line[i]); + SIMD_MM(store_ps)(out + (i << 2), ov); } } @@ -163,9 +164,9 @@ template struct alignas(16) lip assert(bsQuad == -1 || bsQuad == numRegisters); for (int i = 0; i < numRegisters; ++i) { - auto iv = _mm_load_ps(in + (i << 2)); - auto ov = _mm_mul_ps(iv, line[i]); - _mm_store_ps(in + (i << 2), ov); + auto iv = SIMD_MM(load_ps)(in + (i << 2)); + auto ov = SIMD_MM(mul_ps)(iv, line[i]); + SIMD_MM(store_ps)(in + (i << 
2), ov); } } @@ -190,11 +191,11 @@ template struct alignas(16) lip assert(bsQuad == -1 || bsQuad == numRegisters); for (int i = 0; i < numRegisters; ++i) { - auto iv = _mm_load_ps(src + (i << 2)); - auto dv = _mm_load_ps(dst + (i << 2)); - auto ov = _mm_mul_ps(iv, line[i]); - auto mv = _mm_add_ps(ov, dv); - _mm_store_ps(dst + (i << 2), mv); + auto iv = SIMD_MM(load_ps)(src + (i << 2)); + auto dv = SIMD_MM(load_ps)(dst + (i << 2)); + auto ov = SIMD_MM(mul_ps)(iv, line[i]); + auto mv = SIMD_MM(add_ps)(ov, dv); + SIMD_MM(store_ps)(dst + (i << 2), mv); } } void MAC_2_blocks_to(float *__restrict src1, float *__restrict src2, float *__restrict dst1, @@ -211,12 +212,12 @@ template struct alignas(16) lip { for (int i = 0; i < numRegisters; ++i) { - auto a = _mm_load_ps(inA + (i << 2)); - auto b = _mm_load_ps(inB + (i << 2)); - auto sa = _mm_mul_ps(a, _mm_sub_ps(one, line[i])); - auto sb = _mm_mul_ps(b, line[i]); - auto r = _mm_add_ps(sa, sb); - _mm_store_ps(out + (i << 2), r); + auto a = SIMD_MM(load_ps)(inA + (i << 2)); + auto b = SIMD_MM(load_ps)(inB + (i << 2)); + auto sa = SIMD_MM(mul_ps)(a, SIMD_MM(sub_ps)(one, line[i])); + auto sb = SIMD_MM(mul_ps)(b, line[i]); + auto r = SIMD_MM(add_ps)(sa, sb); + SIMD_MM(store_ps)(out + (i << 2), r); } } @@ -238,12 +239,12 @@ template struct alignas(16) lip { for (int i = 0; i < numRegisters; ++i) { - auto a = _mm_load_ps(inAOut + (i << 2)); - auto b = _mm_load_ps(inB + (i << 2)); - auto sa = _mm_mul_ps(a, _mm_sub_ps(one, line[i])); - auto sb = _mm_mul_ps(b, line[i]); - auto r = _mm_add_ps(sa, sb); - _mm_store_ps(inAOut + (i << 2), r); + auto a = SIMD_MM(load_ps)(inAOut + (i << 2)); + auto b = SIMD_MM(load_ps)(inB + (i << 2)); + auto sa = SIMD_MM(mul_ps)(a, SIMD_MM(sub_ps)(one, line[i])); + auto sb = SIMD_MM(mul_ps)(b, line[i]); + auto r = SIMD_MM(add_ps)(sa, sb); + SIMD_MM(store_ps)(inAOut + (i << 2), r); } } @@ -261,7 +262,7 @@ template struct alignas(16) lip assert(bsQuad == -1 || bsQuad == numRegisters); for (int i = 0; i < 
numRegisters; ++i) { - _mm_store_ps(out + (i << 2), line[i]); + SIMD_MM(store_ps)(out + (i << 2), line[i]); } } @@ -279,14 +280,16 @@ template struct alignas(16) lip for (int i = 0; i < numRegisters; ++i) { - auto a = _mm_max_ps(zero, line[i]); - auto b = _mm_min_ps(zero, line[i]); - auto l = _mm_load_ps(L + (i << 2)); - auto r = _mm_load_ps(R + (i << 2)); - auto tl = _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(one, a), l), _mm_mul_ps(b, r)); - auto tr = _mm_add_ps(_mm_mul_ps(a, l), _mm_mul_ps(_mm_add_ps(one, b), r)); - _mm_store_ps(dL + (i << 2), tl); - _mm_store_ps(dR + (i << 2), tr); + auto a = SIMD_MM(max_ps)(zero, line[i]); + auto b = SIMD_MM(min_ps)(zero, line[i]); + auto l = SIMD_MM(load_ps)(L + (i << 2)); + auto r = SIMD_MM(load_ps)(R + (i << 2)); + auto tl = + SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(SIMD_MM(sub_ps)(one, a), l), SIMD_MM(mul_ps)(b, r)); + auto tr = + SIMD_MM(add_ps)(SIMD_MM(mul_ps)(a, l), SIMD_MM(mul_ps)(SIMD_MM(add_ps)(one, b), r)); + SIMD_MM(store_ps)(dL + (i << 2), tl); + SIMD_MM(store_ps)(dR + (i << 2), tr); } } @@ -304,13 +307,13 @@ template struct alignas(16) lip private: void updateLine() { - auto cs = _mm_set1_ps(current); - auto dy0 = _mm_set1_ps((target - current) * registerSizeInv); - auto dy = _mm_mul_ps(dy0, zeroUpByQuarters); + auto cs = SIMD_MM(set1_ps)(current); + auto dy0 = SIMD_MM(set1_ps)((target - current) * registerSizeInv); + auto dy = SIMD_MM(mul_ps)(dy0, zeroUpByQuarters); for (int i = 0; i < numRegisters; ++i) { - line[i] = _mm_add_ps(cs, dy); - dy = _mm_add_ps(dy, dy0); + line[i] = SIMD_MM(add_ps)(cs, dy); + dy = SIMD_MM(add_ps)(dy, dy0); } current = target; } diff --git a/include/sst/basic-blocks/dsp/Clippers.h b/include/sst/basic-blocks/dsp/Clippers.h index b5bf4a9..04c048c 100644 --- a/include/sst/basic-blocks/dsp/Clippers.h +++ b/include/sst/basic-blocks/dsp/Clippers.h @@ -27,24 +27,26 @@ #ifndef INCLUDE_SST_BASIC_BLOCKS_DSP_CLIPPERS_H #define INCLUDE_SST_BASIC_BLOCKS_DSP_CLIPPERS_H +#include "sst/basic-blocks/simd/setup.h" + 
namespace sst::basic_blocks::dsp { /** * y = x - (4/27)*x^3, x in [-1.5 .. 1.5], +/-1 otherwise */ -inline __m128 softclip_ps(__m128 in) +inline SIMD_M128 softclip_ps(SIMD_M128 in) { - const __m128 a = _mm_set1_ps(-4.f / 27.f); + const auto a = SIMD_MM(set1_ps)(-4.f / 27.f); - const __m128 x_min = _mm_set1_ps(-1.5f); - const __m128 x_max = _mm_set1_ps(1.5f); + const auto x_min = SIMD_MM(set1_ps)(-1.5f); + const auto x_max = SIMD_MM(set1_ps)(1.5f); - __m128 x = _mm_max_ps(_mm_min_ps(in, x_max), x_min); - __m128 xx = _mm_mul_ps(x, x); - __m128 t = _mm_mul_ps(x, a); - t = _mm_mul_ps(t, xx); - t = _mm_add_ps(t, x); + auto x = SIMD_MM(max_ps)(SIMD_MM(min_ps)(in, x_max), x_min); + auto xx = SIMD_MM(mul_ps)(x, x); + auto t = SIMD_MM(mul_ps)(x, a); + t = SIMD_MM(mul_ps)(t, xx); + t = SIMD_MM(add_ps)(t, x); return t; } @@ -52,7 +54,7 @@ inline __m128 softclip_ps(__m128 in) /** * y = x - (4/27/8^3)*x^3, x in [-12 .. 12], +/-12 otherwise */ -inline __m128 softclip8_ps(__m128 in) +inline SIMD_M128 softclip8_ps(SIMD_M128 in) { /* * This constant is - 4/27 / 8^3 so it "scales" the @@ -61,44 +63,44 @@ inline __m128 softclip8_ps(__m128 in) * But this is only used in one spot - in LPMOOGquad - so * we will just leave it for now */ - const __m128 a = _mm_set1_ps(-0.00028935185185f); + const auto a = SIMD_MM(set1_ps)(-0.00028935185185f); - const __m128 x_min = _mm_set1_ps(-12.f); - const __m128 x_max = _mm_set1_ps(12.f); + const auto x_min = SIMD_MM(set1_ps)(-12.f); + const auto x_max = SIMD_MM(set1_ps)(12.f); - __m128 x = _mm_max_ps(_mm_min_ps(in, x_max), x_min); - __m128 xx = _mm_mul_ps(x, x); - __m128 t = _mm_mul_ps(x, a); - t = _mm_mul_ps(t, xx); - t = _mm_add_ps(t, x); + auto x = SIMD_MM(max_ps)(SIMD_MM(min_ps)(in, x_max), x_min); + auto xx = SIMD_MM(mul_ps)(x, x); + auto t = SIMD_MM(mul_ps)(x, a); + t = SIMD_MM(mul_ps)(t, xx); + t = SIMD_MM(add_ps)(t, x); return t; } -inline __m128 tanh7_ps(__m128 v) +inline SIMD_M128 tanh7_ps(SIMD_M128 v) { - const __m128 upper_bound = 
_mm_set1_ps(1.139f); - const __m128 lower_bound = _mm_set1_ps(-1.139f); - auto x = _mm_max_ps(v, lower_bound); - x = _mm_min_ps(x, upper_bound); - - const __m128 a = _mm_set1_ps(-1.f / 3.f); - const __m128 b = _mm_set1_ps(2.f / 15.f); - const __m128 c = _mm_set1_ps(-17.f / 315.f); - const __m128 one = _mm_set1_ps(1.f); - __m128 xx = _mm_mul_ps(x, x); - __m128 y = _mm_add_ps(one, _mm_mul_ps(a, xx)); - __m128 x4 = _mm_mul_ps(xx, xx); - y = _mm_add_ps(y, _mm_mul_ps(b, x4)); - x4 = _mm_mul_ps(x4, xx); - y = _mm_add_ps(y, _mm_mul_ps(c, x4)); - return _mm_mul_ps(y, x); + const auto upper_bound = SIMD_MM(set1_ps)(1.139f); + const auto lower_bound = SIMD_MM(set1_ps)(-1.139f); + auto x = SIMD_MM(max_ps)(v, lower_bound); + x = SIMD_MM(min_ps)(x, upper_bound); + + const auto a = SIMD_MM(set1_ps)(-1.f / 3.f); + const auto b = SIMD_MM(set1_ps)(2.f / 15.f); + const auto c = SIMD_MM(set1_ps)(-17.f / 315.f); + const auto one = SIMD_MM(set1_ps)(1.f); + auto xx = SIMD_MM(mul_ps)(x, x); + auto y = SIMD_MM(add_ps)(one, SIMD_MM(mul_ps)(a, xx)); + auto x4 = SIMD_MM(mul_ps)(xx, xx); + y = SIMD_MM(add_ps)(y, SIMD_MM(mul_ps)(b, x4)); + x4 = SIMD_MM(mul_ps)(x4, xx); + y = SIMD_MM(add_ps)(y, SIMD_MM(mul_ps)(c, x4)); + return SIMD_MM(mul_ps)(y, x); } template void softclip_block(float *__restrict x) { for (unsigned int i = 0; i < blockSize; i += 4) { - _mm_store_ps(x + i, softclip_ps(_mm_load_ps(x + i))); + SIMD_MM(store_ps)(x + i, softclip_ps(SIMD_MM(load_ps)(x + i))); } } @@ -106,29 +108,31 @@ template void tanh7_block(float *__restrict x) { for (unsigned int i = 0; i < blockSize; i += 4) { - _mm_store_ps(x + i, tanh7_ps(_mm_load_ps(x + i))); + SIMD_MM(store_ps)(x + i, tanh7_ps(SIMD_MM(load_ps)(x + i))); } } template void hardclip_block(float *x) { static_assert(!(blockSize & (blockSize - 1)) && blockSize >= 4); - const __m128 x_min = _mm_set1_ps(-1.0f); - const __m128 x_max = _mm_set1_ps(1.0f); + const auto x_min = SIMD_MM(set1_ps)(-1.0f); + const auto x_max = SIMD_MM(set1_ps)(1.0f); for 
(unsigned int i = 0; i < blockSize; i += 4) { - _mm_store_ps(x + i, _mm_max_ps(_mm_min_ps(_mm_load_ps(x + i), x_max), x_min)); + SIMD_MM(store_ps) + (x + i, SIMD_MM(max_ps)(SIMD_MM(min_ps)(SIMD_MM(load_ps)(x + i), x_max), x_min)); } } template void hardclip_block8(float *x) { static_assert(!(blockSize & (blockSize - 1)) && blockSize >= 4); - const __m128 x_min = _mm_set1_ps(-8.0f); - const __m128 x_max = _mm_set1_ps(8.0f); + const auto x_min = SIMD_MM(set1_ps)(-8.0f); + const auto x_max = SIMD_MM(set1_ps)(8.0f); for (unsigned int i = 0; i < blockSize; i += 4) { - _mm_store_ps(x + i, _mm_max_ps(_mm_min_ps(_mm_load_ps(x + i), x_max), x_min)); + SIMD_MM(store_ps) + (x + i, SIMD_MM(max_ps)(SIMD_MM(min_ps)(SIMD_MM(load_ps)(x + i), x_max), x_min)); } } } // namespace sst::basic_blocks::dsp diff --git a/include/sst/basic-blocks/dsp/CorrelatedNoise.h b/include/sst/basic-blocks/dsp/CorrelatedNoise.h index 382a825..d952767 100644 --- a/include/sst/basic-blocks/dsp/CorrelatedNoise.h +++ b/include/sst/basic-blocks/dsp/CorrelatedNoise.h @@ -29,6 +29,7 @@ #include #include +#include "sst/basic-blocks/simd/setup.h" namespace sst::basic_blocks::dsp { @@ -48,8 +49,8 @@ inline float correlated_noise_o2mk2_supplied_value(float &lastval, float &lastva wf = -wfabs; float m = 1.f - wfabs; // float m = 1.f/sqrt(1.f-wfabs); - auto m1 = _mm_rsqrt_ss(_mm_load_ss(&m)); - _mm_store_ss(&m, m1); + auto m1 = SIMD_MM(rsqrt_ss)(SIMD_MM(load_ss)(&m)); + SIMD_MM(store_ss)(&m, m1); // if (wf>0.f) m *= 1 + wf*8; float rand11 = bipolarUniformRandValue; diff --git a/include/sst/basic-blocks/dsp/FastMath.h b/include/sst/basic-blocks/dsp/FastMath.h index 1db63e4..2ab6f5b 100644 --- a/include/sst/basic-blocks/dsp/FastMath.h +++ b/include/sst/basic-blocks/dsp/FastMath.h @@ -28,6 +28,7 @@ #define INCLUDE_SST_BASIC_BLOCKS_DSP_FASTMATH_H #include +#include "sst/basic-blocks/simd/setup.h" /* ** Fast Math Approximations to various Functions @@ -58,13 +59,13 @@ inline float fastsin(float x) noexcept return 
numerator / denominator; } -inline __m128 fastsinSSE(__m128 x) noexcept +inline SIMD_M128 fastsinSSE(SIMD_M128 x) noexcept { -#define M(a, b) _mm_mul_ps(a, b) -#define A(a, b) _mm_add_ps(a, b) -#define S(a, b) _mm_sub_ps(a, b) -#define F(a) _mm_set_ps1(a) -#define C(x) __m128 m##x = F((float)x) +#define M(a, b) SIMD_MM(mul_ps)(a, b) +#define A(a, b) SIMD_MM(add_ps)(a, b) +#define S(a, b) SIMD_MM(sub_ps)(a, b) +#define F(a) SIMD_MM(set_ps1)(a) +#define C(x) auto m##x = F((float)x) /* auto numerator = -x * (-(float)11511339840 + @@ -92,7 +93,7 @@ inline __m128 fastsinSSE(__m128 x) noexcept #undef A #undef S #undef F - return _mm_div_ps(num, den); + return SIMD_MM(div_ps)(num, den); } // JUCE6 Pade approximation of cos valid from -PI to PI with max error of 1e-5 and average error of @@ -105,13 +106,13 @@ inline float fastcos(float x) noexcept return numerator / denominator; } -inline __m128 fastcosSSE(__m128 x) noexcept +inline SIMD_M128 fastcosSSE(SIMD_M128 x) noexcept { -#define M(a, b) _mm_mul_ps(a, b) -#define A(a, b) _mm_add_ps(a, b) -#define S(a, b) _mm_sub_ps(a, b) -#define F(a) _mm_set_ps1(a) -#define C(x) __m128 m##x = F((float)x) +#define M(a, b) SIMD_MM(mul_ps)(a, b) +#define A(a, b) SIMD_MM(add_ps)(a, b) +#define S(a, b) SIMD_MM(sub_ps)(a, b) +#define F(a) SIMD_MM(set_ps1)(a) +#define C(x) auto m##x = F((float)x) // auto x2 = x * x; auto x2 = M(x, x); @@ -134,7 +135,7 @@ inline __m128 fastcosSSE(__m128 x) noexcept #undef A #undef S #undef F - return _mm_div_ps(num, den); + return SIMD_MM(div_ps)(num, den); } /* @@ -156,20 +157,20 @@ inline float clampToPiRange(float x) return p - M_PI; } -inline __m128 clampToPiRangeSSE(__m128 x) +inline SIMD_M128 clampToPiRangeSSE(SIMD_M128 x) { - const auto mpi = _mm_set1_ps(M_PI); - const auto m2pi = _mm_set1_ps(2.0 * M_PI); - const auto oo2p = _mm_set1_ps(1.0 / (2.0 * M_PI)); - const auto mz = _mm_setzero_ps(); - - auto y = _mm_add_ps(x, mpi); - auto yip = _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_mul_ps(y, oo2p))); - auto 
p = _mm_sub_ps(y, _mm_mul_ps(m2pi, yip)); - auto off = _mm_and_ps(_mm_cmplt_ps(p, mz), m2pi); - p = _mm_add_ps(p, off); - - return _mm_sub_ps(p, mpi); + const auto mpi = SIMD_MM(set1_ps)(M_PI); + const auto m2pi = SIMD_MM(set1_ps)(2.0 * M_PI); + const auto oo2p = SIMD_MM(set1_ps)(1.0 / (2.0 * M_PI)); + const auto mz = SIMD_MM(setzero_ps)(); + + auto y = SIMD_MM(add_ps)(x, mpi); + auto yip = SIMD_MM(cvtepi32_ps)(SIMD_MM(cvttps_epi32)(SIMD_MM(mul_ps)(y, oo2p))); + auto p = SIMD_MM(sub_ps)(y, SIMD_MM(mul_ps)(m2pi, yip)); + auto off = SIMD_MM(and_ps)(SIMD_MM(cmplt_ps)(p, mz), m2pi); + p = SIMD_MM(add_ps)(p, off); + + return SIMD_MM(sub_ps)(p, mpi); } /* @@ -192,14 +193,14 @@ inline float fasttan(float x) noexcept return numerator / denominator; } -inline __m128 fasttanhSSE(__m128 x) +inline SIMD_M128 fasttanhSSE(SIMD_M128 x) { - const __m128 m135135 = _mm_set_ps1(135135), m17325 = _mm_set_ps1(17325), - m378 = _mm_set_ps1(378), m62370 = _mm_set_ps1(62370), m3150 = _mm_set_ps1(3150), - m28 = _mm_set_ps1(28); + const auto m135135 = SIMD_MM(set_ps1)(135135), m17325 = SIMD_MM(set_ps1)(17325), + m378 = SIMD_MM(set_ps1)(378), m62370 = SIMD_MM(set_ps1)(62370), + m3150 = SIMD_MM(set_ps1)(3150), m28 = SIMD_MM(set_ps1)(28); -#define M(a, b) _mm_mul_ps(a, b) -#define A(a, b) _mm_add_ps(a, b) +#define M(a, b) SIMD_MM(mul_ps)(a, b) +#define A(a, b) SIMD_MM(add_ps)(a, b) auto x2 = M(x, x); auto num = M(x, A(m135135, M(x2, A(m17325, M(x2, A(m378, x2)))))); @@ -208,12 +209,12 @@ inline __m128 fasttanhSSE(__m128 x) #undef M #undef A - return _mm_div_ps(num, den); + return SIMD_MM(div_ps)(num, den); } -inline __m128 fasttanhSSEclamped(__m128 x) +inline SIMD_M128 fasttanhSSEclamped(SIMD_M128 x) { - auto xc = _mm_min_ps(_mm_set_ps1(5), _mm_max_ps(_mm_set_ps1(-5), x)); + auto xc = SIMD_MM(min_ps)(SIMD_MM(set_ps1)(5), SIMD_MM(max_ps)(SIMD_MM(set_ps1)(-5), x)); return fasttanhSSE(xc); } @@ -227,19 +228,19 @@ inline float fastexp(float x) noexcept return numerator / denominator; } -inline 
__m128 fastexpSSE(__m128 x) noexcept +inline SIMD_M128 fastexpSSE(SIMD_M128 x) noexcept { -#define M(a, b) _mm_mul_ps(a, b) -#define A(a, b) _mm_add_ps(a, b) -#define F(a) _mm_set_ps1(a) +#define M(a, b) SIMD_MM(mul_ps)(a, b) +#define A(a, b) SIMD_MM(add_ps)(a, b) +#define F(a) SIMD_MM(set_ps1)(a) - const __m128 m1680 = F(1680), m840 = F(840), mneg840 = F(-840), m180 = F(180), m20 = F(20), - mneg20 = F(-20); + const auto m1680 = F(1680), m840 = F(840), mneg840 = F(-840), m180 = F(180), m20 = F(20), + mneg20 = F(-20); auto num = A(m1680, M(x, A(m840, M(x, A(m180, M(x, A(m20, x))))))); auto den = A(m1680, M(x, A(mneg840, M(x, A(m180, M(x, A(mneg20, x))))))); - return _mm_div_ps(num, den); + return SIMD_MM(div_ps)(num, den); #undef M #undef A diff --git a/include/sst/basic-blocks/dsp/HilbertTransform.h b/include/sst/basic-blocks/dsp/HilbertTransform.h index 5ad5ea2..d6a346a 100644 --- a/include/sst/basic-blocks/dsp/HilbertTransform.h +++ b/include/sst/basic-blocks/dsp/HilbertTransform.h @@ -59,6 +59,8 @@ #include #include +#include "sst/basic-blocks/simd/setup.h" + namespace sst::basic_blocks::dsp { struct HilbertTransformMonoFloat @@ -171,18 +173,18 @@ struct HilbertTransformStereoSSE */ struct BQ { - __m128 a1{1}, a2{0}, b0{1}, b1{0}, b2{0}, reg0{0}, reg1{0}; + SIMD_M128 a1{1}, a2{0}, b0{1}, b1{0}, b2{0}, reg0{0}, reg1{0}; inline void reset() { - reg0 = _mm_setzero_ps(); - reg1 = _mm_setzero_ps(); + reg0 = SIMD_MM(setzero_ps)(); + reg1 = SIMD_MM(setzero_ps)(); } - inline void setOne(int idx, __m128 &on, float f) + inline void setOne(int idx, SIMD_M128 &on, float f) { float r alignas(16)[4]; - _mm_store_ps(r, on); + SIMD_MM(store_ps)(r, on); r[idx] = f; - on = _mm_load_ps(r); + on = SIMD_MM(load_ps)(r); } inline void setCoefs(int idx, float _a1, float _a2, float _b0, float _b1, float _b2) { @@ -193,11 +195,12 @@ struct HilbertTransformStereoSSE setOne(idx, b2, _b2); } - inline __m128 step(__m128 input) + inline SIMD_M128 step(SIMD_M128 input) { - auto op = 
_mm_add_ps(_mm_mul_ps(input, b0), reg0); - reg0 = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(input, b1), _mm_mul_ps(a1, op)), reg1); - reg1 = _mm_sub_ps(_mm_mul_ps(input, b2), _mm_mul_ps(a2, op)); + auto op = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(input, b0), reg0); + reg0 = SIMD_MM(add_ps)( + SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(input, b1), SIMD_MM(mul_ps)(a1, op)), reg1); + reg1 = SIMD_MM(sub_ps)(SIMD_MM(mul_ps)(input, b2), SIMD_MM(mul_ps)(a2, op)); return op; } @@ -260,10 +263,10 @@ struct HilbertTransformStereoSSE } // Returns reL, imL, reR, imR - __m128 stepStereo(float L, float R) + SIMD_M128 stepStereo(float L, float R) { float r alignas(16)[4]{L, L, R, R}; - auto in = _mm_load_ps(r); + auto in = SIMD_MM(load_ps)(r); for (int i = 0; i < 3; ++i) { in = allpassSSE[i].step(in); @@ -275,7 +278,7 @@ struct HilbertTransformStereoSSE { auto v = stepStereo(L, R); float r alignas(16)[4]; - _mm_store_ps(r, v); + SIMD_MM(store_ps)(r, v); return {{r[0], r[1]}, {r[2], r[3]}}; } @@ -283,7 +286,7 @@ struct HilbertTransformStereoSSE { auto v = stepStereo(L, R); float r alignas(16)[4]; - _mm_store_ps(r, v); + SIMD_MM(store_ps)(r, v); return {{r[0], r[1]}, {r[2], r[3]}}; } }; diff --git a/include/sst/basic-blocks/dsp/LanczosResampler.h b/include/sst/basic-blocks/dsp/LanczosResampler.h index e248be5..9c131c9 100644 --- a/include/sst/basic-blocks/dsp/LanczosResampler.h +++ b/include/sst/basic-blocks/dsp/LanczosResampler.h @@ -50,6 +50,7 @@ #include #include #include +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/mechanics/simd-ops.h" namespace sst::basic_blocks::dsp @@ -168,24 +169,24 @@ template struct LanczosResampler int tidx = (int)(off0byto); double fidx = (off0byto - tidx); - auto fl = _mm_set1_ps((float)fidx); - auto f0 = _mm_load_ps(&lanczosTable[tidx][0]); - auto df0 = _mm_load_ps(&lanczosTableDX[tidx][0]); + auto fl = SIMD_MM(set1_ps)((float)fidx); + auto f0 = SIMD_MM(load_ps)(&lanczosTable[tidx][0]); + auto df0 = SIMD_MM(load_ps)(&lanczosTableDX[tidx][0]); - f0 = 
_mm_add_ps(f0, _mm_mul_ps(df0, fl)); + f0 = SIMD_MM(add_ps)(f0, SIMD_MM(mul_ps)(df0, fl)); - auto f1 = _mm_load_ps(&lanczosTable[tidx][4]); - auto df1 = _mm_load_ps(&lanczosTableDX[tidx][4]); - f1 = _mm_add_ps(f1, _mm_mul_ps(df1, fl)); + auto f1 = SIMD_MM(load_ps)(&lanczosTable[tidx][4]); + auto df1 = SIMD_MM(load_ps)(&lanczosTableDX[tidx][4]); + f1 = SIMD_MM(add_ps)(f1, SIMD_MM(mul_ps)(df1, fl)); - auto d0 = _mm_loadu_ps(&input[0][idx0 - A]); - auto d1 = _mm_loadu_ps(&input[0][idx0]); - auto rv = _mm_add_ps(_mm_mul_ps(f0, d0), _mm_mul_ps(f1, d1)); + auto d0 = SIMD_MM(loadu_ps)(&input[0][idx0 - A]); + auto d1 = SIMD_MM(loadu_ps)(&input[0][idx0]); + auto rv = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f0, d0), SIMD_MM(mul_ps)(f1, d1)); L = mechanics::sum_ps_to_float(rv); - d0 = _mm_loadu_ps(&input[1][idx0 - A]); - d1 = _mm_loadu_ps(&input[1][idx0]); - rv = _mm_add_ps(_mm_mul_ps(f0, d0), _mm_mul_ps(f1, d1)); + d0 = SIMD_MM(loadu_ps)(&input[1][idx0 - A]); + d1 = SIMD_MM(loadu_ps)(&input[1][idx0]); + rv = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(f0, d0), SIMD_MM(mul_ps)(f1, d1)); R = mechanics::sum_ps_to_float(rv); } diff --git a/include/sst/basic-blocks/dsp/SSESincDelayLine.h b/include/sst/basic-blocks/dsp/SSESincDelayLine.h index 5220d56..8a53cff 100644 --- a/include/sst/basic-blocks/dsp/SSESincDelayLine.h +++ b/include/sst/basic-blocks/dsp/SSESincDelayLine.h @@ -27,6 +27,7 @@ #ifndef INCLUDE_SST_BASIC_BLOCKS_DSP_SSESINCDELAYLINE_H #define INCLUDE_SST_BASIC_BLOCKS_DSP_SSESINCDELAYLINE_H +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/mechanics/simd-ops.h" #include "sst/basic-blocks/tables/SincTableProvider.h" @@ -80,20 +81,20 @@ struct SSESincDelayLine int readPtr = (wp - iDelay - (stp::FIRipol_N >> 1)) & (COMB_SIZE - 1); // And so now do what we do in COMBSSE2Quad - __m128 a = _mm_loadu_ps(&buffer[readPtr]); - __m128 b = _mm_loadu_ps(&sinctable[sincTableOffset]); - __m128 o = _mm_mul_ps(a, b); + auto a = SIMD_MM(loadu_ps)(&buffer[readPtr]); + auto b = 
SIMD_MM(loadu_ps)(&sinctable[sincTableOffset]); + auto o = SIMD_MM(mul_ps)(a, b); - a = _mm_loadu_ps(&buffer[readPtr + 4]); - b = _mm_loadu_ps(&sinctable[sincTableOffset + 4]); - o = _mm_add_ps(o, _mm_mul_ps(a, b)); + a = SIMD_MM(loadu_ps)(&buffer[readPtr + 4]); + b = SIMD_MM(loadu_ps)(&sinctable[sincTableOffset + 4]); + o = SIMD_MM(add_ps)(o, SIMD_MM(mul_ps)(a, b)); - a = _mm_loadu_ps(&buffer[readPtr + 8]); - b = _mm_loadu_ps(&sinctable[sincTableOffset + 8]); - o = _mm_add_ps(o, _mm_mul_ps(a, b)); + a = SIMD_MM(loadu_ps)(&buffer[readPtr + 8]); + b = SIMD_MM(loadu_ps)(&sinctable[sincTableOffset + 8]); + o = SIMD_MM(add_ps)(o, SIMD_MM(mul_ps)(a, b)); float res; - _mm_store_ss(&res, sst::basic_blocks::mechanics::sum_ps_to_ss(o)); + SIMD_MM(store_ss)(&res, sst::basic_blocks::mechanics::sum_ps_to_ss(o)); return res; } diff --git a/include/sst/basic-blocks/mechanics/simd-ops.h b/include/sst/basic-blocks/mechanics/simd-ops.h index 5227b0d..a9cc743 100644 --- a/include/sst/basic-blocks/mechanics/simd-ops.h +++ b/include/sst/basic-blocks/mechanics/simd-ops.h @@ -23,24 +23,27 @@ * All source in sst-basic-blocks available at * https://github.com/surge-synthesizer/sst-basic-blocks */ + #ifndef INCLUDE_SST_BASIC_BLOCKS_MECHANICS_SIMD_OPS_H #define INCLUDE_SST_BASIC_BLOCKS_MECHANICS_SIMD_OPS_H +#include "sst/basic-blocks/simd/setup.h" + namespace sst::basic_blocks::mechanics { -inline __m128 sum_ps_to_ss(__m128 x) +inline SIMD_M128 sum_ps_to_ss(SIMD_M128 x) { // FIXME: With SSE 3 this can be a dual hadd - __m128 a = _mm_add_ps(x, _mm_movehl_ps(x, x)); - return _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 1))); + auto a = SIMD_MM(add_ps)(x, SIMD_MM(movehl_ps)(x, x)); + return SIMD_MM(add_ss)(a, SIMD_MM(shuffle_ps)(a, a, SIMD_MM_SHUFFLE(0, 0, 0, 1))); } -inline float sum_ps_to_float(__m128 x) +inline float sum_ps_to_float(SIMD_M128 x) { // MSVC can ambiguously resolve this while it still lives in surge vt_dsp alas - __m128 r = 
sst::basic_blocks::mechanics::sum_ps_to_ss(x); + auto r = sst::basic_blocks::mechanics::sum_ps_to_ss(x); float f; - _mm_store_ss(&f, r); + SIMD_MM(store_ss)(&f, r); return f; } @@ -53,14 +56,14 @@ inline float i2f_binary_cast(int i) } } // namespace detail -const __m128 m128_mask_signbit = _mm_set1_ps(detail::i2f_binary_cast(0x80000000)); -const __m128 m128_mask_absval = _mm_set1_ps(detail::i2f_binary_cast(0x7fffffff)); +const auto m128_mask_signbit = SIMD_MM(set1_ps)(detail::i2f_binary_cast(0x80000000)); +const auto m128_mask_absval = SIMD_MM(set1_ps)(detail::i2f_binary_cast(0x7fffffff)); -inline __m128 abs_ps(__m128 x) { return _mm_and_ps(x, m128_mask_absval); } +inline SIMD_M128 abs_ps(SIMD_M128 x) { return SIMD_MM(and_ps)(x, m128_mask_absval); } inline float rcp(float x) { - _mm_store_ss(&x, _mm_rcp_ss(_mm_load_ss(&x))); + SIMD_MM(store_ss)(&x, SIMD_MM(rcp_ss)(SIMD_MM(load_ss)(&x))); return x; } diff --git a/include/sst/basic-blocks/modulators/ADSREnvelope.h b/include/sst/basic-blocks/modulators/ADSREnvelope.h index b61d54e..f13f65c 100644 --- a/include/sst/basic-blocks/modulators/ADSREnvelope.h +++ b/include/sst/basic-blocks/modulators/ADSREnvelope.h @@ -30,6 +30,7 @@ #include #include #include +#include "sst/basic-blocks/simd/setup.h" #include "DiscreteStagesEnvelope.h" namespace sst::basic_blocks::modulators @@ -209,8 +210,8 @@ struct ADSREnvelope : DiscreteStagesEnvelope const float v_cc = 1.01f; float v_gate = gateActive ? 
v_cc : 0.f; - // discharge = _mm_and_ps(_mm_or_ps(_mm_cmpgt_ss(v_c1_delayed, one), discharge), - // v_gate); + // discharge = SIMD_MM(and_ps)(SIMD_MM(or_ps)(SIMD_MM(cmpgt_ss)(v_c1_delayed, one), + // discharge), v_gate); discharge = ((v_c1_delayed >= 1) || discharge) && gateActive; v_c1_delayed = v_c1; diff --git a/include/sst/basic-blocks/simd/setup.h b/include/sst/basic-blocks/simd/setup.h new file mode 100644 index 0000000..b11f773 --- /dev/null +++ b/include/sst/basic-blocks/simd/setup.h @@ -0,0 +1,108 @@ +/* + * sst-basic-blocks - an open source library of core audio utilities + * built by Surge Synth Team. + * + * Provides a collection of tools useful on the audio thread for blocks, + * modulation, etc... or useful for adapting code to multiple environments. + * + * Copyright 2023, various authors, as described in the GitHub + * transaction log. Parts of this code are derived from similar + * functions original in Surge or ShortCircuit. + * + * sst-basic-blocks is released under the GNU General Public Licence v3 + * or later (GPL-3.0-or-later). The license is found in the "LICENSE" + * file in the root of this repository, or at + * https://www.gnu.org/licenses/gpl-3.0.en.html. + * + * A very small number of explicitly chosen header files can also be + * used in an MIT/BSD context. Please see the README.md file in this + * repo or the comments in the individual files. Only headers with an + * explicit mention that they are dual licensed may be copied and reused + * outside the GPL3 terms. 
+ * + * All source in sst-basic-blocks available at + * https://github.com/surge-synthesizer/sst-basic-blocks + */ + +#ifndef INCLUDE_SST_BASIC_BLOCKS_SIMD_SETUP_H +#define INCLUDE_SST_BASIC_BLOCKS_SIMD_SETUP_H + +/** + * \page the sst/basic-blocks/simd/setup.h header provides a set of macros and defines + * for simd inclusion which we can use across all of our properties, which conditionally + * sets up the https://github.com/simd-everywhere/simde library in various ways, and defines some + * macros which, especially with the introduction of windows arm64ec builds (which define + * the sse intrinsics, but define them as emulation points) allow us to actually get + * native code on all platforms + * + * This header is controlled by a few important macros + * SST_SIMD_OMIT_NATIVE_ALIASES - on ARM etc platforms do not enable the simde native aliases + * + * This header also defines a few important macros you can use + * + * Programming macros: + * SIMD_M128 -> can be used as an alias for __m128 in all code in all settings + * SIMD_MM(x) -> creates the appropriate function. 
Replace _mm_set1_ps(2.f) with + * SIMD_MM(set1_ps)(2.f) + * + * To convert pre-existing _mm code you can run scripts/fix_simd.pl on a directory + * + * Conditions: + * SST_SIMD_NATIVE_X86 - you are on an x86 / sse2 hardware platform + * SST_SIMD_ARM64EC - you are in microsoft arm64 emulation compatible mode + * SST_SIMD_ARM64 - you are on an arm64 platform (also defined in arm64ec mode) + */ + +#if (defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 2)) +#define SST_SIMD_NATIVE_X86 +#endif + +#if defined(_M_ARM64EC) +#define SST_SIMD_ARM64EC +#endif + +#if defined(__aarch64__) || defined(__arm64) || defined(__arm64__) || defined(_M_ARM64) || \ + defined(_M_ARM64EC) +#define SST_SIMD_ARM64 +#endif + +/* + * Include the appropriate intrinsic header + */ +#ifdef SST_SIMD_ARM64EC +#include +#endif + +#ifdef SST_SIMD_NATIVE_X86 +#include +#endif + +/* + * Include SIMDE + */ +#ifndef SIMDE_UNAVAILABLE +#ifdef SST_SIMD_ARM64EC +#include +#endif + +#ifndef SST_SIMD_NATIVE_X86 +#ifndef SST_SIMD_OMIT_NATIVE_ALIASES +#define SIMDE_ENABLE_NATIVE_ALIASES +#endif +#endif + +#include "simde/x86/sse2.h" +#endif + +#if defined(SST_SIMD_NATIVE_X86) || defined(SIMDE_UNAVAILABLE) +#define SIMD_MM(x) _mm_##x +#define SIMD_M128 __m128 +#define SIMD_MM_SHUFFLE _MM_SHUFFLE +#else +#define SIMD_MM(x) simde_mm_##x +#define SIMD_M128 simde__m128 +#define SIMD_MM_SHUFFLE SIMDE_MM_SHUFFLE +#endif + +#endif // SETUP_H diff --git a/scripts/fix_simd.pl b/scripts/fix_simd.pl new file mode 100755 index 0000000..292dd4e --- /dev/null +++ b/scripts/fix_simd.pl @@ -0,0 +1,48 @@ +#!/usr/bin/perl + + +use File::Find; +use File::Basename; + +print("Running find on $ARGV[0]"); + +find( + { + wanted => \&findfiles, + }, + $ARGV[0] +); + + +sub findfiles +{ + $f = $File::Find::name; + + + if ($f =~ m/\.h$/ or $f =~ m/.cpp$/) + { + print $f . "\n"; + + $src = $_; + open(IN, "< $_"); + open(OUT, "> $_.bak"); + while () + { + # Replacement one. 
__m128 foo = bar -> auto foo = bar + s/__m128 (\S+) =/auto $1 =/g; + + # Replacement two. Any other __m128 goes to SIMD_M128 + s/__m128 /SIMD_M128 /g; + + # Replacement three. Any _mm_foo_bar goes to SIMD_MM(foo_bar) + s/_mm_([^\(]+)\(/SIMD_MM($1)(/g; + + s/_MM_SHUFFLE/SIMD_MM_SHUFFLE/g; + print OUT; + } + close(OUT); + close(IN); + system("mv $src.bak $src"); + # die "foo"; + } +} \ No newline at end of file diff --git a/tests/block_tests.cpp b/tests/block_tests.cpp index 66faaa3..b1b6041 100644 --- a/tests/block_tests.cpp +++ b/tests/block_tests.cpp @@ -25,7 +25,7 @@ */ #include "catch2.hpp" -#include "smoke_test_sse.h" +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/mechanics/block-ops.h" diff --git a/tests/dsp_tests.cpp b/tests/dsp_tests.cpp index 71d766e..5e39446 100644 --- a/tests/dsp_tests.cpp +++ b/tests/dsp_tests.cpp @@ -25,7 +25,7 @@ */ #include "catch2.hpp" -#include "smoke_test_sse.h" +#include "sst/basic-blocks/simd/setup.h" #include #include #include @@ -431,13 +431,13 @@ TEST_CASE("Check FastMath Functions", "[dsp]") for (float x = -4.9; x < 4.9; x += 0.02) { INFO("Testing unclamped at " << x); - auto q = _mm_set_ps1(x); + auto q = SIMD_MM(set_ps1)(x); auto r = sst::basic_blocks::dsp::fasttanhSSE(q); auto rn = tanh(x); auto rd = sst::basic_blocks::dsp::fasttanh(x); union { - __m128 v; + SIMD_M128 v; float a[4]; } U; U.v = r; @@ -448,12 +448,12 @@ TEST_CASE("Check FastMath Functions", "[dsp]") for (float x = -10; x < 10; x += 0.02) { INFO("Testing clamped at " << x); - auto q = _mm_set_ps1(x); + auto q = SIMD_MM(set_ps1)(x); auto r = sst::basic_blocks::dsp::fasttanhSSEclamped(q); auto cn = tanh(x); union { - __m128 v; + SIMD_M128 v; float a[4]; } U; U.v = r; @@ -478,13 +478,13 @@ TEST_CASE("Check FastMath Functions", "[dsp]") for (float x = -3.9; x < 2.9; x += 0.02) { INFO("Testing fastexp at " << x); - auto q = _mm_set_ps1(x); + auto q = SIMD_MM(set_ps1)(x); auto r = sst::basic_blocks::dsp::fastexpSSE(q); auto rn = exp(x); auto 
rd = sst::basic_blocks::dsp::fastexp(x); union { - __m128 v; + SIMD_M128 v; float a[4]; } U; U.v = r; @@ -507,13 +507,13 @@ TEST_CASE("Check FastMath Functions", "[dsp]") for (float x = -3.14; x < 3.14; x += 0.02) { INFO("Testing unclamped at " << x); - auto q = _mm_set_ps1(x); + auto q = SIMD_MM(set_ps1)(x); auto r = sst::basic_blocks::dsp::fastsinSSE(q); auto rn = sin(x); auto rd = sst::basic_blocks::dsp::fastsin(x); union { - __m128 v; + SIMD_M128 v; float a[4]; } U; U.v = r; @@ -528,13 +528,13 @@ TEST_CASE("Check FastMath Functions", "[dsp]") for (float x = -3.14; x < 3.14; x += 0.02) { INFO("Testing unclamped at " << x); - auto q = _mm_set_ps1(x); + auto q = SIMD_MM(set_ps1)(x); auto r = sst::basic_blocks::dsp::fastcosSSE(q); auto rn = cos(x); auto rd = sst::basic_blocks::dsp::fastcos(x); union { - __m128 v; + SIMD_M128 v; float a[4]; } U; U.v = r; @@ -548,12 +548,12 @@ TEST_CASE("Check FastMath Functions", "[dsp]") { for (float f = -800.7; f < 816.4; f += 0.245) { - auto fs = _mm_set_ps1(f); + auto fs = SIMD_MM(set_ps1)(f); auto q = sst::basic_blocks::dsp::clampToPiRangeSSE(fs); union { - __m128 v; + SIMD_M128 v; float a[4]; } U; U.v = q; @@ -573,9 +573,9 @@ TEST_CASE("SoftClip", "[dsp]") r[2] = 0.6; r[3] = 1.7; - auto v = _mm_load_ps(r); + auto v = SIMD_MM(load_ps)(r); auto c = sst::basic_blocks::dsp::softclip_ps(v); - _mm_store_ps(r, c); + SIMD_MM(store_ps)(r, c); REQUIRE(r[0] == Approx(-1.0).margin(0.0001)); REQUIRE(r[1] == Approx(-0.8 - 4.f / 27.f * pow(-0.8, 3)).margin(0.0001)); REQUIRE(r[2] == Approx(0.6 - 4.f / 27.f * pow(0.6, 3)).margin(0.0001)); diff --git a/tests/modulator_tests.cpp b/tests/modulator_tests.cpp index 53672fb..37401ea 100644 --- a/tests/modulator_tests.cpp +++ b/tests/modulator_tests.cpp @@ -25,7 +25,7 @@ */ #include "catch2.hpp" -#include "smoke_test_sse.h" +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/modulators/FXModControl.h" namespace smod = sst::basic_blocks::modulators; diff --git a/tests/param_tests.cpp 
b/tests/param_tests.cpp index ca06dd3..cc910fc 100644 --- a/tests/param_tests.cpp +++ b/tests/param_tests.cpp @@ -25,7 +25,7 @@ */ #include "catch2.hpp" -#include "smoke_test_sse.h" +#include "sst/basic-blocks/simd/setup.h" #include #include #include diff --git a/tests/run_envelopes.cpp b/tests/run_envelopes.cpp index fb6f16b..4d42d74 100644 --- a/tests/run_envelopes.cpp +++ b/tests/run_envelopes.cpp @@ -29,7 +29,7 @@ #include #include #include -#include "smoke_test_sse.h" +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/dsp/CorrelatedNoise.h" #include "sst/basic-blocks/dsp/Interpolators.h" diff --git a/tests/simd_tests.cpp b/tests/simd_tests.cpp index 673874a..6d846a4 100644 --- a/tests/simd_tests.cpp +++ b/tests/simd_tests.cpp @@ -25,24 +25,24 @@ */ #include "catch2.hpp" -#include "smoke_test_sse.h" +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/mechanics/simd-ops.h" #include TEST_CASE("abs_ps", "[simd]") { - auto p1 = _mm_set1_ps(13.2); - auto p2 = _mm_set1_ps(-142.3); + auto p1 = SIMD_MM(set1_ps)(13.2); + auto p2 = SIMD_MM(set1_ps)(-142.3); auto ap1 = sst::basic_blocks::mechanics::abs_ps(p1); auto ap2 = sst::basic_blocks::mechanics::abs_ps(p2); float res alignas(16)[4]; - _mm_store_ps(res, ap1); + SIMD_MM(store_ps)(res, ap1); REQUIRE(res[0] == Approx(13.2).margin(0.0001)); - _mm_store_ps(res, ap2); + SIMD_MM(store_ps)(res, ap2); REQUIRE(res[0] == Approx(142.3).margin(0.00001)); } @@ -53,6 +53,6 @@ TEST_CASE("Sums", "[simd]") res[1] = 0.2; res[2] = 0.3; res[3] = 0.4; - auto val = _mm_load_ps(res); + auto val = SIMD_MM(load_ps)(res); REQUIRE(sst::basic_blocks::mechanics::sum_ps_to_float(val) == Approx(1.0).margin(0.00001)); } \ No newline at end of file diff --git a/tests/smoke_test_sse.h b/tests/smoke_test_sse.h deleted file mode 100644 index b008097..0000000 --- a/tests/smoke_test_sse.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * sst-basic-blocks - an open source library of core audio utilities - * built by Surge Synth Team. 
- * - * Provides a collection of tools useful on the audio thread for blocks, - * modulation, etc... or useful for adapting code to multiple environments. - * - * Copyright 2023, various authors, as described in the GitHub - * transaction log. Parts of this code are derived from similar - * functions original in Surge or ShortCircuit. - * - * sst-basic-blocks is released under the GNU General Public Licence v3 - * or later (GPL-3.0-or-later). The license is found in the "LICENSE" - * file in the root of this repository, or at - * https://www.gnu.org/licenses/gpl-3.0.en.html. - * - * A very small number of explicitly chosen header files can also be - * used in an MIT/BSD context. Please see the README.md file in this - * repo or the comments in the individual files. Only headers with an - * explicit mention that they are dual licensed may be copied and reused - * outside the GPL3 terms. - * - * All source in sst-basic-blocks available at - * https://github.com/surge-synthesizer/sst-basic-blocks - */ - -#ifndef SST_BASIC_BLOCK_TESTS_SMOKE_TEST_SSE_H -#define SST_BASIC_BLOCK_TESTS_SMOKE_TEST_SSE_H - -#if defined(__arm64__) -#define SIMDE_ENABLE_NATIVE_ALIASES -#include "simde/x86/sse2.h" -#else -#include -#endif - -#endif // SHORTCIRCUITXT_SMOKE_TEST_SSE_H diff --git a/tests/smoketest.cpp b/tests/smoketest.cpp index 34366f1..73d9403 100644 --- a/tests/smoketest.cpp +++ b/tests/smoketest.cpp @@ -30,8 +30,7 @@ #include "catch2.hpp" #include -#include "smoke_test_sse.h" - +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/dsp/CorrelatedNoise.h" #include "sst/basic-blocks/dsp/Interpolators.h" #include "sst/basic-blocks/dsp/BlockInterpolators.h" diff --git a/tests/table_tests.cpp b/tests/table_tests.cpp index 9d13ea3..f3cb7c2 100644 --- a/tests/table_tests.cpp +++ b/tests/table_tests.cpp @@ -25,7 +25,7 @@ */ #include "catch2.hpp" -#include "smoke_test_sse.h" +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/tables/DbToLinearProvider.h" 
#include "sst/basic-blocks/tables/EqualTuningProvider.h"