diff --git a/source/inteli/avx2intrin.d b/source/inteli/avx2intrin.d index c6d545e..2929d22 100644 --- a/source/inteli/avx2intrin.d +++ b/source/inteli/avx2intrin.d @@ -1675,19 +1675,7 @@ __m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @trusted return cast(__m256i)( (greater & sa) | (~greater & sb) ); } else - { - // good with GDC, but not LDC - short16 sa = cast(short16)a; - short16 sb = cast(short16)b; - short16 r; - for (int n = 0; n < 16; ++n) - { - ushort ua = sa.array[n]; - ushort ub = sb.array[n]; - r.ptr[n] = ua > ub ? ua : ub; - } - return cast(__m256i)r; - } + static assert(false); } unittest { @@ -1836,7 +1824,51 @@ unittest // TODO __m256i _mm256_min_epi8 (__m256i a, __m256i b) pure @safe -// TODO __m256i _mm256_min_epu16 (__m256i a, __m256i b) pure @safe + +/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed minimum values. +__m256i _mm256_min_epu16 (__m256i a, __m256i b) pure @trusted +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pminuw256(cast(short16)a, cast(short16)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_min_epu16(a_lo, b_lo); + __m128i r_hi = _mm_min_epu16(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + // ??? catastrophic with GDC x86_64, good with LDC + short16 sa = cast(short16)a; + short16 sb = cast(short16)b; + short16 greater = cast(short16)(cast(ushort16)sa > cast(ushort16)sb); + return cast(__m256i)( (~greater & sa) | (greater & sb) ); + } + else + static assert(false); +} +unittest +{ + short16 R = cast(short16) _mm256_min_epu16(_mm256_setr_epi16(32767, 1, -4, -8, 9, 7, 0,-57, 1, 0, 0, 0, 1, 0, 0, -6), + _mm256_setr_epi16( -4, -8, 9, 7, 0,-32768, 0, 0, 0, 2, 0, 4, 2, 1, 2, -4)); + short[16] correct = [32767, 1, 9, 7, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, -6]; + assert(R.array == correct); +} /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed minimum values. __m256i _mm256_min_epu32 (__m256i a, __m256i b) pure @safe diff --git a/source/inteli/smmintrin.d b/source/inteli/smmintrin.d index e199e3d..e9871fe 100644 --- a/source/inteli/smmintrin.d +++ b/source/inteli/smmintrin.d @@ -1427,7 +1427,7 @@ unittest } /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. -__m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted +__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted { // PERF DMD static if (GDC_with_SSE41)