Skip to content

Commit

Permalink
Remove AVX-512 presence enums.
Browse files Browse the repository at this point in the history
Add first two F16C intrinsics (only generic path for now, taken from stb_image_resize2.h).
  • Loading branch information
Guillaume Piolat committed Nov 25, 2024
1 parent 95c610c commit a9cb0e6
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 99 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
| SSE4.2| Yes but ([#42](https://github.com/AuburnSounds/intel-intrinsics/issues/42)) | Yes (`-mattr=+sse4.2`) | Yes (`-mattr=+crc`) | Yes (`-msse4.2`) |
| BMI2 | Yes but ([#42](https://github.com/AuburnSounds/intel-intrinsics/issues/42)) | Yes (`-mattr=+bmi2`) | Yes | Yes (`-mbmi2`) |
| AVX | Yes but ([#42](https://github.com/AuburnSounds/intel-intrinsics/issues/42)) | Yes (`-mattr=+avx`) | Yes | Yes (`-mavx`) |
| F16C | WIP, ([#42](https://github.com/AuburnSounds/intel-intrinsics/issues/42)) | WIP (`-mattr=+f16c`) | WIP | WIP (`-mf16c`) |
| AVX2 | WIP and ([#42](https://github.com/AuburnSounds/intel-intrinsics/issues/42)) | WIP (`-mattr=+avx2`) | WIP | WIP (`-mavx2`) |

The intrinsics implemented follow the syntax and semantics at: https://software.intel.com/sites/landingpage/IntrinsicsGuide/
Expand Down
91 changes: 89 additions & 2 deletions source/inteli/avxintrin.d
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* AVX intrinsics.
* AVX and FP16C intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
*
* Copyright: Guillaume Piolat 2022.
Expand All @@ -10,13 +10,19 @@
module inteli.avxintrin;

// AVX instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX
// Note: this header will work whether you have AVX enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
// generate AVX instructions.
// With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
// generate AVX instructions.

// This header also implements FP16C intrinsics.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=F16C
// With LDC, use "dflags-ldc": ["-mattr=+f16c"] or equivalent to actively
// generate F16C instructions.
// With GDC, use "dflags-gdc": ["-mf16c"] or equivalent to actively
// generate F16C instructions.

/// IMPORTANT NOTE ABOUT MASK LOAD/STORE:
///
Expand Down Expand Up @@ -4889,3 +4895,84 @@ unittest
long[4] correct = [-1, 99, 0, 0];
assert(R.array == correct);
}


// F16C start here

/// Convert 4 packed half-precision (16-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
/// Note: Only lowest 64-bit of input considered.
/// Preserve infinities, sign of zeroes, and NaN-ness.
__m128 _mm_cvtph_ps(__m128i a) pure @trusted
{
short8 sa = cast(short8)a;

// PERF F16C actual instruction
{
// Reference: stb_image_resize2.h has F16C emulation.
// See:
// Originated from:
__m128i mask_nosign = _mm_set1_epi32(0x7fff);
__m128i smallest_normal = _mm_set1_epi32(0x0400);
__m128i infinity = _mm_set1_epi32(0x7c00);
__m128i expadjust_normal = _mm_set1_epi32((127 - 15) << 23);
__m128i magic_denorm = _mm_set1_epi32(113 << 23);
__m128i i = a;
__m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
__m128i mnosign = mask_nosign;
__m128i eadjust = expadjust_normal;
__m128i smallest = smallest_normal;
__m128i infty = infinity;
__m128i expmant = _mm_and_si128(mnosign, h);
__m128i justsign = _mm_xor_si128(h, expmant);
__m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
__m128i b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
__m128i shifted = _mm_slli_epi32(expmant, 13);
__m128i adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
__m128i adjusted = _mm_add_epi32(eadjust, shifted);
__m128i den1 = _mm_add_epi32(shifted, magic_denorm);
__m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
__m128 den2 = _mm_sub_ps(cast(__m128)den1, *cast(const(__m128)*)&magic_denorm);
__m128 adjusted3 = _mm_and_ps(den2, cast(__m128)b_isdenorm);
__m128 adjusted4 = _mm_andnot_ps(cast(__m128)b_isdenorm, cast(__m128)adjusted2);
__m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4);
__m128i sign = _mm_slli_epi32(justsign, 16);
__m128 final_ = _mm_or_ps(adjusted5, cast(__m128)sign);
return final_;
}
}
unittest
{
__m128i A = _mm_setr_epi16(cast(short)0x8000, 0x7C00, cast(short)0xDA90, 0x5000, 0, 0, 0, 0);
float[4] correct = [-0.0f, float.infinity, -210.0f, 32.0f];
__m128 R = _mm_cvtph_ps(A);
assert(R.array == correct);
}

/// Convert 8 packed half-precision (16-bit) floating-point elements
/// in `a` to packed single-precision (32-bit) floating-point elements.
/// Note: Preserve infinities, sign of zeroes, and NaN-ness.
__m256 _mm256_cvtph_ps(__m128i a) pure @trusted
{
// PERF F16C actual instruction
{
// In stb_image_resize2.h, _mm_cvtph_ps is simply hand-inlined 2x
// so we do the same here.
int4 ihi;
ihi.ptr[0] = a.array[2];
ihi.ptr[1] = a.array[3];
__m128 lo = _mm_cvtph_ps(a);
__m128 hi = _mm_cvtph_ps(ihi);
return _mm256_set_m128(hi, lo);
}
}
unittest
{
__m128i A = _mm_setr_epi16(0, cast(short)-32768, 0, cast(short)0xFC00, 0x7C00, 0x5A90,cast(short)0xDA90, 0x5000);
float[8] correct = [0.0f, -0.0f, 0.0f, -float.infinity, float.infinity, 210.0f, -210.0f, 32.0f];
__m256 R = _mm256_cvtph_ps(A);
assert(R.array == correct);
}

// __m128i _mm_cvtps_ph (__m128 a, int imm8) TODO
// __m128i _mm256_cvtps_ph (__m256 a, int imm8) TODO
102 changes: 5 additions & 97 deletions source/inteli/internals.d
Original file line number Diff line number Diff line change
Expand Up @@ -180,28 +180,10 @@ version(LDC)
enum LDC_with_SSE42 = false;
enum LDC_with_CRC32 = false;
enum LDC_with_AVX = false;
enum LDC_with_F16C = false;
enum LDC_with_AVX2 = false;
enum LDC_with_SHA = false;
enum LDC_with_BMI2 = false;

enum LDC_with_AVX512F = false;
enum LDC_with_AVX512CD = false;
enum LDC_with_AVX512ER = false;
enum LDC_with_AVX512PF = false;
enum LDC_with_AVX512BW = false;
enum LDC_with_AVX512DQ = false;
enum LDC_with_AVX512VL = false;
enum LDC_with_AVX512IFMA = false;
enum LDC_with_AVX512VBMI = false;
enum LDC_with_AVX512VBMI2 = false;

enum LDC_with_AVX512FP16 = false;
enum LDC_with_AVX512BF16 = false;
enum LDC_with_AVX512VNNI = false;
enum LDC_with_AVX512BITALG = false;

enum LDC_with_AVX512VP2INTERSECT = false;
enum LDC_with_AVX512VPOPCNTDQ = false;
}
else version(AArch64)
{
Expand All @@ -217,28 +199,10 @@ version(LDC)
enum LDC_with_SSE42 = false;
enum LDC_with_CRC32 = false;
enum LDC_with_AVX = false;
enum LDC_with_F16C = false;
enum LDC_with_AVX2 = false;
enum LDC_with_SHA = false;
enum LDC_with_BMI2 = false;

enum LDC_with_AVX512F = false;
enum LDC_with_AVX512CD = false;
enum LDC_with_AVX512ER = false;
enum LDC_with_AVX512PF = false;
enum LDC_with_AVX512BW = false;
enum LDC_with_AVX512DQ = false;
enum LDC_with_AVX512VL = false;
enum LDC_with_AVX512IFMA = false;
enum LDC_with_AVX512VBMI = false;
enum LDC_with_AVX512VBMI2 = false;

enum LDC_with_AVX512FP16 = false;
enum LDC_with_AVX512BF16 = false;
enum LDC_with_AVX512VNNI = false;
enum LDC_with_AVX512BITALG = false;

enum LDC_with_AVX512VP2INTERSECT = false;
enum LDC_with_AVX512VPOPCNTDQ = false;
}
else static if (some_x86)
{
Expand Down Expand Up @@ -276,30 +240,10 @@ version(LDC)
}

enum LDC_with_AVX = __traits(targetHasFeature, "avx") && LDC_with_ia32_builtins;
enum LDC_with_F16C = __traits(targetHasFeature, "f16c") && LDC_with_ia32_builtins;
enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2") && LDC_with_ia32_builtins;
enum LDC_with_SHA = __traits(targetHasFeature, "sha") && LDC_with_ia32_builtins;
enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2") && LDC_with_ia32_builtins;

// All of the feature flags at https://github.com/cetio/sim-d/blob/main/source/simd/features.d
// but I haven't added them because I don't feel inclined, these suffice.
enum LDC_with_AVX512F = __traits(targetHasFeature, "avx512f") && LDC_with_ia32_builtins;
enum LDC_with_AVX512CD = __traits(targetHasFeature, "avx512cd") && LDC_with_ia32_builtins;
enum LDC_with_AVX512ER = __traits(targetHasFeature, "avx512er") && LDC_with_ia32_builtins;
enum LDC_with_AVX512PF = __traits(targetHasFeature, "avx512pf") && LDC_with_ia32_builtins;
enum LDC_with_AVX512BW = __traits(targetHasFeature, "avx512bw") && LDC_with_ia32_builtins;
enum LDC_with_AVX512DQ = __traits(targetHasFeature, "avx512dq") && LDC_with_ia32_builtins;
enum LDC_with_AVX512VL = __traits(targetHasFeature, "avx512vl") && LDC_with_ia32_builtins;
enum LDC_with_AVX512IFMA = __traits(targetHasFeature, "avx512ifma") && LDC_with_ia32_builtins;
enum LDC_with_AVX512VBMI = __traits(targetHasFeature, "avx512vbmi") && LDC_with_ia32_builtins;
enum LDC_with_AVX512VBMI2 = __traits(targetHasFeature, "avx512vbmi2") && LDC_with_ia32_builtins;

enum LDC_with_AVX512FP16 = (__VERSION__ > 2101) && __traits(targetHasFeature, "avx512fp16") && LDC_with_ia32_builtins;
enum LDC_with_AVX512BF16 = __traits(targetHasFeature, "avx512bf16") && LDC_with_ia32_builtins;
enum LDC_with_AVX512VNNI = __traits(targetHasFeature, "avx512vnni") && LDC_with_ia32_builtins;
enum LDC_with_AVX512BITALG = __traits(targetHasFeature, "avx512bitalg") && LDC_with_ia32_builtins;

enum LDC_with_AVX512VP2INTERSECT = __traits(targetHasFeature, "avx512vp2intersect") && LDC_with_ia32_builtins;
enum LDC_with_AVX512VPOPCNTDQ = __traits(targetHasFeature, "avx512vpopcntdq") && LDC_with_ia32_builtins;
}
else
{
Expand All @@ -314,28 +258,10 @@ version(LDC)
enum LDC_with_SSE42 = false;
enum LDC_with_CRC32 = false;
enum LDC_with_AVX = false;
enum LDC_with_F16C = false;
enum LDC_with_AVX2 = false;
enum LDC_with_SHA = false;
enum LDC_with_BMI2 = false;

enum LDC_with_AVX512F = false;
enum LDC_with_AVX512CD = false;
enum LDC_with_AVX512ER = false;
enum LDC_with_AVX512PF = false;
enum LDC_with_AVX512BW = false;
enum LDC_with_AVX512DQ = false;
enum LDC_with_AVX512VL = false;
enum LDC_with_AVX512IFMA = false;
enum LDC_with_AVX512VBMI = false;
enum LDC_with_AVX512VBMI2 = false;

enum LDC_with_AVX512FP16 = false;
enum LDC_with_AVX512BF16 = false;
enum LDC_with_AVX512VNNI = false;
enum LDC_with_AVX512BITALG = false;

enum LDC_with_AVX512VP2INTERSECT = false;
enum LDC_with_AVX512VPOPCNTDQ = false;
}

// Should we use inline x86 assembly with DMD syntax, in LDC?
Expand Down Expand Up @@ -368,29 +294,11 @@ else
enum LDC_with_SSE42 = false;
enum LDC_with_CRC32 = false;
enum LDC_with_AVX = false;
enum LDC_with_F16C = false;
enum LDC_with_AVX2 = false;
enum LDC_with_SHA = false;
enum LDC_with_BMI2 = false;

enum LDC_with_AVX512F = false;
enum LDC_with_AVX512CD = false;
enum LDC_with_AVX512ER = false;
enum LDC_with_AVX512PF = false;
enum LDC_with_AVX512BW = false;
enum LDC_with_AVX512DQ = false;
enum LDC_with_AVX512VL = false;
enum LDC_with_AVX512IFMA = false;
enum LDC_with_AVX512VBMI = false;
enum LDC_with_AVX512VBMI2 = false;

enum LDC_with_AVX512FP16 = false;
enum LDC_with_AVX512BF16 = false;
enum LDC_with_AVX512VNNI = false;
enum LDC_with_AVX512BITALG = false;

enum LDC_with_AVX512VP2INTERSECT = false;
enum LDC_with_AVX512VPOPCNTDQ = false;

enum LDC_with_InlineIREx = false;
enum bool LDC_with_optimizations = false;
enum bool LDC_with_32b_x86_asm = false;
Expand Down

0 comments on commit a9cb0e6

Please sign in to comment.