refactor(core): use avx512 intrinsics when available for data convers…

…ions - we use inline assembly for now as rust does not propose those in the std or core arch crates at the moment
zama-ai · Oct 3, 2023 · b0fa05c · b0fa05c
1 parent fad066a
commit b0fa05c
Showing 1 changed file with 64 additions and 69 deletions.
diff --git a/tfhe/src/core_crypto/fft_impl/fft64/math/fft/x86.rs b/tfhe/src/core_crypto/fft_impl/fft64/math/fft/x86.rs
@@ -88,66 +88,46 @@ pub fn mm256_cvtpd_epi64(simd: V3, x: __m256d) -> __m256i {
 }
 
 /// Convert a vector of f64 values to a vector of i64 values.
-/// See `f64_to_i64_bit_twiddles` in `fft/tests.rs` for the scalar version.
+/// This intrinsics is currently not available in rust, so we have our own implementation using
+/// inline assembly.
+///
+/// The name matches Intel's convention (re-used by rust in their intrinsics) without the leading
+/// `_`.
+///
+/// [`Intel's documentation`](`https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtpd_epi64`)
 #[cfg(feature = "nightly-avx512")]
 #[inline(always)]
 pub fn mm512_cvtpd_epi64(simd: V4, x: __m512d) -> __m512i {
-    let avx = simd.avx512f;
-
-    // reinterpret the bits as u64 values
-    let bits = avx._mm512_castpd_si512(x);
-    // mask that covers the first 52 bits
-    let mantissa_mask = avx._mm512_set1_epi64(0xFFFFFFFFFFFFF_u64 as i64);
-    // mask that covers the 53rd bit
-    let explicit_mantissa_bit = avx._mm512_set1_epi64(0x10000000000000_u64 as i64);
-    // mask that covers the first 11 bits
-    let exp_mask = avx._mm512_set1_epi64(0x7FF_u64 as i64);
-
-    // extract the first 52 bits and add the implicit bit
-    let mantissa = avx._mm512_or_si512(
-        avx._mm512_and_si512(bits, mantissa_mask),
-        explicit_mantissa_bit,
-    );
-
-    // extract the 52nd to 63rd (excluded) bits for the biased exponent
-    let biased_exp = avx._mm512_and_si512(avx._mm512_srli_epi64::<52>(bits), exp_mask);
-
-    // extract the 63rd sign bit
-    let sign_is_negative_mask =
-        avx._mm512_cmpeq_epi64_mask(avx._mm512_srli_epi64::<63>(bits), avx._mm512_set1_epi64(1));
-
-    // we need to shift the mantissa by some value that may be negative, so we first shift it to
-    // the left by the maximum amount, then shift it to the right by our value plus the offset we
-    // just shifted by
-    //
-    // the 53rd bit is set to 1, so we shift to the left by 10 so the 63rd (last) bit is set.
-    let mantissa_lshift = avx._mm512_slli_epi64::<11>(mantissa);
-
-    // shift to the right and apply the exponent bias
-    let mantissa_shift = avx._mm512_srlv_epi64(
-        mantissa_lshift,
-        avx._mm512_sub_epi64(avx._mm512_set1_epi64(1086), biased_exp),
-    );
-
-    // if the sign bit is unset, we keep our result
-    let value_if_positive = mantissa_shift;
-    // otherwise, we negate it
-    let value_if_negative = avx._mm512_sub_epi64(avx._mm512_setzero_si512(), value_if_positive);
-
-    // if the biased exponent is all zeros, we have a subnormal value (or zero)
-
-    // if it is not subnormal, we keep our results
-    let value_if_non_subnormal =
-        avx._mm512_mask_blend_epi64(sign_is_negative_mask, value_if_positive, value_if_negative);
-
-    // if it is subnormal, the conversion to i64 (rounding towards zero) returns zero
-    let value_if_subnormal = avx._mm512_setzero_si512();
-
-    // compare the biased exponent to a zero value
-    let is_subnormal = avx._mm512_cmpeq_epi64_mask(biased_exp, avx._mm512_setzero_si512());
+    #[target_feature(enable = "avx512dq")]
+    #[inline]
+    unsafe fn implementation(x: __m512d) -> __m512i {
+        let mut as_i64x8: __m512i;
+
+        // From Intel's documentation the syntax to use this intrinsics is
+        // Instruction: vcvtpd2qq zmm, zmm
+        // With Intel syntax, left operand is the destination, right operand is the source
+        // For the asm! macro
+        // in: indicates an input register
+        // out: indicates an output register
+        // zmm_reg: the avx512 register type
+        // options: see https://doc.rust-lang.org/nightly/reference/inline-assembly.html#options
+        // pure: no side effect
+        // nomem: does not reference RAM (only registers)
+        // nostrack: does not alter the state of the stack
+        core::arch::asm!(
+            "vcvtpd2qq {dst}, {src}",
+            src = in(zmm_reg) x,
+            dst = out(zmm_reg) as_i64x8,
+            options(pure, nomem, nostack)
+        );
+
+        as_i64x8
+    }
+    let _ = simd.avx512dq;
 
-    // choose the result depending on subnormalness
-    avx._mm512_mask_blend_epi64(is_subnormal, value_if_non_subnormal, value_if_subnormal)
+    // SAFETY: simd contains an instance of avx512dq, that matches the target feature of
+    // `implementation`
+    unsafe { implementation(x) }
 }
 
 /// Convert a vector of i64 values to a vector of f64 values. Not sure how it works.
@@ -177,25 +157,40 @@ pub fn mm256_cvtepi64_pd(simd: V3, x: __m256i) -> __m256d {
 }
 
 /// Convert a vector of i64 values to a vector of f64 values.
+/// This intrinsics is currently not available in rust, so we have our own implementation using
+/// inline assembly.
+///
+/// The name matches Intel's convention (re-used by rust in their intrinsics) without the leading
+/// `_`.
+///
+/// [`Intel's documentation`](`https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtepi64_pd`)
 #[cfg(feature = "nightly-avx512")]
 #[inline(always)]
 pub fn mm512_cvtepi64_pd(simd: V4, x: __m512i) -> __m512d {
     #[target_feature(enable = "avx512dq")]
     #[inline]
     unsafe fn implementation(x: __m512i) -> __m512d {
-        // hopefully this compiles to vcvtqq2pd
-        let i64x8: [i64; 8] = core::mem::transmute(x);
-        let as_f64x8 = [
-            i64x8[0] as f64,
-            i64x8[1] as f64,
-            i64x8[2] as f64,
-            i64x8[3] as f64,
-            i64x8[4] as f64,
-            i64x8[5] as f64,
-            i64x8[6] as f64,
-            i64x8[7] as f64,
-        ];
-        core::mem::transmute(as_f64x8)
+        let mut as_f64x8: __m512d;
+
+        // From Intel's documentation the syntax to use this intrinsics is
+        // Instruction: vcvtqq2pd zmm, zmm
+        // With Intel syntax, left operand is the destination, right operand is the source
+        // For the asm! macro
+        // in: indicates an input register
+        // out: indicates an output register
+        // zmm_reg: the avx512 register type
+        // options: see https://doc.rust-lang.org/nightly/reference/inline-assembly.html#options
+        // pure: no side effect
+        // nomem: does not reference RAM (only registers)
+        // nostrack: does not alter the state of the stack
+        core::arch::asm!(
+            "vcvtqq2pd {dst}, {src}",
+            src = in(zmm_reg) x,
+            dst = out(zmm_reg) as_f64x8,
+            options(pure, nomem, nostack)
+        );
+
+        as_f64x8
     }
     let _ = simd.avx512dq;