Implement I-quants #67

Open · wants to merge 30 commits into main from iq_quants

Commits (30)
bcbbf14  Sketch iq quants? (EricLBuehler, Nov 15, 2024)
3b5dd2a  Metal qmatmul mat-mat product (#39) (EricLBuehler, Jan 12, 2025)
3168dc2  Initial impl (EricLBuehler, Jan 12, 2025)
0c55e36  Work (EricLBuehler, Jan 13, 2025)
5f76314  Merge branch 'main' into iq_quants (EricLBuehler, Jan 17, 2025)
533e2d4  Debugging (EricLBuehler, Jan 17, 2025)
ac9f7c7  Remove comments (EricLBuehler, Jan 18, 2025)
9c23e27  Add cpu impl for isq4x (EricLBuehler, Feb 3, 2025)
cabf37f  Implement neon vec dot (EricLBuehler, Feb 3, 2025)
0dce868  Small optimization (EricLBuehler, Feb 3, 2025)
8f111f7  Clippy (EricLBuehler, Feb 3, 2025)
a84b269  Merge branch 'main' into iq_quants (EricLBuehler, Feb 3, 2025)
dbe13c2  Revert mask in sdpa vector kernels (EricLBuehler, Feb 3, 2025)
8b83017  Merge branch 'main' into iq_quants (EricLBuehler, Feb 3, 2025)
9e0f120  Update check_shape (EricLBuehler, Feb 3, 2025)
7ff6e09  Update check_shape (EricLBuehler, Feb 4, 2025)
e4efab3  Support qtensor_from_ggml (EricLBuehler, Feb 4, 2025)
1baec7c  [AArch64] Quantized MatMul performance improvement on Arm CPUs (bgergely0, Oct 27, 2023)
25cbfca  Add metal support (EricLBuehler, Feb 6, 2025)
8456349  debug (EricLBuehler, Feb 6, 2025)
3a94e5e  debug (EricLBuehler, Feb 6, 2025)
a79581c  debug (EricLBuehler, Feb 6, 2025)
e4d5edc  Fix kernel name for mv (EricLBuehler, Feb 6, 2025)
e9c139d  Add metal, neon iq4nl (EricLBuehler, Feb 6, 2025)
f5cca34  Fix missing handling in from_u32 (EricLBuehler, Feb 6, 2025)
9db5390  Sketch iq3xxs (EricLBuehler, Feb 6, 2025)
d3909ae  Some fixes (EricLBuehler, Feb 7, 2025)
0f862b2  Add f8q8 (EricLBuehler, Feb 7, 2025)
9557e23  Fix f8q8 dequant (EricLBuehler, Feb 8, 2025)
48804a5  Fix i8mm (EricLBuehler, Feb 22, 2025)
2 changes: 2 additions & 0 deletions candle-core/Cargo.toml
@@ -20,6 +20,7 @@ gemm = { workspace = true }
half = { workspace = true }
float8 = { workspace = true }
intel-mkl-src = { workspace = true, optional = true }
itertools = "0.12.1"
libc = { workspace = true, optional = true }
memmap2 = { workspace = true }
num-traits = { workspace = true }
@@ -46,6 +47,7 @@ nccl = ["cuda", "cudarc/nccl"]
mkl = ["dep:libc", "dep:intel-mkl-src"]
accelerate = ["dep:libc", "dep:accelerate-src"]
metal = ["dep:metal", "dep:candle-metal-kernels"]
arm-nightly-feat = []

[[bench]]
name = "bench_main"
5 changes: 5 additions & 0 deletions candle-core/src/lib.rs
@@ -47,6 +47,11 @@
//! - [candle-transformers](https://docs.rs/candle-transformers/). Candle implementation of many published transformer models.
//!

#![cfg_attr(feature = "arm-nightly-feat", feature(stdarch_neon_dotprod))]
#![cfg_attr(feature = "arm-nightly-feat", feature(array_chunks))]
#![cfg_attr(feature = "arm-nightly-feat", feature(stdarch_neon_i8mm))]
#![cfg_attr(feature = "arm-nightly-feat", feature(portable_simd))]

#[cfg(feature = "accelerate")]
mod accelerate;
pub mod backend;
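These cfg_attr lines opt into nightly-only compiler features (the NEON dot-product and i8mm intrinsics, array_chunks, and portable_simd) only when the new arm-nightly-feat cargo feature is enabled, so stable builds are unaffected. A minimal sketch of how a kernel might gate on the same feature; the module and function names here are hypothetical, not from this PR:

#[cfg(all(feature = "arm-nightly-feat", target_arch = "aarch64"))]
mod neon_dot {
    use std::arch::aarch64::*;

    /// Sums the 16 pairwise i8 products of `a` and `b` using the SDOT
    /// instruction exposed by the nightly `stdarch_neon_dotprod` feature.
    pub unsafe fn dot_i8x16(a: int8x16_t, b: int8x16_t) -> i32 {
        vaddvq_s32(vdotq_s32(vdupq_n_s32(0), a, b))
    }
}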
22 changes: 21 additions & 1 deletion candle-core/src/quantized/avx.rs
@@ -1,5 +1,6 @@
use super::k_quants::{
BlockQ2K, BlockQ3K, BlockQ4K, BlockQ4_0, BlockQ5K, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K,
BlockF8Q8, BlockQ2K, BlockQ3K, BlockQ4K, BlockQ4_0, BlockQ5K, BlockQ6K, BlockQ8K, BlockQ8_0,
QK8_0, QK_K,
};
use crate::Result;
use byteorder::{ByteOrder, LittleEndian};
@@ -87,6 +88,25 @@ pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) ->
}
}

#[inline(always)]
pub(crate) fn vec_dot_f8q8_q8_0(n: usize, xs: &[BlockF8Q8], ys: &[BlockQ8_0]) -> Result<f32> {
let qk = QK8_0;
if n % QK8_0 != 0 {
crate::bail!("vec_dot_f8q8_q8_0: {n} is not divisible by {qk}")
}
unsafe {
let mut acc = _mm256_setzero_ps();
for (x, y) in xs.iter().zip(ys.iter()) {
let d = _mm256_set1_ps(x.dq_d() * f16::to_f32(y.d));
let bx = _mm256_loadu_si256(x.qs.as_ptr() as *const __m256i);
let by = _mm256_loadu_si256(y.qs.as_ptr() as *const __m256i);
let q = mul_sum_i8_pairs_float(bx, by);
acc = _mm256_fmadd_ps(d, q, acc);
}
Ok(hsum_float_8(acc))
}
}

#[inline(always)]
unsafe fn get_scale_shuffle(i: usize) -> __m128i {
const K_SHUFFLE: [u8; 128] = [
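Per block, this AVX path computes the integer dot product of the two 32-quant vectors and scales it by the product of the per-block scales. A scalar reference sketch of the same computation, assuming (as the diff suggests) that BlockF8Q8 stores its quants in qs and exposes its dequantized f8 scale via dq_d():

// Hedged scalar equivalent of vec_dot_f8q8_q8_0, for illustration only.
fn vec_dot_f8q8_q8_0_scalar(xs: &[BlockF8Q8], ys: &[BlockQ8_0]) -> f32 {
    let mut acc = 0f32;
    for (x, y) in xs.iter().zip(ys.iter()) {
        // Integer dot product over the QK8_0 = 32 quants of this block.
        let isum: i32 = x
            .qs
            .iter()
            .zip(y.qs.iter())
            .map(|(&a, &b)| a as i32 * b as i32)
            .sum();
        // Scale by both per-block scales, mirroring the _mm256_fmadd_ps above.
        acc += x.dq_d() * half::f16::to_f32(y.d) * isum as f32;
    }
    acc
}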
2 changes: 1 addition & 1 deletion candle-core/src/quantized/cuda.rs
@@ -1,5 +1,5 @@
use super::{GgmlDType, QStorage};
use crate::quantized::k_quants::GgmlType;
use crate::quantized::quants::GgmlType;
use crate::{backend::BackendDevice, cuda_backend::WrapErr};
use crate::{CudaDevice, CudaStorage, Result};
use half::f16;
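The only change here is the import path: GgmlType now comes from a shared quants module rather than k_quants, presumably so the k-quant and the new i-quant block types can implement one common trait. For orientation, the trait's shape in candle is roughly the following (paraphrased from the upstream crate; details may differ in this branch):

pub trait GgmlType: Sized + Clone + Send + Sync {
    const DTYPE: GgmlDType;
    const BLCK_SIZE: usize;
    type VecDotType: GgmlType;

    fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()>;
    fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()>;
    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32>;
}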
5 changes: 4 additions & 1 deletion candle-core/src/quantized/ggml_file.rs
@@ -1,6 +1,6 @@
//! Support for the GGML file format.

use super::{k_quants, GgmlDType, QStorage};
use super::{iq_quants, k_quants, GgmlDType, QStorage};
use crate::{Device, Result};
use byteorder::{LittleEndian, ReadBytesExt};
use std::collections::HashMap;
@@ -184,6 +184,9 @@ pub fn qtensor_from_ggml(
GgmlDType::Q6K => {
from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Iq4Xs => {
from_raw_data::<iq_quants::BlockIQ4xs>(raw_data, size_in_bytes, dims, device)
}
_ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
}
}
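qtensor_from_ggml dispatches on the GGML dtype and reinterprets the raw tensor bytes as a slice of fixed-size quantized blocks; the new arm simply routes Iq4Xs through the same generic helper. A hedged sketch of what that helper plausibly does, simplified to the CPU case (the real function also handles other devices):

fn from_raw_data<T: GgmlType + Send + Sync + 'static>(
    raw_data: &[u8],
    size_in_bytes: usize,
    dims: Vec<usize>,
    _device: &Device, // device dispatch elided in this sketch
) -> Result<QTensor> {
    let n_blocks = size_in_bytes / std::mem::size_of::<T>();
    // SAFETY (sketch): assumes the buffer is aligned for T and holds exactly
    // n_blocks blocks, as promised by the GGML header that precedes it.
    let data =
        unsafe { std::slice::from_raw_parts(raw_data.as_ptr() as *const T, n_blocks) };
    QTensor::new(QStorage::Cpu(Box::new(data.to_vec())), dims)
}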