Implement I-quants #67

Open · wants to merge 30 commits into main from iq_quants

Commits (30)
bcbbf14  Sketch iq quants? (EricLBuehler, Nov 15, 2024)
3b5dd2a  Metal qmatmul mat-mat product (#39) (EricLBuehler, Jan 12, 2025)
3168dc2  Initial impl (EricLBuehler, Jan 12, 2025)
0c55e36  Work (EricLBuehler, Jan 13, 2025)
5f76314  Merge branch 'main' into iq_quants (EricLBuehler, Jan 17, 2025)
533e2d4  Debugging (EricLBuehler, Jan 17, 2025)
ac9f7c7  Remove comments (EricLBuehler, Jan 18, 2025)
9c23e27  Add cpu impl for isq4x (EricLBuehler, Feb 3, 2025)
cabf37f  Implement neon vec dot (EricLBuehler, Feb 3, 2025)
0dce868  Small optimization (EricLBuehler, Feb 3, 2025)
8f111f7  Clippy (EricLBuehler, Feb 3, 2025)
a84b269  Merge branch 'main' into iq_quants (EricLBuehler, Feb 3, 2025)
dbe13c2  Revert mask in sdpa vector kernels (EricLBuehler, Feb 3, 2025)
8b83017  Merge branch 'main' into iq_quants (EricLBuehler, Feb 3, 2025)
9e0f120  Update check_shape (EricLBuehler, Feb 3, 2025)
7ff6e09  Update check_shape (EricLBuehler, Feb 4, 2025)
e4efab3  Support qtensor_from_ggml (EricLBuehler, Feb 4, 2025)
1baec7c  [AArch64] Quantized MatMul performance improvement on Arm CPUs (bgergely0, Oct 27, 2023)
25cbfca  Add metal support (EricLBuehler, Feb 6, 2025)
8456349  debug (EricLBuehler, Feb 6, 2025)
3a94e5e  debug (EricLBuehler, Feb 6, 2025)
a79581c  debug (EricLBuehler, Feb 6, 2025)
e4d5edc  Fix kernel name for mv (EricLBuehler, Feb 6, 2025)
e9c139d  Add metal, neon iq4nl (EricLBuehler, Feb 6, 2025)
f5cca34  Fix missing handling in from_u32 (EricLBuehler, Feb 6, 2025)
9db5390  Sketch iq3xxs (EricLBuehler, Feb 6, 2025)
d3909ae  Some fixes (EricLBuehler, Feb 7, 2025)
0f862b2  Add f8q8 (EricLBuehler, Feb 7, 2025)
9557e23  Fix f8q8 dequant (EricLBuehler, Feb 8, 2025)
48804a5  Fix i8mm (EricLBuehler, Feb 22, 2025)
2 changes: 2 additions & 0 deletions candle-core/Cargo.toml
@@ -20,6 +20,7 @@ gemm = { workspace = true }
half = { workspace = true }
float8 = { workspace = true }
intel-mkl-src = { workspace = true, optional = true }
itertools = "0.12.1"
libc = { workspace = true, optional = true }
memmap2 = { workspace = true }
num-traits = { workspace = true }
@@ -46,6 +47,7 @@ nccl = ["cuda", "cudarc/nccl"]
mkl = ["dep:libc", "dep:intel-mkl-src"]
accelerate = ["dep:libc", "dep:accelerate-src"]
metal = ["dep:metal", "dep:candle-metal-kernels"]
arm-nightly-feat = []

[[bench]]
name = "bench_main"
5 changes: 5 additions & 0 deletions candle-core/src/lib.rs
@@ -47,6 +47,11 @@
//! - [candle-transformers](https://docs.rs/candle-transformers/). Candle implementation of many published transformer models.
//!

#![cfg_attr(feature = "arm-nightly-feat", feature(stdarch_neon_dotprod))]
#![cfg_attr(feature = "arm-nightly-feat", feature(array_chunks))]
#![cfg_attr(feature = "arm-nightly-feat", feature(stdarch_neon_i8mm))]
#![cfg_attr(feature = "arm-nightly-feat", feature(portable_simd))]

#[cfg(feature = "accelerate")]
mod accelerate;
pub mod backend;
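These cfg_attr lines opt into nightly-only compiler features (the NEON dot-product and i8mm intrinsics, array_chunks, and portable_simd) only when the new arm-nightly-feat cargo feature is enabled, so stable builds are unaffected. A minimal sketch of how a kernel might gate on the same feature; the module and function names here are hypothetical, not from this PR:

#[cfg(all(feature = "arm-nightly-feat", target_arch = "aarch64"))]
mod neon_dot {
    use std::arch::aarch64::*;

    /// Sums the 16 pairwise i8 products of `a` and `b` using the SDOT
    /// instruction exposed by the nightly `stdarch_neon_dotprod` feature.
    pub unsafe fn dot_i8x16(a: int8x16_t, b: int8x16_t) -> i32 {
        vaddvq_s32(vdotq_s32(vdupq_n_s32(0), a, b))
    }
}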
22 changes: 21 additions & 1 deletion candle-core/src/quantized/avx.rs
@@ -1,5 +1,6 @@
use super::k_quants::{
BlockQ2K, BlockQ3K, BlockQ4K, BlockQ4_0, BlockQ5K, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K,
BlockF8Q8, BlockQ2K, BlockQ3K, BlockQ4K, BlockQ4_0, BlockQ5K, BlockQ6K, BlockQ8K, BlockQ8_0,
QK8_0, QK_K,
};
use crate::Result;
use byteorder::{ByteOrder, LittleEndian};
@@ -87,6 +88,25 @@ pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) ->
}
}

#[inline(always)]
pub(crate) fn vec_dot_f8q8_q8_0(n: usize, xs: &[BlockF8Q8], ys: &[BlockQ8_0]) -> Result<f32> {
let qk = QK8_0;
if n % QK8_0 != 0 {
crate::bail!("vec_dot_f8q8_q8_0: {n} is not divisible by {qk}")
}
unsafe {
let mut acc = _mm256_setzero_ps();
for (x, y) in xs.iter().zip(ys.iter()) {
let d = _mm256_set1_ps(x.dq_d() * f16::to_f32(y.d));
let bx = _mm256_loadu_si256(x.qs.as_ptr() as *const __m256i);
let by = _mm256_loadu_si256(y.qs.as_ptr() as *const __m256i);
let q = mul_sum_i8_pairs_float(bx, by);
acc = _mm256_fmadd_ps(d, q, acc);
}
Ok(hsum_float_8(acc))
}
}

#[inline(always)]
unsafe fn get_scale_shuffle(i: usize) -> __m128i {
const K_SHUFFLE: [u8; 128] = [
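Per block, this AVX path computes the integer dot product of the two 32-quant vectors and scales it by the product of the per-block scales. A scalar reference sketch of the same computation, assuming (as the diff suggests) that BlockF8Q8 stores its quants in qs and exposes its dequantized f8 scale via dq_d():

// Hedged scalar equivalent of vec_dot_f8q8_q8_0, for illustration only.
fn vec_dot_f8q8_q8_0_scalar(xs: &[BlockF8Q8], ys: &[BlockQ8_0]) -> f32 {
    let mut acc = 0f32;
    for (x, y) in xs.iter().zip(ys.iter()) {
        // Integer dot product over the QK8_0 = 32 quants of this block.
        let isum: i32 = x
            .qs
            .iter()
            .zip(y.qs.iter())
            .map(|(&a, &b)| a as i32 * b as i32)
            .sum();
        // Scale by both per-block scales, mirroring the _mm256_fmadd_ps above.
        acc += x.dq_d() * half::f16::to_f32(y.d) * isum as f32;
    }
    acc
}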
2 changes: 1 addition & 1 deletion candle-core/src/quantized/cuda.rs
@@ -1,5 +1,5 @@
use super::{GgmlDType, QStorage};
use crate::quantized::k_quants::GgmlType;
use crate::quantized::quants::GgmlType;
use crate::{backend::BackendDevice, cuda_backend::WrapErr};
use crate::{CudaDevice, CudaStorage, Result};
use half::f16;
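The only change here is the import path: GgmlType now comes from a shared quants module rather than k_quants, presumably so the k-quant and the new i-quant block types can implement one common trait. For orientation, the trait's shape in candle is roughly the following (paraphrased from the upstream crate; details may differ in this branch):

pub trait GgmlType: Sized + Clone + Send + Sync {
    const DTYPE: GgmlDType;
    const BLCK_SIZE: usize;
    type VecDotType: GgmlType;

    fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()>;
    fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()>;
    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32>;
}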
5 changes: 4 additions & 1 deletion candle-core/src/quantized/ggml_file.rs
@@ -1,6 +1,6 @@
//! Support for the GGML file format.

use super::{k_quants, GgmlDType, QStorage};
use super::{iq_quants, k_quants, GgmlDType, QStorage};
use crate::{Device, Result};
use byteorder::{LittleEndian, ReadBytesExt};
use std::collections::HashMap;
@@ -184,6 +184,9 @@ pub fn qtensor_from_ggml(
GgmlDType::Q6K => {
from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Iq4Xs => {
from_raw_data::<iq_quants::BlockIQ4xs>(raw_data, size_in_bytes, dims, device)
}
_ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
}
}
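qtensor_from_ggml dispatches on the GGML dtype and reinterprets the raw tensor bytes as a slice of fixed-size quantized blocks; the new arm simply routes Iq4Xs through the same generic helper. A hedged sketch of what that helper plausibly does, simplified to the CPU case (the real function also handles other devices):

fn from_raw_data<T: GgmlType + Send + Sync + 'static>(
    raw_data: &[u8],
    size_in_bytes: usize,
    dims: Vec<usize>,
    _device: &Device, // device dispatch elided in this sketch
) -> Result<QTensor> {
    let n_blocks = size_in_bytes / std::mem::size_of::<T>();
    // SAFETY (sketch): assumes the buffer is aligned for T and holds exactly
    // n_blocks blocks, as promised by the GGML header that precedes it.
    let data =
        unsafe { std::slice::from_raw_parts(raw_data.as_ptr() as *const T, n_blocks) };
    QTensor::new(QStorage::Cpu(Box::new(data.to_vec())), dims)
}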