From 242e5a7da920ada78c6866ac7c540504a6941a36 Mon Sep 17 00:00:00 2001
From: marcus
Date: Sat, 3 Feb 2024 13:40:12 -0800
Subject: [PATCH 1/6] attempt to add metal on mac

---
 llama-cpp-sys-2/Cargo.toml |  1 +
 llama-cpp-sys-2/build.rs   | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml
index 2acbe5ac..5e8dd230 100644
--- a/llama-cpp-sys-2/Cargo.toml
+++ b/llama-cpp-sys-2/Cargo.toml
@@ -43,3 +43,4 @@ cc = { workspace = true }
 
 [features]
 cublas = []
+
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 720bc4cb..25d178bb 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -51,7 +51,28 @@ fn main() {
 
     // https://github.com/ggerganov/llama.cpp/blob/191221178f51b6e81122c5bda0fd79620e547d07/Makefile#L133-L141
     if cfg!(target_os = "macos") {
+        assert!(!cublas_enabled, "CUBLAS is not supported on macOS");
+
         llama_cpp.define("_DARWIN_C_SOURCE", None);
+
+        // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L340-L343
+        llama_cpp.define("GGML_USE_METAL", None);
+        llama_cpp.define("GGML_USE_ACCELERATE", None);
+        llama_cpp.define("ACCELERATE_NEW_LAPACK", None);
+        llama_cpp.define("ACCELERATE_LAPACK_ILP64", None);
+        println!("cargo:rustc-link-lib=framework=Accelerate");
+
+        // MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
+        // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L509-L511
+        println!("cargo:rustc-link-lib=framework Foundation");
+        println!("cargo:rustc-link-lib=framework Metal");
+        println!("cargo:rustc-link-lib=framework MetalKit");
+
+
+        // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L517-L520
+        ggml
+            .file("llama.cpp/ggml-metal.m")
+            .file("llama.cpp/ggml-metal.h");
     }
     if cfg!(target_os = "dragonfly") {
         llama_cpp.define("__BSD_VISIBLE", None);

From 8a73403571626c99ceed73fc339abdc478cd7e1f Mon Sep 17 00:00:00 2001
From: Marcus Dunn
Date: Sun, 4 Feb 2024 20:31:55 -0800
Subject: [PATCH 2/6] failing to link due to "mach-o"

---
 llama-cpp-2/Cargo.toml              |  4 ++
 llama-cpp-2/benches/generate.rs     | 61 ++++++++++++++++++++++++++
 llama-cpp-2/benches/grammar_bias.rs |  2 +-
 llama-cpp-sys-2/build.rs            | 68 +++++++++++++++++++++++++----
 4 files changed, 125 insertions(+), 10 deletions(-)
 create mode 100644 llama-cpp-2/benches/generate.rs

diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml
index c2605a43..eedceb16 100644
--- a/llama-cpp-2/Cargo.toml
+++ b/llama-cpp-2/Cargo.toml
@@ -26,6 +26,10 @@ anyhow = "1.0.79"
 name = "grammar_bias"
 harness = false
 
+[[bench]]
+name = "generate"
+harness = false
+
 [features]
 cublas = ["llama-cpp-sys-2/cublas"]
 
diff --git a/llama-cpp-2/benches/generate.rs b/llama-cpp-2/benches/generate.rs
new file mode 100644
index 00000000..c400d102
--- /dev/null
+++ b/llama-cpp-2/benches/generate.rs
@@ -0,0 +1,61 @@
+use anyhow::Context;
+use criterion::{Criterion, criterion_group, criterion_main};
+use pprof::criterion::{Output, PProfProfiler};
+use llama_cpp_2::context::params::LlamaContextParams;
+use llama_cpp_2::llama_backend::LlamaBackend;
+use llama_cpp_2::llama_batch::LlamaBatch;
+use llama_cpp_2::model::{AddBos, LlamaModel};
+use llama_cpp_2::model::params::LlamaModelParams;
+use llama_cpp_2::token::data_array::LlamaTokenDataArray;
+
+fn generate(c: &mut Criterion) {
+    let api = hf_hub::api::sync::ApiBuilder::new()
+        .with_progress(true)
+        .build()
+        .unwrap();
+    let file = api
.model("TheBloke/Llama-2-7B-Chat-GGUF".to_string()) + .get("llama-2-7b-chat.Q4_K_M.gguf") + .unwrap(); + let backend = LlamaBackend::init().unwrap(); + let model_params = LlamaModelParams::default(); + let model = LlamaModel::load_from_file(&backend, &file, &model_params).unwrap(); + let mut ctx = model + .new_context(&backend, LlamaContextParams::default()) + .unwrap(); + + c.bench_function("generate 50 tokens", |b| { + b.iter(|| { + let tokens_list = model.str_to_token("Hello, my name is", AddBos::Always).unwrap(); + let mut n_ctx = tokens_list.len() as i32; + let mut batch = LlamaBatch::new(512, 1); + let last_index: i32 = (tokens_list.len() - 1) as i32; + for (i, token) in (0_i32..).zip(tokens_list.into_iter()) { + let is_last = i == last_index; + batch.add(token, i, &[0], is_last).unwrap(); + } + ctx.decode(&mut batch).unwrap(); + + for _ in 0..50 { + let candidates = ctx.candidates_ith(batch.n_tokens() - 1); + let candidates_p = LlamaTokenDataArray::from_iter(candidates, false); + let new_token_id = ctx.sample_token_greedy(candidates_p); + if new_token_id == model.token_eos() { + break; + } + batch.clear(); + batch.add(new_token_id, n_ctx, &[0], true).unwrap(); + n_ctx += 1; + ctx.decode(&mut batch).unwrap(); + } + ctx.clear_kv_cache_seq(0, None, None) + }); + }); +} + +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = generate +); +criterion_main!(benches); \ No newline at end of file diff --git a/llama-cpp-2/benches/grammar_bias.rs b/llama-cpp-2/benches/grammar_bias.rs index 23681ab0..25fd90df 100644 --- a/llama-cpp-2/benches/grammar_bias.rs +++ b/llama-cpp-2/benches/grammar_bias.rs @@ -32,7 +32,7 @@ fn criterion_benchmark(c: &mut Criterion) { let model_params = LlamaModelParams::default(); let model = LlamaModel::load_from_file(&backend, &file, &model_params).unwrap(); let mut ctx = model - .new_context(&backend, &LlamaContextParams::default()) + .new_context(&backend, LlamaContextParams::default()) .unwrap(); let grammar = LlamaGrammar::from_str(include_str!("../src/grammar/json.gbnf")).unwrap(); diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 25d178bb..20f9b17a 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -13,6 +13,7 @@ fn main() { let mut ggml = cc::Build::new(); let mut ggml_cuda = if cublas_enabled { Some(cc::Build::new()) } else { None }; + let mut ggml_metal= if cfg!(target_os = "macos") { Some(cc::Build::new()) } else { None }; let mut llama_cpp = cc::Build::new(); ggml.cpp(false); @@ -60,20 +61,22 @@ fn main() { llama_cpp.define("GGML_USE_ACCELERATE", None); llama_cpp.define("ACCELERATE_NEW_LAPACK", None); llama_cpp.define("ACCELERATE_LAPACK_ILP64", None); - println!("cargo:rustc-link-lib=framework=Accelerate"); + println!("cargo:rustc-link-arg=framework=Accelerate"); // MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L509-L511 - println!("cargo:rustc-link-lib=framework Foundation"); - println!("cargo:rustc-link-lib=framework Metal"); - println!("cargo:rustc-link-lib=framework MetalKit"); - + println!("cargo:rustc-link-arg=framework=Foundation"); + println!("cargo:rustc-link-arg=framework=Metal"); + println!("cargo:rustc-link-arg=framework=MetalKit"); + } - // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L517-L520 - ggml - .file("llama.cpp/ggml-metal.m") - 
.file("llama.cpp/ggml-metal.h"); + if let Some(ggml_metal) = &mut ggml_metal { + metal_hack(ggml_metal); + ggml_metal + .file("llama.cpp/ggml-metal") + .include("llama.cpp"); } + if cfg!(target_os = "dragonfly") { llama_cpp.define("__BSD_VISIBLE", None); } @@ -83,6 +86,12 @@ fn main() { ggml_cuda.compile("ggml-cuda"); } + + if let Some(ggml_metal) = ggml_metal { + println!("compiling ggml-metal"); + ggml_metal.compile("ggml-metal") + } + if cfg!(target_os = "linux") { ggml.define("_GNU_SOURCE", None); } @@ -97,6 +106,7 @@ fn main() { llama_cpp .define("_XOPEN_SOURCE", Some("600")) + .include("llama.cpp") .std("c++17") .file("llama.cpp/llama.cpp"); @@ -124,3 +134,43 @@ fn main() { .write_to_file(out_path.join("bindings.rs")) .expect("failed to write bindings to file"); } + + +// courtesy of https://github.com/rustformers/llm +fn metal_hack(build: &mut cc::Build) { + const GGML_METAL_METAL_PATH: &str = "llama.cpp/ggml-metal.metal"; + const GGML_METAL_PATH: &str = "llama.cpp/ggml-metal.m"; + + let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is not defined")); + + let ggml_metal_path = { + let ggml_metal_metal = std::fs::read_to_string(GGML_METAL_METAL_PATH) + .expect("Could not read ggml-metal.metal") + .replace('\\', "\\\\") + .replace('\n', "\\n") + .replace('\r', "\\r") + .replace('\"', "\\\""); + + let ggml_metal = + std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); + + let needle = r#"NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];"#; + if !ggml_metal.contains(needle) { + panic!("ggml-metal.m does not contain the needle to be replaced; the patching logic needs to be reinvestigated. Contact a `llama-cpp-sys-2` developer!"); + } + + // Replace the runtime read of the file with a compile-time string + let ggml_metal = ggml_metal.replace( + needle, + &format!(r#"NSString * src = @"{ggml_metal_metal}";"#), + ); + + let patched_ggml_metal_path = out_dir.join("ggml-metal.m"); + std::fs::write(&patched_ggml_metal_path, ggml_metal) + .expect("Could not write temporary patched ggml-metal.m"); + + patched_ggml_metal_path + }; + + build.file(ggml_metal_path); +} \ No newline at end of file From 8c61f584e7aa200581b711147e685821190aa025 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 22 Feb 2024 08:25:36 -1000 Subject: [PATCH 3/6] Working build.rs for apple metal --- llama-cpp-sys-2/build.rs | 43 +++++++++++++--------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 4703741f..beb00cc4 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -12,15 +12,13 @@ fn main() { } let mut ggml = cc::Build::new(); - let mut ggml_cuda = if cublas_enabled { Some(cc::Build::new()) } else { None }; - let mut ggml_metal= if cfg!(target_os = "macos") { Some(cc::Build::new()) } else { None }; let mut llama_cpp = cc::Build::new(); ggml.cpp(false); llama_cpp.cpp(true); // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368 - if let Some(ggml_cuda) = &mut ggml_cuda { + if cublas_enabled { for lib in [ "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt", ] { @@ -30,25 +28,22 @@ fn main() { println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); if cfg!(target_arch = "aarch64") { - ggml_cuda - .flag_if_supported("-mfp16-format=ieee") + ggml.flag_if_supported("-mfp16-format=ieee") 
.flag_if_supported("-mno-unaligned-access"); llama_cpp .flag_if_supported("-mfp16-format=ieee") .flag_if_supported("-mno-unaligned-access"); - ggml_cuda - .flag_if_supported("-mfp16-format=ieee") + ggml.flag_if_supported("-mfp16-format=ieee") .flag_if_supported("-mno-unaligned-access"); } - ggml_cuda - .cuda(true) + ggml.cuda(true) .std("c++17") .flag("-arch=all") .file("llama.cpp/ggml-cuda.cu"); ggml.define("GGML_USE_CUBLAS", None); - ggml_cuda.define("GGML_USE_CUBLAS", None); + ggml.define("GGML_USE_CUBLAS", None); llama_cpp.define("GGML_USE_CUBLAS", None); } @@ -56,6 +51,11 @@ fn main() { if cfg!(target_os = "macos") { assert!(!cublas_enabled, "CUBLAS is not supported on macOS"); + println!("cargo:rustc-link-lib=framework=Metal"); + println!("cargo:rustc-link-lib=framework=Foundation"); + println!("cargo:rustc-link-lib=framework=MetalPerformanceShaders"); + println!("cargo:rustc-link-lib=framework=MetalKit"); + llama_cpp.define("_DARWIN_C_SOURCE", None); // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L340-L343 @@ -70,35 +70,21 @@ fn main() { println!("cargo:rustc-link-arg=framework=Foundation"); println!("cargo:rustc-link-arg=framework=Metal"); println!("cargo:rustc-link-arg=framework=MetalKit"); - } - if let Some(ggml_metal) = &mut ggml_metal { - metal_hack(ggml_metal); - ggml_metal - .file("llama.cpp/ggml-metal") - .include("llama.cpp"); + metal_hack(&mut ggml); + ggml.include("./llama.cpp/ggml-metal.h"); } if cfg!(target_os = "dragonfly") { llama_cpp.define("__BSD_VISIBLE", None); } - if let Some(ggml_cuda) = ggml_cuda { - println!("compiling ggml-cuda"); - ggml_cuda.compile("ggml-cuda"); - } - - - if let Some(ggml_metal) = ggml_metal { - println!("compiling ggml-metal"); - ggml_metal.compile("ggml-metal") - } - if cfg!(target_os = "linux") { ggml.define("_GNU_SOURCE", None); } ggml.std("c17") + .include("./llama.cpp") .file("llama.cpp/ggml.c") .file("llama.cpp/ggml-alloc.c") .file("llama.cpp/ggml-backend.c") @@ -136,7 +122,6 @@ fn main() { .expect("failed to write bindings to file"); } - // courtesy of https://github.com/rustformers/llm fn metal_hack(build: &mut cc::Build) { const GGML_METAL_METAL_PATH: &str = "llama.cpp/ggml-metal.metal"; @@ -174,4 +159,4 @@ fn metal_hack(build: &mut cc::Build) { }; build.file(ggml_metal_path); -} \ No newline at end of file +} From eae1f1186059e863fe4a846d36f42c879c1075f1 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 22 Feb 2024 17:15:02 -1000 Subject: [PATCH 4/6] Remove unnecessary flags --- llama-cpp-sys-2/build.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index beb00cc4..42c1e845 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -65,12 +65,6 @@ fn main() { llama_cpp.define("ACCELERATE_LAPACK_ILP64", None); println!("cargo:rustc-link-arg=framework=Accelerate"); - // MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit - // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L509-L511 - println!("cargo:rustc-link-arg=framework=Foundation"); - println!("cargo:rustc-link-arg=framework=Metal"); - println!("cargo:rustc-link-arg=framework=MetalKit"); - metal_hack(&mut ggml); ggml.include("./llama.cpp/ggml-metal.h"); } From 6a9ab152d5b46539164705dbd56558929bb5de84 Mon Sep 17 00:00:00 2001 From: marcus Date: Sun, 25 Feb 2024 10:29:20 -0800 Subject: [PATCH 5/6] fixed cuda --- Cargo.lock | 4 ++-- 
 llama-cpp-sys-2/build.rs | 32 +++++++++++++++++++++---------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b2552fb9..dd4f5a88 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -714,7 +714,7 @@ checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456"
 
 [[package]]
 name = "llama-cpp-2"
-version = "0.1.27"
+version = "0.1.28"
 dependencies = [
  "anyhow",
  "clap",
@@ -728,7 +728,7 @@ dependencies = [
 
 [[package]]
 name = "llama-cpp-sys-2"
-version = "0.1.27"
+version = "0.1.28"
 dependencies = [
  "bindgen",
  "cc",
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index e1693860..c223bd61 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -7,6 +7,8 @@ fn main() {
 
     let cublas_enabled = env::var("CARGO_FEATURE_CUBLAS").is_ok();
 
+    let mut ggml_cuda = if cublas_enabled { Some(cc::Build::new()) } else { None };
+
     if !Path::new("llama.cpp/ggml.c").exists() {
         panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.")
     }
@@ -18,7 +20,7 @@ fn main() {
     llama_cpp.cpp(true);
 
     // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368
-    if cublas_enabled {
+    if let Some(ggml_cuda) = &mut ggml_cuda {
         for lib in [
             "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt",
         ] {
@@ -33,6 +35,9 @@ fn main() {
         println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
 
         if cfg!(target_arch = "aarch64") {
+            ggml_cuda
+                .flag_if_supported("-mfp16-format=ieee")
+                .flag_if_supported("-mno-unaligned-access");
             ggml.flag_if_supported("-mfp16-format=ieee")
                 .flag_if_supported("-mno-unaligned-access");
             llama_cpp
@@ -42,21 +47,22 @@ fn main() {
                 .flag_if_supported("-mno-unaligned-access");
         }
 
-
-        ggml
+        ggml_cuda
             .cuda(true)
-            .std("c++17")
             .flag("-arch=all")
-            .file("llama.cpp/ggml-cuda.cu");
+            .file("llama.cpp/ggml-cuda.cu")
+            .include("llama.cpp");
 
         if ggml_cuda.get_compiler().is_like_msvc() {
+            // someone with windows should check if this works @ cc++11
+            // this case was added when we used c++17 (which was not what llama.cpp used)
             ggml_cuda.std("c++14");
         } else {
-            ggml_cuda.std("c++17");
+            ggml_cuda.std("c++11");
         }
 
         ggml.define("GGML_USE_CUBLAS", None);
-        ggml.define("GGML_USE_CUBLAS", None);
+        ggml_cuda.define("GGML_USE_CUBLAS", None);
         llama_cpp.define("GGML_USE_CUBLAS", None);
     }
 
@@ -90,7 +96,7 @@ fn main() {
         ggml.define("_GNU_SOURCE", None);
     }
 
-    ggml.std("c17")
+    ggml.std("c11")
         .include("./llama.cpp")
         .file("llama.cpp/ggml.c")
         .file("llama.cpp/ggml-alloc.c")
@@ -101,14 +107,22 @@ fn main() {
 
     llama_cpp
         .define("_XOPEN_SOURCE", Some("600"))
         .include("llama.cpp")
-        .std("c++17")
+        .std("c++11")
         .file("llama.cpp/llama.cpp");
 
+    if let Some(ggml_cuda) = ggml_cuda {
+        println!("compiling ggml-cuda");
+        ggml_cuda.compile("ggml-cuda");
+        println!("compiled ggml-cuda");
+    }
+
     println!("compiling ggml");
     ggml.compile("ggml");
+    println!("compiled ggml");
 
     println!("compiling llama");
     llama_cpp.compile("llama");
+    println!("compiled llama");
 
     let header = "llama.cpp/llama.h";

From b6e0bf72e914b45b6ab2c71279dcb514f022824f Mon Sep 17 00:00:00 2001
From: marcus
Date: Sun, 25 Feb 2024 10:49:16 -0800
Subject: [PATCH 6/6] fix for docker build

---
 llama-cpp-sys-2/build.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index c223bd61..b658e0de 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -54,11 +54,11 @@ fn main() {
.include("llama.cpp"); if ggml_cuda.get_compiler().is_like_msvc() { - // someone with windows should check if this works @ cc++11 - // this case was added when we used c++17 (which was not what llama.cpp used) ggml_cuda.std("c++14"); } else { - ggml_cuda.std("c++11"); + ggml_cuda + .flag("-std=c++11") + .std("c++11"); } ggml.define("GGML_USE_CUBLAS", None);