From 242e5a7da920ada78c6866ac7c540504a6941a36 Mon Sep 17 00:00:00 2001
From: marcus
Date: Sat, 3 Feb 2024 13:40:12 -0800
Subject: [PATCH 1/6] attempt to add metal on mac

---
 llama-cpp-sys-2/Cargo.toml |  1 +
 llama-cpp-sys-2/build.rs   | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml
index 2acbe5ac..5e8dd230 100644
--- a/llama-cpp-sys-2/Cargo.toml
+++ b/llama-cpp-sys-2/Cargo.toml
@@ -43,3 +43,4 @@ cc = { workspace = true }
 
 [features]
 cublas = []
+
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 720bc4cb..25d178bb 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -51,7 +51,28 @@ fn main() {
 
     // https://github.com/ggerganov/llama.cpp/blob/191221178f51b6e81122c5bda0fd79620e547d07/Makefile#L133-L141
     if cfg!(target_os = "macos") {
+        assert!(!cublas_enabled, "CUBLAS is not supported on macOS");
+
         llama_cpp.define("_DARWIN_C_SOURCE", None);
+
+        // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L340-L343
+        llama_cpp.define("GGML_USE_METAL", None);
+        llama_cpp.define("GGML_USE_ACCELERATE", None);
+        llama_cpp.define("ACCELERATE_NEW_LAPACK", None);
+        llama_cpp.define("ACCELERATE_LAPACK_ILP64", None);
+        println!("cargo:rustc-link-lib=framework=Accelerate");
+
+        // MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
+        // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L509-L511
+        println!("cargo:rustc-link-lib=framework Foundation");
+        println!("cargo:rustc-link-lib=framework Metal");
+        println!("cargo:rustc-link-lib=framework MetalKit");
+
+
+        // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L517-L520
+        ggml
+            .file("llama.cpp/ggml-metal.m")
+            .file("llama.cpp/ggml-metal.h");
     }
     if cfg!(target_os = "dragonfly") {
         llama_cpp.define("__BSD_VISIBLE", None);

From 8a73403571626c99ceed73fc339abdc478cd7e1f Mon Sep 17 00:00:00 2001
From: Marcus Dunn
Date: Sun, 4 Feb 2024 20:31:55 -0800
Subject: [PATCH 2/6] failing to link due to "mach-o"

---
 llama-cpp-2/Cargo.toml              |  4 ++
 llama-cpp-2/benches/generate.rs     | 61 ++++++++++++++++++++++++++
 llama-cpp-2/benches/grammar_bias.rs |  2 +-
 llama-cpp-sys-2/build.rs            | 68 +++++++++++++++++++++++++----
 4 files changed, 125 insertions(+), 10 deletions(-)
 create mode 100644 llama-cpp-2/benches/generate.rs

diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml
index c2605a43..eedceb16 100644
--- a/llama-cpp-2/Cargo.toml
+++ b/llama-cpp-2/Cargo.toml
@@ -26,6 +26,10 @@ anyhow = "1.0.79"
 name = "grammar_bias"
 harness = false
 
+[[bench]]
+name = "generate"
+harness = false
+
 [features]
 cublas = ["llama-cpp-sys-2/cublas"]
 
diff --git a/llama-cpp-2/benches/generate.rs b/llama-cpp-2/benches/generate.rs
new file mode 100644
index 00000000..c400d102
--- /dev/null
+++ b/llama-cpp-2/benches/generate.rs
@@ -0,0 +1,61 @@
+use anyhow::Context;
+use criterion::{Criterion, criterion_group, criterion_main};
+use pprof::criterion::{Output, PProfProfiler};
+use llama_cpp_2::context::params::LlamaContextParams;
+use llama_cpp_2::llama_backend::LlamaBackend;
+use llama_cpp_2::llama_batch::LlamaBatch;
+use llama_cpp_2::model::{AddBos, LlamaModel};
+use llama_cpp_2::model::params::LlamaModelParams;
+use llama_cpp_2::token::data_array::LlamaTokenDataArray;
+
+fn generate(c: &mut Criterion) {
+    let api = hf_hub::api::sync::ApiBuilder::new()
+        .with_progress(true)
+        .build()
+        .unwrap();
+    let file = api
.model("TheBloke/Llama-2-7B-Chat-GGUF".to_string()) + .get("llama-2-7b-chat.Q4_K_M.gguf") + .unwrap(); + let backend = LlamaBackend::init().unwrap(); + let model_params = LlamaModelParams::default(); + let model = LlamaModel::load_from_file(&backend, &file, &model_params).unwrap(); + let mut ctx = model + .new_context(&backend, LlamaContextParams::default()) + .unwrap(); + + c.bench_function("generate 50 tokens", |b| { + b.iter(|| { + let tokens_list = model.str_to_token("Hello, my name is", AddBos::Always).unwrap(); + let mut n_ctx = tokens_list.len() as i32; + let mut batch = LlamaBatch::new(512, 1); + let last_index: i32 = (tokens_list.len() - 1) as i32; + for (i, token) in (0_i32..).zip(tokens_list.into_iter()) { + let is_last = i == last_index; + batch.add(token, i, &[0], is_last).unwrap(); + } + ctx.decode(&mut batch).unwrap(); + + for _ in 0..50 { + let candidates = ctx.candidates_ith(batch.n_tokens() - 1); + let candidates_p = LlamaTokenDataArray::from_iter(candidates, false); + let new_token_id = ctx.sample_token_greedy(candidates_p); + if new_token_id == model.token_eos() { + break; + } + batch.clear(); + batch.add(new_token_id, n_ctx, &[0], true).unwrap(); + n_ctx += 1; + ctx.decode(&mut batch).unwrap(); + } + ctx.clear_kv_cache_seq(0, None, None) + }); + }); +} + +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = generate +); +criterion_main!(benches); \ No newline at end of file diff --git a/llama-cpp-2/benches/grammar_bias.rs b/llama-cpp-2/benches/grammar_bias.rs index 23681ab0..25fd90df 100644 --- a/llama-cpp-2/benches/grammar_bias.rs +++ b/llama-cpp-2/benches/grammar_bias.rs @@ -32,7 +32,7 @@ fn criterion_benchmark(c: &mut Criterion) { let model_params = LlamaModelParams::default(); let model = LlamaModel::load_from_file(&backend, &file, &model_params).unwrap(); let mut ctx = model - .new_context(&backend, &LlamaContextParams::default()) + .new_context(&backend, LlamaContextParams::default()) .unwrap(); let grammar = LlamaGrammar::from_str(include_str!("../src/grammar/json.gbnf")).unwrap(); diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 25d178bb..20f9b17a 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -13,6 +13,7 @@ fn main() { let mut ggml = cc::Build::new(); let mut ggml_cuda = if cublas_enabled { Some(cc::Build::new()) } else { None }; + let mut ggml_metal= if cfg!(target_os = "macos") { Some(cc::Build::new()) } else { None }; let mut llama_cpp = cc::Build::new(); ggml.cpp(false); @@ -60,20 +61,22 @@ fn main() { llama_cpp.define("GGML_USE_ACCELERATE", None); llama_cpp.define("ACCELERATE_NEW_LAPACK", None); llama_cpp.define("ACCELERATE_LAPACK_ILP64", None); - println!("cargo:rustc-link-lib=framework=Accelerate"); + println!("cargo:rustc-link-arg=framework=Accelerate"); // MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L509-L511 - println!("cargo:rustc-link-lib=framework Foundation"); - println!("cargo:rustc-link-lib=framework Metal"); - println!("cargo:rustc-link-lib=framework MetalKit"); - + println!("cargo:rustc-link-arg=framework=Foundation"); + println!("cargo:rustc-link-arg=framework=Metal"); + println!("cargo:rustc-link-arg=framework=MetalKit"); + } - // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L517-L520 - ggml - .file("llama.cpp/ggml-metal.m") - 
.file("llama.cpp/ggml-metal.h"); + if let Some(ggml_metal) = &mut ggml_metal { + metal_hack(ggml_metal); + ggml_metal + .file("llama.cpp/ggml-metal") + .include("llama.cpp"); } + if cfg!(target_os = "dragonfly") { llama_cpp.define("__BSD_VISIBLE", None); } @@ -83,6 +86,12 @@ fn main() { ggml_cuda.compile("ggml-cuda"); } + + if let Some(ggml_metal) = ggml_metal { + println!("compiling ggml-metal"); + ggml_metal.compile("ggml-metal") + } + if cfg!(target_os = "linux") { ggml.define("_GNU_SOURCE", None); } @@ -97,6 +106,7 @@ fn main() { llama_cpp .define("_XOPEN_SOURCE", Some("600")) + .include("llama.cpp") .std("c++17") .file("llama.cpp/llama.cpp"); @@ -124,3 +134,43 @@ fn main() { .write_to_file(out_path.join("bindings.rs")) .expect("failed to write bindings to file"); } + + +// courtesy of https://github.com/rustformers/llm +fn metal_hack(build: &mut cc::Build) { + const GGML_METAL_METAL_PATH: &str = "llama.cpp/ggml-metal.metal"; + const GGML_METAL_PATH: &str = "llama.cpp/ggml-metal.m"; + + let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is not defined")); + + let ggml_metal_path = { + let ggml_metal_metal = std::fs::read_to_string(GGML_METAL_METAL_PATH) + .expect("Could not read ggml-metal.metal") + .replace('\\', "\\\\") + .replace('\n', "\\n") + .replace('\r', "\\r") + .replace('\"', "\\\""); + + let ggml_metal = + std::fs::read_to_string(GGML_METAL_PATH).expect("Could not read ggml-metal.m"); + + let needle = r#"NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];"#; + if !ggml_metal.contains(needle) { + panic!("ggml-metal.m does not contain the needle to be replaced; the patching logic needs to be reinvestigated. Contact a `llama-cpp-sys-2` developer!"); + } + + // Replace the runtime read of the file with a compile-time string + let ggml_metal = ggml_metal.replace( + needle, + &format!(r#"NSString * src = @"{ggml_metal_metal}";"#), + ); + + let patched_ggml_metal_path = out_dir.join("ggml-metal.m"); + std::fs::write(&patched_ggml_metal_path, ggml_metal) + .expect("Could not write temporary patched ggml-metal.m"); + + patched_ggml_metal_path + }; + + build.file(ggml_metal_path); +} \ No newline at end of file From 8c61f584e7aa200581b711147e685821190aa025 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 22 Feb 2024 08:25:36 -1000 Subject: [PATCH 3/6] Working build.rs for apple metal --- llama-cpp-sys-2/build.rs | 43 +++++++++++++--------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 4703741f..beb00cc4 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -12,15 +12,13 @@ fn main() { } let mut ggml = cc::Build::new(); - let mut ggml_cuda = if cublas_enabled { Some(cc::Build::new()) } else { None }; - let mut ggml_metal= if cfg!(target_os = "macos") { Some(cc::Build::new()) } else { None }; let mut llama_cpp = cc::Build::new(); ggml.cpp(false); llama_cpp.cpp(true); // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368 - if let Some(ggml_cuda) = &mut ggml_cuda { + if cublas_enabled { for lib in [ "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt", ] { @@ -30,25 +28,22 @@ fn main() { println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); if cfg!(target_arch = "aarch64") { - ggml_cuda - .flag_if_supported("-mfp16-format=ieee") + ggml.flag_if_supported("-mfp16-format=ieee") 
.flag_if_supported("-mno-unaligned-access"); llama_cpp .flag_if_supported("-mfp16-format=ieee") .flag_if_supported("-mno-unaligned-access"); - ggml_cuda - .flag_if_supported("-mfp16-format=ieee") + ggml.flag_if_supported("-mfp16-format=ieee") .flag_if_supported("-mno-unaligned-access"); } - ggml_cuda - .cuda(true) + ggml.cuda(true) .std("c++17") .flag("-arch=all") .file("llama.cpp/ggml-cuda.cu"); ggml.define("GGML_USE_CUBLAS", None); - ggml_cuda.define("GGML_USE_CUBLAS", None); + ggml.define("GGML_USE_CUBLAS", None); llama_cpp.define("GGML_USE_CUBLAS", None); } @@ -56,6 +51,11 @@ fn main() { if cfg!(target_os = "macos") { assert!(!cublas_enabled, "CUBLAS is not supported on macOS"); + println!("cargo:rustc-link-lib=framework=Metal"); + println!("cargo:rustc-link-lib=framework=Foundation"); + println!("cargo:rustc-link-lib=framework=MetalPerformanceShaders"); + println!("cargo:rustc-link-lib=framework=MetalKit"); + llama_cpp.define("_DARWIN_C_SOURCE", None); // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L340-L343 @@ -70,35 +70,21 @@ fn main() { println!("cargo:rustc-link-arg=framework=Foundation"); println!("cargo:rustc-link-arg=framework=Metal"); println!("cargo:rustc-link-arg=framework=MetalKit"); - } - if let Some(ggml_metal) = &mut ggml_metal { - metal_hack(ggml_metal); - ggml_metal - .file("llama.cpp/ggml-metal") - .include("llama.cpp"); + metal_hack(&mut ggml); + ggml.include("./llama.cpp/ggml-metal.h"); } if cfg!(target_os = "dragonfly") { llama_cpp.define("__BSD_VISIBLE", None); } - if let Some(ggml_cuda) = ggml_cuda { - println!("compiling ggml-cuda"); - ggml_cuda.compile("ggml-cuda"); - } - - - if let Some(ggml_metal) = ggml_metal { - println!("compiling ggml-metal"); - ggml_metal.compile("ggml-metal") - } - if cfg!(target_os = "linux") { ggml.define("_GNU_SOURCE", None); } ggml.std("c17") + .include("./llama.cpp") .file("llama.cpp/ggml.c") .file("llama.cpp/ggml-alloc.c") .file("llama.cpp/ggml-backend.c") @@ -136,7 +122,6 @@ fn main() { .expect("failed to write bindings to file"); } - // courtesy of https://github.com/rustformers/llm fn metal_hack(build: &mut cc::Build) { const GGML_METAL_METAL_PATH: &str = "llama.cpp/ggml-metal.metal"; @@ -174,4 +159,4 @@ fn metal_hack(build: &mut cc::Build) { }; build.file(ggml_metal_path); -} \ No newline at end of file +} From eae1f1186059e863fe4a846d36f42c879c1075f1 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 22 Feb 2024 17:15:02 -1000 Subject: [PATCH 4/6] Remove unnecessary flags --- llama-cpp-sys-2/build.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index beb00cc4..42c1e845 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -65,12 +65,6 @@ fn main() { llama_cpp.define("ACCELERATE_LAPACK_ILP64", None); println!("cargo:rustc-link-arg=framework=Accelerate"); - // MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit - // https://github.com/ggerganov/llama.cpp/blob/3c0d25c4756742ebf15ad44700fabc0700c638bd/Makefile#L509-L511 - println!("cargo:rustc-link-arg=framework=Foundation"); - println!("cargo:rustc-link-arg=framework=Metal"); - println!("cargo:rustc-link-arg=framework=MetalKit"); - metal_hack(&mut ggml); ggml.include("./llama.cpp/ggml-metal.h"); } From 6a9ab152d5b46539164705dbd56558929bb5de84 Mon Sep 17 00:00:00 2001 From: marcus Date: Sun, 25 Feb 2024 10:29:20 -0800 Subject: [PATCH 5/6] fixed cuda --- Cargo.lock | 4 ++-- 
 llama-cpp-sys-2/build.rs | 32 +++++++++++++++++++++---------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b2552fb9..dd4f5a88 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -714,7 +714,7 @@ checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456"
 
 [[package]]
 name = "llama-cpp-2"
-version = "0.1.27"
+version = "0.1.28"
 dependencies = [
  "anyhow",
  "clap",
@@ -728,7 +728,7 @@ dependencies = [
 
 [[package]]
 name = "llama-cpp-sys-2"
-version = "0.1.27"
+version = "0.1.28"
 dependencies = [
  "bindgen",
  "cc",
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index e1693860..c223bd61 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -7,6 +7,8 @@ fn main() {
 
     let cublas_enabled = env::var("CARGO_FEATURE_CUBLAS").is_ok();
 
+    let mut ggml_cuda = if cublas_enabled { Some(cc::Build::new()) } else { None };
+
     if !Path::new("llama.cpp/ggml.c").exists() {
         panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.")
     }
@@ -18,7 +20,7 @@ fn main() {
     llama_cpp.cpp(true);
 
     // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368
-    if cublas_enabled {
+    if let Some(ggml_cuda) = &mut ggml_cuda {
         for lib in [
             "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt",
         ] {
@@ -33,6 +35,9 @@ fn main() {
         println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
 
         if cfg!(target_arch = "aarch64") {
+            ggml_cuda
+                .flag_if_supported("-mfp16-format=ieee")
+                .flag_if_supported("-mno-unaligned-access");
             ggml.flag_if_supported("-mfp16-format=ieee")
                 .flag_if_supported("-mno-unaligned-access");
             llama_cpp
@@ -42,21 +47,22 @@ fn main() {
                 .flag_if_supported("-mno-unaligned-access");
         }
 
-
-        ggml
+        ggml_cuda
             .cuda(true)
-            .std("c++17")
             .flag("-arch=all")
-            .file("llama.cpp/ggml-cuda.cu");
+            .file("llama.cpp/ggml-cuda.cu")
+            .include("llama.cpp");
 
         if ggml_cuda.get_compiler().is_like_msvc() {
+            // someone with windows should check if this works @ cc++11
+            // this case was added when we used c++17 (which was not what llama.cpp used)
             ggml_cuda.std("c++14");
         } else {
-            ggml_cuda.std("c++17");
+            ggml_cuda.std("c++11");
         }
 
         ggml.define("GGML_USE_CUBLAS", None);
-        ggml.define("GGML_USE_CUBLAS", None);
+        ggml_cuda.define("GGML_USE_CUBLAS", None);
         llama_cpp.define("GGML_USE_CUBLAS", None);
     }
 
@@ -90,7 +96,7 @@ fn main() {
         ggml.define("_GNU_SOURCE", None);
     }
 
-    ggml.std("c17")
+    ggml.std("c11")
         .include("./llama.cpp")
         .file("llama.cpp/ggml.c")
         .file("llama.cpp/ggml-alloc.c")
@@ -101,14 +107,22 @@ fn main() {
 
     llama_cpp
         .define("_XOPEN_SOURCE", Some("600"))
         .include("llama.cpp")
-        .std("c++17")
+        .std("c++11")
         .file("llama.cpp/llama.cpp");
 
+    if let Some(ggml_cuda) = ggml_cuda {
+        println!("compiling ggml-cuda");
+        ggml_cuda.compile("ggml-cuda");
+        println!("compiled ggml-cuda");
+    }
+
     println!("compiling ggml");
     ggml.compile("ggml");
+    println!("compiled ggml");
 
     println!("compiling llama");
     llama_cpp.compile("llama");
+    println!("compiled llama");
 
     let header = "llama.cpp/llama.h";

From b6e0bf72e914b45b6ab2c71279dcb514f022824f Mon Sep 17 00:00:00 2001
From: marcus
Date: Sun, 25 Feb 2024 10:49:16 -0800
Subject: [PATCH 6/6] fix for docker build

---
 llama-cpp-sys-2/build.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index c223bd61..b658e0de 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -54,11 +54,11 @@ fn main() {
.include("llama.cpp"); if ggml_cuda.get_compiler().is_like_msvc() { - // someone with windows should check if this works @ cc++11 - // this case was added when we used c++17 (which was not what llama.cpp used) ggml_cuda.std("c++14"); } else { - ggml_cuda.std("c++11"); + ggml_cuda + .flag("-std=c++11") + .std("c++11"); } ggml.define("GGML_USE_CUBLAS", None);