Merge remote-tracking branch 'ggerganov/master' into fix_decoding
* ggerganov/master: (40 commits)
  revert : cmake : set MSVC to use UTF-8 on source files (ggerganov#2346)
  sync : ggml
  ggml: fix ggml_graph_cpy undefined behavior (ggml/943)
  cann : fix doxy (ggml/0)
  vulkan : fix build (llama/0)
  cuda : mark BF16 CONT as unsupported
  ggml : fix cont with transposed tensors when one dimension is 1 (ggml/934)
  cmake : set MSVC to use UTF-8 on source files (ggerganov#2346)
  readme : remove invalid flag from Python example (ggerganov#2396)
  readme : fix link (ggerganov#2394)
  go : add beamsize/entropythold/maxcontext to context interface (ggerganov#2350)
  talk-llama : sync llama.cpp
  whisper : update FA call
  sync : ggml
  sync : vulkan (skip) (llama/0)
  ggml : do not crash when quantizing q4_x_x with an imatrix (llama/9192)
  metal : separate scale and mask from QKT in FA kernel (llama/9189)
  ggml : add SSM Metal kernels (llama/8546)
  metal : gemma2 flash attention support (llama/9159)
  CPU/CUDA: Gemma 2 FlashAttention support (llama/8542)
  ...
bygreencn committed Sep 3, 2024
2 parents b2f5a0a + 5236f02 commit 6c089cd
Showing 68 changed files with 4,777 additions and 2,447 deletions.
3 changes: 2 additions & 1 deletion Makefile
@@ -971,7 +971,8 @@ $(LIB_WHISPER): \
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

$(LIB_WHISPER_S): \
-$(OBJ_WHISPER)
+$(OBJ_WHISPER) \
+$(OBJ_GGML)
ar rcs $(LIB_WHISPER_S) $^

# common
11 changes: 6 additions & 5 deletions README.md
@@ -21,7 +21,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- Support for CPU-only inference
- [Efficient GPU support for NVIDIA](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
-- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
+- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)

Supported platforms:

@@ -33,7 +33,7 @@ Supported platforms:
- [x] [WebAssembly](examples/whisper.wasm)
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168))
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
-- [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
+- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)

The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
@@ -55,8 +55,8 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)

## Implementation details

-- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
-- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
+- The core tensor operations are implemented in C ([ggml.h](ggml/include/ggml.h) / [ggml.c](ggml/src/ggml.c))
+- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](include/whisper.h) / [whisper.cpp](src/whisper.cpp))
- Sample usage is demonstrated in [main.cpp](examples/main)
- Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
- Various other examples are available in the [examples](examples) folder
@@ -751,7 +751,7 @@ took to execute it. The results are summarized in the following Github issue:
[Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
-Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](bench.py).
+Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py).
You can run it with the following command; by default it will run against any standard model in the models folder.
@@ -798,6 +798,7 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
- [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
+- [abdeladim-s/pywhispercpp](https://github.com/abdeladim-s/pywhispercpp) (Pybind11)
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
2 changes: 1 addition & 1 deletion bindings/go/Makefile
@@ -14,7 +14,7 @@ GGML_METAL_PATH_RESOURCES := $(abspath ../..)
BUILD_DIR := build
MODELS_DIR := models
EXAMPLES_DIR := $(wildcard examples/*)
-INCLUDE_PATH := $(abspath ../..)
+INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
LIBRARY_PATH := $(abspath ../..)

ifeq ($(UNAME_S),Darwin)
14 changes: 14 additions & 0 deletions bindings/go/params.go
@@ -115,6 +115,18 @@ func (p *Params) SetAudioCtx(n int) {
p.audio_ctx = C.int(n)
}

+func (p *Params) SetMaxContext(n int) {
+	p.n_max_text_ctx = C.int(n)
+}
+
+func (p *Params) SetBeamSize(n int) {
+	p.beam_search.beam_size = C.int(n)
+}
+
+func (p *Params) SetEntropyThold(t float32) {
+	p.entropy_thold = C.float(t)
+}
+
// Set initial prompt
func (p *Params) SetInitialPrompt(prompt string) {
p.initial_prompt = C.CString(prompt)
@@ -145,6 +157,8 @@ func (p *Params) String() string {
str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
+str += fmt.Sprintf(" entropy_thold=%f", p.entropy_thold)
+str += fmt.Sprintf(" beam_size=%d", p.beam_search.beam_size)
if p.translate {
str += " translate"
}
15 changes: 15 additions & 0 deletions bindings/go/pkg/whisper/context.go
@@ -125,6 +125,21 @@ func (context *context) SetAudioCtx(n uint) {
context.params.SetAudioCtx(int(n))
}

+// Set maximum number of text context tokens to store
+func (context *context) SetMaxContext(n int) {
+	context.params.SetMaxContext(n)
+}
+
+// Set Beam Size
+func (context *context) SetBeamSize(n int) {
+	context.params.SetBeamSize(n)
+}
+
+// Set Entropy threshold
+func (context *context) SetEntropyThold(t float32) {
+	context.params.SetEntropyThold(t)
+}
+
// Set initial prompt
func (context *context) SetInitialPrompt(prompt string) {
context.params.SetInitialPrompt(prompt)
3 changes: 3 additions & 0 deletions bindings/go/pkg/whisper/interface.go
@@ -48,6 +48,9 @@ type Context interface {
SetTokenTimestamps(bool) // Set token timestamps flag
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
SetAudioCtx(uint) // Set audio encoder context
+SetMaxContext(n int) // Set maximum number of text context tokens to store
+SetBeamSize(n int) // Set Beam Size
+SetEntropyThold(t float32) // Set Entropy threshold
SetInitialPrompt(prompt string) // Set initial prompt

// Process mono audio data and return any errors.
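
The three new Go setters map directly onto existing whisper_full_params fields in whisper.h, as the params.go diff above shows. A minimal C-style sketch of the underlying API they wrap (the values here are illustrative, not defaults):

    #include <whisper.h>

    int main(void) {
        // Beam-search sampling is what makes beam_search.beam_size take effect.
        struct whisper_full_params p =
            whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);

        p.n_max_text_ctx        = 64;   // Go: context.SetMaxContext(64)
        p.beam_search.beam_size = 5;    // Go: context.SetBeamSize(5)
        p.entropy_thold         = 2.4f; // Go: context.SetEntropyThold(2.4)

        // ... initialize a context from a model file and run whisper_full()
        // with these params as usual.
        return 0;
    }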
2 changes: 1 addition & 1 deletion bindings/go/whisper.go
@@ -9,7 +9,7 @@ import (
// CGO

/*
-#cgo LDFLAGS: -lwhisper -lm -lstdc++
+#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
#include <whisper.h>
#include <stdlib.h>
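
The added -fopenmp matches ggml's OpenMP-based CPU threading (the default in builds of this period), so a statically linked libwhisper needs the OpenMP runtime at link time. A tiny standalone demo of that dependency (a sketch; compile with g++ -fopenmp demo.cpp):

    #include <cstdio>
    #include <omp.h>

    int main() {
        // Without -fopenmp the pragmas are ignored and this runs
        // single-threaded; with it, one thread prints the team size.
        #pragma omp parallel
        {
            #pragma omp single
            std::printf("OpenMP threads: %d\n", omp_get_num_threads());
        }
        return 0;
    }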
2 changes: 1 addition & 1 deletion examples/python/whisper_processor.py
@@ -21,7 +21,7 @@ def process_audio(wav_file, model_name="base.en"):
if not os.path.exists(wav_file):
raise FileNotFoundError(f"WAV file not found: {wav_file}")

-full_command = f"./main -m {model} -f {wav_file} -np -nt"
+full_command = f"./main -m {model} -f {wav_file} -nt"

# Execute the command
process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
21 changes: 21 additions & 0 deletions examples/talk-llama/llama-impl.h
@@ -24,3 +24,24 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)

+//
+// helpers
+//
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
+}
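
This is the replace_all helper previously defined in llama-vocab.cpp (removed there below); it builds the result in a single forward pass into a pre-reserved buffer instead of repeated substr concatenations, and returns early on an empty search string. A small usage sketch, assuming this header is includable on its own:

    #include "llama-impl.h"

    #include <cassert>
    #include <string>

    int main() {
        std::string s = "hello world, hello ggml";
        replace_all(s, "hello", "hi");
        assert(s == "hi world, hi ggml");

        // An empty search string is a no-op rather than an error or a hang.
        replace_all(s, "", "x");
        assert(s == "hi world, hi ggml");
        return 0;
    }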
4 changes: 2 additions & 2 deletions examples/talk-llama/llama-sampling.cpp
@@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra
constexpr float bucket_low = -10.0f;
constexpr float bucket_high = 10.0f;
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
-constexpr float bucker_inter = -bucket_low * bucket_scale;
+constexpr float bucket_inter = -bucket_low * bucket_scale;

std::vector<int> bucket_idx(candidates->size);
std::vector<int> histo(nbuckets, 0);

for (int i = 0; i < (int)candidates->size; ++i) {
const float val = candidates->data[i].logit;
-int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
ib = std::max(0, std::min(nbuckets-1, ib));
bucket_idx[i] = ib;
++histo[ib];
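
The renamed bucket_inter (fixing the "bucker" typo) is the intercept of the affine map from logit value to histogram bucket, so bucket_scale * val + bucket_inter equals nbuckets * (val - bucket_low) / (bucket_high - bucket_low). A standalone sketch of the bucketing step; nbuckets here is an assumption for the demo, the real value is defined earlier in llama_sample_top_k_impl:

    #include <algorithm>
    #include <cstdio>

    int main() {
        constexpr int   nbuckets     = 128; // assumed for this demo
        constexpr float bucket_low   = -10.0f;
        constexpr float bucket_high  =  10.0f;
        constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
        constexpr float bucket_inter = -bucket_low * bucket_scale;

        // A logit of 0.0 lands in the middle bucket; out-of-range logits clamp.
        const float val = 0.0f;
        int ib = int(bucket_scale * val + bucket_inter);
        ib = std::max(0, std::min(nbuckets - 1, ib));
        std::printf("logit %.1f -> bucket %d of %d\n", val, ib, nbuckets); // bucket 64
        return 0;
    }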
41 changes: 22 additions & 19 deletions examples/talk-llama/llama-vocab.cpp
@@ -16,20 +16,6 @@
// helpers
//

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}

LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
va_list ap;
@@ -335,6 +321,21 @@ struct llm_tokenizer_spm {

// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused

+template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
+class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
+public:
+    using std::priority_queue<T, Container, Compare>::priority_queue;
+
+    T pop_move() {
+        T item = std::move(this->c.front());
+        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
+        this->c.pop_back();
+        return item;
+    }
+
+    void pop() = delete;
+};

struct llm_bigram_bpe {
struct comparator {
bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
@@ -343,7 +344,7 @@ struct llm_bigram_bpe {
};

using queue_storage = std::vector<llm_bigram_bpe>;
-using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
+using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
llm_symbol::index left;
llm_symbol::index right;
std::string text;
@@ -402,6 +403,7 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+case LLAMA_VOCAB_PRE_TYPE_EXAONE:
regex_exprs = {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -424,6 +426,8 @@ struct llm_tokenizer_bpe {
};
break;
case LLAMA_VOCAB_PRE_TYPE_PORO:
+case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
regex_exprs = {
" ?[^(\\s|.,!?…。,、।۔،)]+",
};
@@ -531,8 +535,7 @@ struct llm_tokenizer_bpe {

// build token(s)
while (!work_queue.empty()) {
-auto bigram = work_queue.top();
-work_queue.pop();
+auto bigram = work_queue.pop_move();

auto & left_symbol = symbols[bigram.left];
auto & right_symbol = symbols[bigram.right];
@@ -1480,11 +1483,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
return vocab.special_pad_id;
}

-int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
return vocab.tokenizer_add_bos;
}

-int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
return vocab.tokenizer_add_eos;
}

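
The new llama_priority_queue exists because std::priority_queue::top() returns a const reference, so the old top()-then-pop() pattern copied each bigram out of the queue; pop_move() reaches into the protected container c and comparator comp to move the top element out instead. A standalone sketch of the same class with a string payload (the class body is copied from the diff above so this compiles alone):

    #include <algorithm>
    #include <iostream>
    #include <queue>
    #include <string>
    #include <utility>
    #include <vector>

    template<typename T, typename Container = std::vector<T>,
             typename Compare = std::less<typename Container::value_type>>
    class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
    public:
        using std::priority_queue<T, Container, Compare>::priority_queue;

        T pop_move() {
            T item = std::move(this->c.front()); // the heap's top lives at front()
            std::pop_heap(this->c.begin(), this->c.end(), this->comp);
            this->c.pop_back();
            return item;
        }

        void pop() = delete; // force callers through pop_move()
    };

    int main() {
        llama_priority_queue<std::string> q;
        q.push("bigram he");
        q.push("bigram ll");
        q.push("bigram lo");

        while (!q.empty()) {
            std::string top = q.pop_move(); // moves the string out, no copy
            std::cout << top << '\n';
        }
        return 0;
    }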
4 changes: 2 additions & 2 deletions examples/talk-llama/llama-vocab.h
@@ -95,8 +95,8 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);

-int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
-int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab);

llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
(Diffs for the remaining 56 of 68 changed files not shown.)
