
Bring talk example up to date using the latest gpt-2 impl from ggml #2384

Open · shivghai wants to merge 1 commit into master

Conversation

@shivghai commented Aug 25, 2024

https://github.com/users/ggerganov/projects/7?pane=issue&itemId=51613670

My attempt at porting the talk example over to the latest gpt-2 implementation from ggml. I'm a ggml + whisper.cpp noob, but I wanted to get this working, and it looks like it works now.

A few notes:

  • Sets the defaults for n_ctx (=2048) and n_gpu_layers (=0) to match those in https://github.com/ggerganov/ggml/blob/master/examples/common.h#L19
  • I ran chmod +x on examples/talk/speak to get this to work, though I'm not sure that checking the file in that way is good practice
  • Code style is definitely off: gpt-2.h currently declares a lot of internals, which I'm happy to clean up pending feedback. Much of it seems necessary because talk.cpp needs the full definition of gpt2_context to call allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ctx_gpt->model.backend)); an incomplete forward declaration is not enough (see the sketch after this list)
  • Does not update talk.wasm (yet)
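
For reference, here's a rough sketch of why the forward declaration alone isn't enough (using the names from this PR):

    // gpt-2.h: pointer-only use compiles fine against an incomplete type
    struct gpt2_context;                                      // forward declaration
    struct gpt2_context * gpt2_init(const char * path_model);
    void gpt2_free(struct gpt2_context * ctx);

    // talk.cpp: but dereferencing the pointer needs the full definition;
    // this fails to compile (member access into an incomplete type):
    //   allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ctx_gpt->model.backend));

So either gpt2_context stays fully defined in gpt-2.h (what this PR does), or the allocator setup moves into gpt-2.cpp next to the struct definition.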

Tested on an M1 MacBook Pro.

@ggerganov (Owner)

Thanks for looking into this and sorry for the long delay.

You can hide the internals with something like this:

diff --git a/examples/talk/gpt-2.cpp b/examples/talk/gpt-2.cpp
index f1638e8..f07b01e 100644
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@@ -1,10 +1,9 @@
+#include "gpt-2.h"
+
 #include "ggml.h"
-#include "common-ggml.h"
 #include "ggml-backend.h"
 #include "ggml-alloc.h"
 
-#include "gpt-2.h"
-
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif
@@ -13,6 +12,7 @@
 #include "ggml-metal.h"
 #endif
 
+#include <map>
 
 #define GPT2_MAX_NODES 4096
 
@@ -23,10 +23,100 @@ static void ggml_log_callback_default(ggml_log_level level, const char * text, v
     fflush(stderr);
 }
 
+struct gpt2_layer {
+    // normalization
+    struct ggml_tensor * ln_1_g;
+    struct ggml_tensor * ln_1_b;
+
+    struct ggml_tensor * ln_2_g;
+    struct ggml_tensor * ln_2_b;
+
+    // attention
+    struct ggml_tensor * c_attn_attn_w;
+    struct ggml_tensor * c_attn_attn_b;
+
+    struct ggml_tensor * c_attn_proj_w;
+    struct ggml_tensor * c_attn_proj_b;
+
+    // mlp
+    struct ggml_tensor * c_mlp_fc_w;
+    struct ggml_tensor * c_mlp_fc_b;
+
+    struct ggml_tensor * c_mlp_proj_w;
+    struct ggml_tensor * c_mlp_proj_b;
+};
+
+constexpr int N_THREAD = 8;
+
+// default hparams (GPT-2 117M)
+struct gpt2_hparams {
+    int32_t n_vocab = 50257;
+    int32_t n_ctx   = 1024;
+    int32_t n_embd  = 768;
+    int32_t n_head  = 12;
+    int32_t n_layer = 12;
+    int32_t ftype   = 1;
+    float   eps     = 1e-5f;
+};
+
+struct gpt2_model {
+    gpt2_hparams hparams;
+
+    // normalization
+    struct ggml_tensor * ln_f_g;
+    struct ggml_tensor * ln_f_b;
+
+    struct ggml_tensor * wte;     // token embedding
+    struct ggml_tensor * wpe;     // position embedding
+    struct ggml_tensor * lm_head; // language model head
+
+    std::vector<gpt2_layer> layers;
 
+    // key + value memory
+    struct ggml_tensor * memory_k;
+    struct ggml_tensor * memory_v;
+
+    //
+    struct ggml_context * ctx_w;
+    struct ggml_context * ctx_kv;
+
+    ggml_backend* backend = NULL;
+
+    ggml_backend_buffer * buffer_w;
+    ggml_backend_buffer * buffer_kv;
+
+    std::map<std::string, struct ggml_tensor *> tensors;
+};
+
+
+struct gpt2_context {
+    std::string prompt_base = R"(Hello, how are you?
+I'm fine, thanks. How are you?
+Thanks, I'm fine too. What are you doing?
+I'm just sitting here.
+It's a lovely day, isn't it?
+Yes, it is. I love the weather this time of year.
+I wish it would rain a little bit.
+Me too.
+)";
+
+    std::mt19937 rng;
+
+    gpt_vocab vocab;
+    gpt2_model model;
+
+    ggml_gallocr_t allocr = NULL;
+
+    int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
+
+    // sampling parameters
+    int32_t top_k = 5;
+    float   top_p = 0.9f;
+    float   temp  = 1.0f;
+};
 
 // load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_ctx = 2048, int n_gpu_layers = 0) {
+static bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_ctx = 2048, int n_gpu_layers = 0) {
     printf("%s: loading model from '%s'\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
@@ -386,12 +476,16 @@ struct gpt2_context * gpt2_init(const char * path_model) {
         printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
     }
 
+    // create a graph allocator with the backend's default buffer type
+    ctx->allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ctx->model.backend));
+
     return ctx;
 }
 
 
 
 void gpt2_free(struct gpt2_context * ctx) {
+    ggml_gallocr_free(ctx->allocr);
     delete ctx;
 }
 
@@ -409,7 +503,7 @@ std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char *
 
 
 // build the computation graph
-struct ggml_cgraph * gpt2_graph(
+static struct ggml_cgraph * gpt2_graph(
         const gpt2_model & model,
         const int n_past,
         const int n_tokens) {
@@ -697,12 +791,14 @@ struct ggml_cgraph * gpt2_graph(
 //   - embd_w:    the predicted logits for the next token
 //
 bool gpt2_eval(
-        const gpt2_model & model,
-        ggml_gallocr_t allocr,
+        const gpt2_context * ctx,
         const int n_threads,
         const int n_past,
         const std::vector<gpt_vocab::id> & embd_inp,
               std::vector<float>         & embd_w) {
+    auto & model = ctx->model;
+    auto & allocr = ctx->allocr;
+
     const int N = embd_inp.size();
 
     const auto & hparams = model.hparams;
@@ -757,7 +853,7 @@ bool gpt2_eval(
 }
 
 
-std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens, ggml_gallocr* allocr) {
+std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
     int n_past = 0;
 
     std::vector<float> embd_w;
@@ -774,7 +870,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens,
     for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
 
         if (!embd.empty()) {
-            if (!gpt2_eval(ctx->model, allocr, ctx->n_threads, n_past, embd, embd_w)) {
+            if (!gpt2_eval(ctx, ctx->n_threads, n_past, embd, embd_w)) {
                 printf("gpt-2: failed to generate text\n");
                 return "";
             }
diff --git a/examples/talk/gpt-2.h b/examples/talk/gpt-2.h
index dc5cbdb..8a8027c 100644
--- a/examples/talk/gpt-2.h
+++ b/examples/talk/gpt-2.h
@@ -5,103 +5,9 @@
 #include "common.h"
 
 #include <vector>
-#include <map>
 #include <string>
-#include <regex>
 
-
-struct gpt2_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-constexpr int N_THREAD = 8;
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx_w;
-    struct ggml_context * ctx_kv;
-
-    ggml_backend* backend = NULL;
-
-    ggml_backend_buffer * buffer_w;
-    ggml_backend_buffer * buffer_kv;
-
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-
-struct gpt2_context {
-    std::string prompt_base = R"(Hello, how are you?
-I'm fine, thanks. How are you?
-Thanks, I'm fine too. What are you doing?
-I'm just sitting here.
-It's a lovely day, isn't it?
-Yes, it is. I love the weather this time of year.
-I wish it would rain a little bit.
-Me too.
-)";
-
-    std::mt19937 rng;
-
-    gpt_vocab vocab;
-    gpt2_model model;
-
-    int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
-
-    // sampling parameters
-    int32_t top_k = 5;
-    float   top_p = 0.9f;
-    float   temp  = 1.0f;
-};
-
-bool gpt2_model_load(const std::string &fname, gpt2_model &model, gpt_vocab &vocab, int n_ctx, int n_gpu_layers);
-struct gpt2_context *gpt2_init(const char *path_model);
+struct gpt2_context * gpt2_init(const char *path_model);
 void gpt2_free(struct gpt2_context * ctx);
 
 const char * gpt2_get_prompt(struct gpt2_context * ctx);
@@ -109,17 +15,11 @@ void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
 
 std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
 
-
-std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens, ggml_gallocr* allocr);
-struct ggml_cgraph *gpt2_graph(
-    const gpt2_model &model,
-    const int n_past,
-    const int n_tokens);
+std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
 
 bool gpt2_eval(
-    const gpt2_model &model,
-    ggml_gallocr_t allocr,
+    const gpt2_context * ctx,
     const int n_threads,
     const int n_past,
     const std::vector<gpt_vocab::id> &embd_inp,
-    std::vector<float> &embd_w);
\ No newline at end of file
+    std::vector<float> &embd_w);
diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp
index c116617..116a257 100644
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@@ -244,12 +244,6 @@ int main(int argc, char ** argv) {
     std::vector<float> pcmf32_prompt;
 
     gpt2_set_prompt(ctx_gpt, "");
-    ggml_gallocr_t allocr = NULL;
-    // allocate the compute buffer
-    {
-        // create a graph allocator with the backend's default buffer type
-        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ctx_gpt->model.backend));
-    }
 
     const int voice_id = rand()%6;
 
@@ -330,7 +324,7 @@ int main(int argc, char ** argv) {
 
                     std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
 
-                    text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens, allocr);
+                    text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
                     //text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
                     text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
 

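With this patch the allocator is owned by the context (created in gpt2_init, freed in gpt2_free), so callers no longer have to thread allocr through the API. A minimal usage sketch of the trimmed interface (the model path is just a placeholder):

    #include "gpt-2.h"

    #include <cstdio>
    #include <string>

    int main() {
        // hypothetical model path, for illustration only
        struct gpt2_context * ctx = gpt2_init("models/gpt-2-117M/ggml-model.bin");
        if (ctx == NULL) { // assuming gpt2_init returns NULL on failure
            return 1;
        }

        gpt2_set_prompt(ctx, "");

        // no allocr argument anymore; the context carries it internally
        std::string reply = gpt2_gen_text(ctx, "Hello, how are you?", 64);
        printf("%s\n", reply.c_str());

        gpt2_free(ctx);
        return 0;
    }
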
Nevertheless, I've been looking a bit at the talk example today, and I am considering removing it altogether, as I think it is completely superseded by the talk-llama example. In the interest of reducing maintenance effort, it seems better to remove it. Sorry that you had to invest time into this PR; I hope it was at least fun and you enjoyed it.
