Bring talk example up to date using the latest gpt-2 impl from ggml #2384
base: master
Conversation
Thanks for looking into this and sorry for the long delay. You can hide the internals with something like this:

diff --git a/examples/talk/gpt-2.cpp b/examples/talk/gpt-2.cpp
index f1638e8..f07b01e 100644
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@@ -1,10 +1,9 @@
+#include "gpt-2.h"
+
#include "ggml.h"
-#include "common-ggml.h"
#include "ggml-backend.h"
#include "ggml-alloc.h"
-#include "gpt-2.h"
-
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
@@ -13,6 +12,7 @@
#include "ggml-metal.h"
#endif
+#include <map>
#define GPT2_MAX_NODES 4096
@@ -23,10 +23,100 @@ static void ggml_log_callback_default(ggml_log_level level, const char * text, v
fflush(stderr);
}
+struct gpt2_layer {
+ // normalization
+ struct ggml_tensor * ln_1_g;
+ struct ggml_tensor * ln_1_b;
+
+ struct ggml_tensor * ln_2_g;
+ struct ggml_tensor * ln_2_b;
+
+ // attention
+ struct ggml_tensor * c_attn_attn_w;
+ struct ggml_tensor * c_attn_attn_b;
+
+ struct ggml_tensor * c_attn_proj_w;
+ struct ggml_tensor * c_attn_proj_b;
+
+ // mlp
+ struct ggml_tensor * c_mlp_fc_w;
+ struct ggml_tensor * c_mlp_fc_b;
+
+ struct ggml_tensor * c_mlp_proj_w;
+ struct ggml_tensor * c_mlp_proj_b;
+};
+
+constexpr int N_THREAD = 8;
+
+// default hparams (GPT-2 117M)
+struct gpt2_hparams {
+ int32_t n_vocab = 50257;
+ int32_t n_ctx = 1024;
+ int32_t n_embd = 768;
+ int32_t n_head = 12;
+ int32_t n_layer = 12;
+ int32_t ftype = 1;
+ float eps = 1e-5f;
+};
+
+struct gpt2_model {
+ gpt2_hparams hparams;
+
+ // normalization
+ struct ggml_tensor * ln_f_g;
+ struct ggml_tensor * ln_f_b;
+
+ struct ggml_tensor * wte; // token embedding
+ struct ggml_tensor * wpe; // position embedding
+ struct ggml_tensor * lm_head; // language model head
+
+ std::vector<gpt2_layer> layers;
+ // key + value memory
+ struct ggml_tensor * memory_k;
+ struct ggml_tensor * memory_v;
+
+ //
+ struct ggml_context * ctx_w;
+ struct ggml_context * ctx_kv;
+
+ ggml_backend* backend = NULL;
+
+ ggml_backend_buffer * buffer_w;
+ ggml_backend_buffer * buffer_kv;
+
+ std::map<std::string, struct ggml_tensor *> tensors;
+};
+
+
+struct gpt2_context {
+ std::string prompt_base = R"(Hello, how are you?
+I'm fine, thanks. How are you?
+Thanks, I'm fine too. What are you doing?
+I'm just sitting here.
+It's a lovely day, isn't it?
+Yes, it is. I love the weather this time of year.
+I wish it would rain a little bit.
+Me too.
+)";
+
+ std::mt19937 rng;
+
+ gpt_vocab vocab;
+ gpt2_model model;
+
+ ggml_gallocr_t allocr = NULL;
+
+ int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
+
+ // sampling parameters
+ int32_t top_k = 5;
+ float top_p = 0.9f;
+ float temp = 1.0f;
+};
// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_ctx = 2048, int n_gpu_layers = 0) {
+static bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_ctx = 2048, int n_gpu_layers = 0) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
@@ -386,12 +476,16 @@ struct gpt2_context * gpt2_init(const char * path_model) {
printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
}
+ // create a graph allocator with the backend's default buffer type
+ ctx->allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ctx->model.backend));
+
return ctx;
}
void gpt2_free(struct gpt2_context * ctx) {
+ ggml_gallocr_free(ctx->allocr);
delete ctx;
}
@@ -409,7 +503,7 @@ std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char *
// build the computation graph
-struct ggml_cgraph * gpt2_graph(
+static struct ggml_cgraph * gpt2_graph(
const gpt2_model & model,
const int n_past,
const int n_tokens) {
@@ -697,12 +791,14 @@ struct ggml_cgraph * gpt2_graph(
// - embd_w: the predicted logits for the next token
//
bool gpt2_eval(
- const gpt2_model & model,
- ggml_gallocr_t allocr,
+ const gpt2_context * ctx,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector<float> & embd_w) {
+ auto & model = ctx->model;
+ auto & allocr = ctx->allocr;
+
const int N = embd_inp.size();
const auto & hparams = model.hparams;
@@ -757,7 +853,7 @@ bool gpt2_eval(
}
-std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens, ggml_gallocr* allocr) {
+std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
int n_past = 0;
std::vector<float> embd_w;
@@ -774,7 +870,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens,
for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
if (!embd.empty()) {
- if (!gpt2_eval(ctx->model, allocr, ctx->n_threads, n_past, embd, embd_w)) {
+ if (!gpt2_eval(ctx, ctx->n_threads, n_past, embd, embd_w)) {
printf("gpt-2: failed to generate text\n");
return "";
}
diff --git a/examples/talk/gpt-2.h b/examples/talk/gpt-2.h
index dc5cbdb..8a8027c 100644
--- a/examples/talk/gpt-2.h
+++ b/examples/talk/gpt-2.h
@@ -5,103 +5,9 @@
#include "common.h"
#include <vector>
-#include <map>
#include <string>
-#include <regex>
-
-struct gpt2_layer {
- // normalization
- struct ggml_tensor * ln_1_g;
- struct ggml_tensor * ln_1_b;
-
- struct ggml_tensor * ln_2_g;
- struct ggml_tensor * ln_2_b;
-
- // attention
- struct ggml_tensor * c_attn_attn_w;
- struct ggml_tensor * c_attn_attn_b;
-
- struct ggml_tensor * c_attn_proj_w;
- struct ggml_tensor * c_attn_proj_b;
-
- // mlp
- struct ggml_tensor * c_mlp_fc_w;
- struct ggml_tensor * c_mlp_fc_b;
-
- struct ggml_tensor * c_mlp_proj_w;
- struct ggml_tensor * c_mlp_proj_b;
-};
-
-constexpr int N_THREAD = 8;
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
- int32_t n_vocab = 50257;
- int32_t n_ctx = 1024;
- int32_t n_embd = 768;
- int32_t n_head = 12;
- int32_t n_layer = 12;
- int32_t ftype = 1;
- float eps = 1e-5f;
-};
-
-struct gpt2_model {
- gpt2_hparams hparams;
-
- // normalization
- struct ggml_tensor * ln_f_g;
- struct ggml_tensor * ln_f_b;
-
- struct ggml_tensor * wte; // position embedding
- struct ggml_tensor * wpe; // token embedding
- struct ggml_tensor * lm_head; // language model head
-
- std::vector<gpt2_layer> layers;
-
- // key + value memory
- struct ggml_tensor * memory_k;
- struct ggml_tensor * memory_v;
-
- //
- struct ggml_context * ctx_w;
- struct ggml_context * ctx_kv;
-
- ggml_backend* backend = NULL;
-
- ggml_backend_buffer * buffer_w;
- ggml_backend_buffer * buffer_kv;
-
- std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-
-struct gpt2_context {
- std::string prompt_base = R"(Hello, how are you?
-I'm fine, thanks. How are you?
-Thanks, I'm fine too. What are you doing?
-I'm just sitting here.
-It's a lovely day, isn't it?
-Yes, it is. I love the weather this time of year.
-I wish it would rain a little bit.
-Me too.
-)";
-
- std::mt19937 rng;
-
- gpt_vocab vocab;
- gpt2_model model;
-
- int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
-
- // sampling parameters
- int32_t top_k = 5;
- float top_p = 0.9f;
- float temp = 1.0f;
-};
-
-bool gpt2_model_load(const std::string &fname, gpt2_model &model, gpt_vocab &vocab, int n_ctx, int n_gpu_layers);
-struct gpt2_context *gpt2_init(const char *path_model);
+struct gpt2_context * gpt2_init(const char *path_model);
void gpt2_free(struct gpt2_context * ctx);
const char * gpt2_get_prompt(struct gpt2_context * ctx);
@@ -109,17 +15,11 @@ void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
-
-std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens, ggml_gallocr* allocr);
-struct ggml_cgraph *gpt2_graph(
- const gpt2_model &model,
- const int n_past,
- const int n_tokens);
+std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
bool gpt2_eval(
- const gpt2_model &model,
- ggml_gallocr_t allocr,
+ const gpt2_context * ctx,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> &embd_inp,
- std::vector<float> &embd_w);
\ No newline at end of file
+ std::vector<float> &embd_w);
diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp
index c116617..116a257 100644
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@@ -244,12 +244,6 @@ int main(int argc, char ** argv) {
std::vector<float> pcmf32_prompt;
gpt2_set_prompt(ctx_gpt, "");
- ggml_gallocr_t allocr = NULL;
- // allocate the compute buffer
- {
- // create a graph allocator with the backend's default buffer type
- allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ctx_gpt->model.backend));
- }
const int voice_id = rand()%6;
@@ -330,7 +324,7 @@ int main(int argc, char ** argv) {
std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
- text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens, allocr);
+ text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
//text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
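For reference (not part of the patch itself), a rough sketch of what the caller side looks like once the allocator is owned by `gpt2_context`; the model path is made up, and the null-return check on `gpt2_init` is an assumption:

```cpp
// Hypothetical caller after the patch above: the ggml_gallocr_t is created in
// gpt2_init() and released in gpt2_free(), so the caller never touches it.
#include "gpt-2.h"

#include <cstdio>
#include <string>

int main() {
    // model path is illustrative only
    struct gpt2_context * ctx_gpt = gpt2_init("models/gpt-2-117M/ggml-model.bin");
    if (ctx_gpt == nullptr) {
        fprintf(stderr, "failed to initialize GPT-2 context\n");
        return 1;
    }

    gpt2_set_prompt(ctx_gpt, "");

    // note: no ggml_gallocr* argument, unlike the previous
    // gpt2_gen_text(ctx, text, max_tokens, allocr) signature
    std::string text_to_speak = gpt2_gen_text(ctx_gpt, "Hello, how are you?", 64);
    printf("%s\n", text_to_speak.c_str());

    gpt2_free(ctx_gpt);
    return 0;
}
```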
Nevertheless, I've been looking a bit at https://github.com/users/ggerganov/projects/7?pane=issue&itemId=51613670
My attempt at porting over to the latest gpt-2 implementation from ggml. I'm a ggml + whisper.cpp noob, but I wanted to get this to work, and it looks like it works now.

A few notes:

- Set `n_ctx` [=2048] and `n_gpu_layers` [=0] to the defaults mentioned in https://github.com/ggerganov/ggml/blob/master/examples/common.h#L19
- `chmod +x`ed `examples/talk/speak` to get this to work, though I'm not too sure that's good practice to check it in that way
- `gpt-2.h` declares a lot of internals right now; happy to clear that up pending feedback. A lot of it seems to be needed because `talk.cpp` has to know the structure of `gpt2_context`: it uses `allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(ctx_gpt->model.backend));`, so an incomplete forward declaration will not be enough (a generic sketch of this follows below)
- Runs on M1 MBP
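To illustrate the incomplete-type point with a self-contained sketch (identifiers here are made up, not the talk example's API): a forward declaration is enough to pass a pointer around, but any member access like `ctx_gpt->model.backend` requires the complete definition, which is why the allocator creation has to move behind the API before `gpt2_context` can stay opaque.

```cpp
// Generic opaque-handle sketch; names are illustrative, not whisper.cpp code.
#include <cstdio>

// --- what a public header would expose ---
struct handle;                                 // incomplete type: pointers only
struct handle * handle_init();                 // internals (e.g. the allocator) created here
int             handle_generate(struct handle * h);
void            handle_free(struct handle * h);

// --- what stays hidden in the .cpp ---
struct handle {
    int allocr_state;                          // stands in for ggml_gallocr_t, backend, ...
};

struct handle * handle_init() { return new handle{42}; }
int handle_generate(struct handle * h) { return h->allocr_state; }  // needs the full type
void handle_free(struct handle * h) { delete h; }

int main() {
    struct handle * h = handle_init();
    // h->allocr_state;                        // would not compile against the header alone
    printf("%d\n", handle_generate(h));        // member access stays behind the API
    handle_free(h);
    return 0;
}
```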