diff --git a/demo/cli_demo.cpp b/demo/cli_demo.cpp index 450c1313..eb51a249 100644 --- a/demo/cli_demo.cpp +++ b/demo/cli_demo.cpp @@ -25,14 +25,12 @@ void benchmark(Llm* llm, std::string prompt_file) { int decode_len = 0; int64_t prefill_time = 0; int64_t decode_time = 0; - llm->warmup(); for (int i = 0; i < prompts.size(); i++) { llm->response(prompts[i]); prompt_len += llm->prompt_len_; decode_len += llm->gen_seq_len_; prefill_time += llm->prefill_us_; decode_time += llm->decode_us_; - llm->reset(); } float prefill_s = prefill_time / 1e6; float decode_s = decode_time / 1e6; diff --git a/demo/tokenizer_demo.cpp b/demo/tokenizer_demo.cpp index 6b13d466..56ae1601 100644 --- a/demo/tokenizer_demo.cpp +++ b/demo/tokenizer_demo.cpp @@ -13,20 +13,20 @@ int main(int argc, const char* argv[]) { return 0; } std::string tokenizer_path = argv[1]; - std::unique_ptr tokenizer_(new Tiktoken); - tokenizer_->load(tokenizer_path); + std::unique_ptr tokenizer(Tokenizer::createTokenizer(tokenizer_path)); const std::string system_str = "Youare a helpful assistant."; const std::string user_str = "Hello"; // const std::string query = "\n<|im_start|>system\n" + system_str + "<|im_end|>\n<|im_start|>\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n"; const std::string query = "\n<|im_start|>user\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n"; + // const std::string query = "<|im_start|>user\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n"; // const std::string query = system_str + "\n" + user_str; - auto tokens = tokenizer_->encode(query); + auto tokens = tokenizer->encode(query); std::string decode_str; printf("encode tokens = [ "); for (auto token : tokens) { printf("%d, ", token); - decode_str += tokenizer_->decode(token); + decode_str += tokenizer->decode(token); } printf("]\n"); printf("decode str = %s\n", decode_str.c_str()); diff --git a/include/llm.hpp b/include/llm.hpp index 95b4b59e..412d5893 100644 --- a/include/llm.hpp +++ b/include/llm.hpp @@ -81,10 +81,6 @@ class LlmConfig { return config_.value("model_type", "unknow"); } - std::string tokenizer_type() const { - return config_.value("tokenizer_type", "tiktoken"); - } - std::string llm_model() const { return base_dir_ + config_.value("llm_model", "llm.mnn"); } @@ -93,6 +89,18 @@ class LlmConfig { return base_dir_ + config_.value("llm_weight", "llm.mnn.weight"); } + std::string block_model(int index) const { + return base_dir_ + config_.value("block_model", "block_") + std::to_string(index) + ".mnn"; + } + + std::string lm_model() const { + return base_dir_ + config_.value("lm_model", "lm.mnn"); + } + + std::string embedding_model() const { + return base_dir_ + config_.value("embedding_model", "embedding.mnn"); + } + std::string embedding_file() const { return base_dir_ + config_.value("embedding_file", "embeddings_bf16.bin"); } @@ -101,16 +109,28 @@ class LlmConfig { return base_dir_ + config_.value("tokenizer_file", "tokenizer.txt"); } + bool is_single() const { + return config_.value("is_single", true); + } + + int max_new_tokens() const { + return config_.value("max_new_tokens", 512); + } + int hidden_size() const { return config_.value("hidden_size", 4096); } + int layer_nums() const { + return config_.value("layer_nums", 32); + } + std::vector key_value_shape() const { return config_.value("key_value_shape", std::vector{}); } - std::vector stop_ids() const { - return config_.value("stop_ids", std::vector{}); + std::string attention_mask() const { + return config_.value("attention_mask", "int"); } std::string prompt_template() 
const { @@ -139,10 +159,7 @@ class LlmConfig { class Llm { public: - Llm() { - // default tokenier is senrencepiece - tokenizer_.reset(new Sentencepiece); - } + Llm(std::shared_ptr config) : config_(config) {} virtual ~Llm() { modules_.clear(); visual_module_.reset(); @@ -151,26 +168,25 @@ class Llm { static Llm* createLLM(const std::string& path, std::string model_type = "auto"); void load(); void chat(); - void warmup(); + int forward(const std::vector& input_ids); + std::string apply_chat_template(const std::string& input_str) const; std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); - std::string response_nohistory(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); void generate_init(); std::string generate(const std::vector& input_ids, std::ostream* os, const char* end_with); - std::vector generate(const std::vector& input_ids); - int forward(const std::vector& input_ids); - float load_progress() { return load_progress_; } - void reset(); + std::vector generate(const std::vector& input_ids, int max_new_tokens = -1); void print_speed(); friend class Pipeline; public: - std::vector history_; + // TODO std::string model_name_ = ""; + bool is_single_ = true; + bool is_disk_embedding_ = true; + bool is_visual_ = false; + int layer_nums_ = 0; + int hidden_size_ = 4096; // config int max_new_tokens_ = 1024; int backend_type_ = 0; - int thread_num_ = 4; - bool low_precision_ = true; - bool chatml_ = true; // forward info int prompt_len_ = 0; int gen_seq_len_ = 0; @@ -178,41 +194,27 @@ class Llm { // time int64_t prefill_us_ = 0; int64_t decode_us_ = 0; - LlmConfig config_; + std::shared_ptr config_; + std::unique_ptr tokenizer_; protected: VARP embedding(const std::vector& input_ids); VARP txt_embedding(const std::vector& input_ids); - std::vector tokenizer_encode(const std::string& input_str); std::string decode(int id); protected: - VARP inputs_embeds_, attention_mask_, position_ids_; - // model configs - bool is_single_ = true; - bool is_disk_embedding_ = true; - bool is_visual_ = false; - int layer_nums_ = 0; - int hidden_size_ = 4096; std::vector key_value_shape_ = {}; - // gen info - float load_progress_ = 0.f; - // tokenizer - std::unique_ptr tokenizer_; + VARP inputs_embeds_, attention_mask_, position_ids_; std::shared_ptr visual_module_; + std::shared_ptr runtime_manager_; + std::vector> modules_; + std::vector past_key_values_; private: virtual VARP visual_embedding(const std::vector& input_ids) { return nullptr; } - virtual std::vector tokenizer(const std::string& query); virtual VARP gen_attention_mask(int seq_len); virtual VARP gen_position_ids(int seq_len); virtual bool is_stop(int token_id); -private: - // MNN Modules - std::shared_ptr runtime_manager_; - std::vector> modules_; - std::vector past_key_values_; - // model dir - std::string model_dir_; }; +#if 0 // some llm models class Chatglm_6b : public Llm { public: @@ -222,27 +224,12 @@ class Chatglm_6b : public Llm { key_value_shape_ = {2, 0, 1, 32, 128}; } private: - virtual std::vector tokenizer(const std::string& query) override; virtual VARP gen_attention_mask(int seq_len) override; virtual VARP gen_position_ids(int seq_len) override; virtual bool is_stop(int token_id) override; int context_len_ = 0; }; - -class Chatglm2_6b : public Llm { -public: - Chatglm2_6b() { - model_name_ = "Chatglm2_6b"; - layer_nums_ = 28; - key_value_shape_ = {2, 0, 1, 2, 128}; - } -private: - virtual std::vector tokenizer(const 
std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; -}; - +/* class Phi_2 : public Chatglm2_6b { public: Phi_2() { @@ -256,24 +243,9 @@ class Phi_2 : public Chatglm2_6b { virtual std::vector tokenizer(const std::string& query) override; virtual bool is_stop(int token_id) override; }; +*/ -class Qwen_7b : public Llm { -public: - Qwen_7b() { - model_name_ = "Qwen_7b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 0, 32, 128}; - hidden_size_ = 4096; - tokenizer_.reset(new Tiktoken); - } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; -}; - -class Qwen_vl : public Qwen_7b { +class Qwen_vl : public Llm { public: Qwen_vl() { model_name_ = "Qwen_vl"; @@ -292,21 +264,9 @@ class Qwen_vl : public Qwen_7b { private: std::vector url_encode(const std::string& url); virtual VARP visual_embedding(const std::vector& input_ids) override; - virtual std::vector tokenizer(const std::string& query) override; virtual VARP gen_attention_mask(int seq_len) override; }; -class Qwen_1_8b : public Qwen_7b { -public: - Qwen_1_8b() { - model_name_ = "Qwen_1.8b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 0, 16, 128}; - hidden_size_ = 2048; - tokenizer_.reset(new Tiktoken); - } -}; - class Llama2_7b : public Llm { public: Llama2_7b() { @@ -315,74 +275,11 @@ class Llama2_7b : public Llm { key_value_shape_ = {2, 1, 32, 0, 128}; } private: - virtual std::vector tokenizer(const std::string& query) override; virtual VARP gen_attention_mask(int seq_len) override; virtual VARP gen_position_ids(int seq_len) override; virtual bool is_stop(int token_id) override; }; -class Qwen2 : public Llama2_7b { -public: - Qwen2() { - model_name_ = "Qwen2"; - tokenizer_.reset(new HuggingfaceTokenizer); - } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool is_stop(int token_id) override; -}; - -class Qwen2_0_5b : public Qwen2 { -public: - Qwen2_0_5b() { - model_name_ = "Qwen2_0.5b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 16, 0, 64}; - hidden_size_ = 1024; - } -}; - -class Qwen2_1_8b : public Qwen2 { -public: - Qwen2_1_8b() { - model_name_ = "Qwen2_1.8b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 16, 0, 128}; - hidden_size_ = 2048; - } -}; - -class Qwen2_4b : public Qwen2 { -public: - Qwen2_4b() { - model_name_ = "Qwen2_4b"; - layer_nums_ = 40; - key_value_shape_ = {2, 1, 20, 0, 128}; - hidden_size_ = 2560; - } -}; - -class Qwen2_7b : public Qwen2 { -public: - Qwen2_7b() { - model_name_ = "Qwen2_7b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 32, 0, 128}; - hidden_size_ = 4096; - } -}; - -class TinyLlama : public Llama2_7b { -public: - TinyLlama() { - model_name_ = "TinyLlama"; - layer_nums_ = 22; - key_value_shape_ = {2, 1, 4, 0, 64}; - } -private: - virtual std::vector tokenizer(const std::string& query) override; -}; - class Yi_6b : public Llama2_7b { public: Yi_6b() { @@ -390,22 +287,9 @@ class Yi_6b : public Llama2_7b { key_value_shape_ = {2, 1, 4, 0, 128}; } private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool is_stop(int token_id) override; -}; - -class Llama3_8b : public Llama2_7b { -public: - Llama3_8b() { - model_name_ = "Llama3_8b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 8, 0, 128}; - hidden_size_ 
= 4096; - } -private: - virtual std::vector tokenizer(const std::string& query) override; virtual bool is_stop(int token_id) override; }; +#endif // Llm end // Embedding start @@ -429,8 +313,6 @@ class Embedding { // time int64_t embedding_us_ = 0; int prompt_len_ = 0; -protected: - std::vector tokenizer_encode(const std::string& input_str); protected: // model configs int layer_nums_ = 0; diff --git a/include/tokenizer.hpp b/include/tokenizer.hpp index cb81cff2..16d10861 100644 --- a/include/tokenizer.hpp +++ b/include/tokenizer.hpp @@ -17,20 +17,35 @@ class Tokenizer { public: + static constexpr int MAGIC_NUMBER = 430; + enum TokenizerType { + SENTENCEPIECE = 0, + TIKTOIKEN = 1, + BERT = 2, + HUGGINGFACE = 3 + }; Tokenizer() = default; virtual ~Tokenizer() = default; - static Tokenizer* createTokenizer(const std::string& type); - virtual bool load(const std::string& filename) = 0; - virtual std::vector encode(const std::string& str) = 0; + static Tokenizer* createTokenizer(const std::string& filename); + bool is_stop(int token); + std::vector encode(const std::string& str); virtual std::string decode(int id) = 0; +protected: + virtual void load_special(std::ifstream& file); + virtual bool load_vocab(std::ifstream& file) = 0; + virtual void encode(const std::string& str, std::vector& ids) = 0; + std::vector special_tokens_; + std::vector stop_tokens_; + std::vector prefix_tokens_; }; class Sentencepiece : public Tokenizer { public: Sentencepiece() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; +protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; private: enum ModelType { UNIGRAM = 1, @@ -77,10 +92,10 @@ class Sentencepiece : public Tokenizer { class Tiktoken : public Tokenizer { public: Tiktoken() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; std::unordered_map encoder_; std::vector decoder_; }; @@ -88,7 +103,8 @@ class Tiktoken : public Tokenizer { class BertTokenizer : public Tiktoken { public: BertTokenizer() = default; - virtual std::vector encode(const std::string& str) override; +protected: + virtual void encode(const std::string& str, std::vector& ids) override; private: std::vector word_piece(const std::string& token); }; @@ -105,9 +121,10 @@ struct hash_pair_wstring { using BPERanks = std::unordered_map, int, hash_pair_wstring>; public: HuggingfaceTokenizer() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; +protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; private: void bpe(const std::wstring& token, const BPERanks& bpe_ranks, std::vector* result); BPERanks bpe_ranks_; diff --git a/src/llm.cpp b/src/llm.cpp index e3cfc31c..6d19bcbf 100644 --- a/src/llm.cpp +++ b/src/llm.cpp @@ -23,53 +23,116 @@ // Llm start Llm* Llm::createLLM(const std::string& model_dir, std::string model_type) { - Llm* llm = new Llm; - llm->config_ = LlmConfig(model_dir); - 
llm->tokenizer_.reset(Tokenizer::createTokenizer(llm->config_.tokenizer_type())); - // llm->load(); - llm->key_value_shape_ = llm->config_.key_value_shape(); - llm->layer_nums_ = 24; + std::shared_ptr config(new LlmConfig(model_dir)); + Llm* llm = new Llm(config); return llm; } void Llm::load() { + // init module status + key_value_shape_ = config_->key_value_shape(); + layer_nums_ = config_->layer_nums(); + is_single_ = config_->is_single(); + { + std::ifstream embedding_bin(config_->embedding_file()); + is_disk_embedding_ = embedding_bin.good(); + embedding_bin.close(); + } + MNN_PRINT("### is_single_ = %d, is_disk_embedding_ = %d\n", is_single_, is_disk_embedding_); // init runtime ScheduleConfig config; BackendConfig cpuBackendConfig; config.type = static_cast(backend_type_);; - config.numThread = config_.thread_num(); + config.numThread = config_->thread_num(); cpuBackendConfig.precision = BackendConfig::Precision_Low; cpuBackendConfig.memory = BackendConfig::Memory_Low; config.backendConfig = &cpuBackendConfig; runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0); - load_progress_ = 0.f; - printf("load tokenizer\n"); // 1. load vocab - tokenizer_->load(config_.tokenizer_file()); - printf("load tokenizer Done\n"); - { - std::ifstream embedding_bin(config_.embedding_file()); - is_disk_embedding_ = embedding_bin.good(); - MNN_PRINT("### disk embedding is %d\n", is_disk_embedding_); - embedding_bin.close(); - } - // 2. load model + MNN_PRINT("load tokenizer\n"); + tokenizer_.reset(Tokenizer::createTokenizer(config_->tokenizer_file())); + MNN_PRINT("load tokenizer Done\n"); + // 3. load model Module::Config module_config; module_config.shapeMutable = true; module_config.rearrange = true; + if (is_single_) { + // load single model + key_value_shape_.insert(key_value_shape_.begin(), layer_nums_); + modules_.resize(1); + std::string model_path = config_->llm_model(); + MNN_PRINT("load %s ... ", model_path.c_str()); + runtime_manager_->setExternalFile(config_->llm_weight()); + modules_[0].reset(Module::load( + {"input_ids", "attention_mask", "position_ids", "past_key_values"}, + {"token_id", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); + MNN_PRINT("Done!\n"); + } else { + // load split models + modules_.resize(layer_nums_ + 2); + // load lm model + modules_[layer_nums_].reset(Module::load({}, {}, config_->lm_model().c_str(), runtime_manager_, &module_config)); + if (!is_disk_embedding_) { + modules_[layer_nums_ + 1].reset(Module::load({}, {}, config_->embedding_model().c_str(), runtime_manager_, &module_config)); + } + // load block models + for (int i = 0; i < layer_nums_; i++) { + std::string model_path = config_->block_model(i); + MNN_PRINT("load %s ... ", model_path.c_str()); + modules_[i].reset(Module::load( + {"inputs_embeds", "attention_mask", "position_ids", "past_key_values"}, + {"hidden_states", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); + MNN_PRINT("Done!\n"); + } + } +} - key_value_shape_.insert(key_value_shape_.begin(), layer_nums_); - modules_.resize(1); - std::string model_path = config_.llm_model(); - std::string external_path = config_.llm_weight(); - MNN_PRINT("load %s ... 
", model_path.c_str()); - runtime_manager_->setExternalFile(external_path); - modules_[0].reset(Module::load( - {"input_ids", "attention_mask", "position_ids", "past_key_values"}, - {"token_id", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += 90.f; +int Llm::forward(const std::vector& input_ids) { + int seq_len = input_ids.size(); + auto attention_mask = gen_attention_mask(seq_len); + auto position_ids = gen_position_ids(seq_len); + int id = -1; + if (is_single_) { + // single model + auto hidden_states = _Const(input_ids.data(), {seq_len}, NCHW, halide_type_of()); + if (is_disk_embedding_) { + hidden_states = embedding(input_ids); + } + auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]}); + ExecutorScope::Current()->gc(Executor::FULL); + id = outputs[0]->readMap()[0]; + past_key_values_[0] = outputs[1]; + } else { + // split block models + auto hidden_states = embedding(input_ids); + ExecutorScope::Current()->gc(Executor::FULL); + for (int i = 0; i < layer_nums_; i++) { + AUTOTIME; + auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]}); + hidden_states = outputs[0]; + past_key_values_[i] = outputs[1]; + } + ExecutorScope::Current()->gc(Executor::FULL); + { + AUTOTIME; + auto outputs = modules_[layer_nums_]->onForward({hidden_states}); + id = outputs[0]->readMap()[0]; + } + } + all_seq_len_ += seq_len; + gen_seq_len_++; + return id; +} + +std::string Llm::apply_chat_template(const std::string& input_str) const { + auto prompt = config_->prompt_template(); + if (prompt.empty()) return input_str; + const std::string placeholder = "%s"; + size_t start_pos = prompt.find(placeholder); + if (start_pos == std::string::npos) return input_str; + prompt.replace(start_pos, placeholder.length(), input_str); + return prompt; } void Llm::chat() { @@ -81,7 +144,7 @@ void Llm::chat() { break; } if (input_str == "/reset") { - reset(); + // reset(); std::cout << "\nA: reset done." 
<< std::endl; continue; } @@ -89,7 +152,6 @@ void Llm::chat() { response(input_str); std::cout << std::endl; } - reset(); } void Llm::generate_init() { @@ -108,19 +170,18 @@ void Llm::generate_init() { } } -std::vector Llm::generate(const std::vector& input_ids) { +std::vector Llm::generate(const std::vector& input_ids, int max_new_tokens) { generate_init(); std::vector output_ids; prompt_len_ = static_cast(input_ids.size()); + if (max_new_tokens < 0) { max_new_tokens = config_->max_new_tokens(); } // prefill int token = forward(input_ids); output_ids.push_back(token); // decode - while (gen_seq_len_ < max_new_tokens_) { + while (gen_seq_len_ < max_new_tokens) { token = forward({token}); - if (is_stop(token)) { - break; - } + if (is_stop(token)) { break; } output_ids.push_back(token); } return output_ids; @@ -131,11 +192,10 @@ std::string Llm::generate(const std::vector& input_ids, std::ostream* os, c auto st = std::chrono::system_clock::now(); int token = forward(input_ids); auto et = std::chrono::system_clock::now(); - history_.push_back(token); std::string output_str = decode(token); prefill_us_ = std::chrono::duration_cast(et - st).count(); *os << output_str << std::flush; - while (gen_seq_len_ < max_new_tokens_) { + while (gen_seq_len_ < config_->max_new_tokens()) { st = std::chrono::system_clock::now(); token = forward({token}); et = std::chrono::system_clock::now(); @@ -144,7 +204,6 @@ std::string Llm::generate(const std::vector& input_ids, std::ostream* os, c *os << end_with << std::flush; break; } - history_.push_back(token); auto word = decode(token); *os << word << std::flush; output_str += word; @@ -157,28 +216,9 @@ std::string Llm::generate(const std::vector& input_ids, std::ostream* os, c std::string Llm::response(const std::string& query, std::ostream* os, const char* end_with) { generate_init(); - if (!end_with) { - end_with = "\n"; - } - // response - auto input_ids = chatml_ ? tokenizer(query) : tokenizer_encode(query); - // printf("ids = "); for (int id : input_ids) printf("%d, ", id); printf("\n"); - if (!history_.empty()) { - std::copy(input_ids.begin(), input_ids.end(), std::back_inserter(history_)); - input_ids = history_; - } else { - history_ = input_ids; - } - return generate(input_ids, os, end_with); -} - -std::string Llm::response_nohistory(const std::string& query, std::ostream* os, const char* end_with) { - generate_init(); - if (!end_with) { - end_with = "\n"; - } - // response - auto input_ids = chatml_ ? tokenizer(query) : tokenizer_encode(query); + if (!end_with) { end_with = "\n"; } + auto prompt = apply_chat_template(query); + auto input_ids = tokenizer_->encode(prompt); return generate(input_ids, os, end_with); } @@ -200,64 +240,6 @@ void Llm::print_speed() { printf("##################################\n"); } -void Llm::reset() { - history_.clear(); -} - -void Llm::warmup() { - // warmup - MNN_PRINT("### warmup ... 
"); - if (is_single_) { - past_key_values_.push_back(_Input(key_value_shape_, NCHW)); - } else { - for (int i = 0; i < layer_nums_; i++) { - past_key_values_.push_back(_Input(key_value_shape_, NCHW)); - } - } - std::vector tmp(1, 0); - forward(tmp); - all_seq_len_ = 0; - gen_seq_len_ = 0; - printf("Done\n"); -} - -int Llm::forward(const std::vector& input_ids) { - int seq_len = input_ids.size(); - auto attention_mask = gen_attention_mask(seq_len); - auto position_ids = gen_position_ids(seq_len); - int id = -1; - if (is_single_) { - // single model - auto hidden_states = _Const(input_ids.data(), {seq_len}, NCHW, halide_type_of()); - if (is_disk_embedding_) { - hidden_states = embedding(input_ids); - } - auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]}); - ExecutorScope::Current()->gc(Executor::FULL); - id = outputs[0]->readMap()[0]; - past_key_values_[0] = outputs[1]; - } else { - // split block models - auto hidden_states = embedding(input_ids); - ExecutorScope::Current()->gc(Executor::FULL); - for (int i = 0; i < layer_nums_; i++) { - AUTOTIME; - auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]}); - hidden_states = outputs[0]; - past_key_values_[i] = outputs[1]; - } - ExecutorScope::Current()->gc(Executor::FULL); - { - AUTOTIME; - auto outputs = modules_[layer_nums_]->onForward({hidden_states}); - id = outputs[0]->readMap()[0]; - } - } - all_seq_len_ += seq_len; - gen_seq_len_++; - return id; -} - static inline bool needNewVar(VARP var, int axis, int seq_len) { if (var == nullptr) { return true; @@ -277,14 +259,14 @@ VARP Llm::txt_embedding(const std::vector& input_ids) { } AUTOTIME; // disk embedding to save memory - int hidden_size = config_.hidden_size(); + int hidden_size = config_->hidden_size(); int seq_len = static_cast(input_ids.size()); if (needNewVar(inputs_embeds_, 0, seq_len)) { inputs_embeds_ = _Input({seq_len, 1, hidden_size}, NCHW); } size_t size = hidden_size * sizeof(int16_t); - FILE* file = fopen(config_.embedding_file().c_str(), "rb"); + FILE* file = fopen(config_->embedding_file().c_str(), "rb"); std::unique_ptr buffer(new int16_t[hidden_size]); for (size_t i = 0; i < seq_len; i++) { fseek(file, input_ids[i] * size, SEEK_SET); @@ -306,11 +288,6 @@ VARP Llm::embedding(const std::vector& input_ids) { return txt_embedding(input_ids); } -std::vector Llm::tokenizer_encode(const std::string& input_str) { - auto ids = tokenizer_->encode(input_str); - return ids; -} - std::string Llm::decode(int id) { std::string word = tokenizer_->decode(id); // Fix utf-8 garbled characters @@ -321,37 +298,45 @@ std::string Llm::decode(int id) { return word; } -// Llm -std::vector Llm::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {198, 151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - VARP Llm::gen_attention_mask(int seq_len) { - if (needNewVar(attention_mask_, 2, seq_len)) { - attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - } else { + if (config_->attention_mask() == "float") { + if (needNewVar(attention_mask_, 2, seq_len)) { + attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); + } else { + return attention_mask_; + } + auto ptr = attention_mask_->writeMap(); + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < seq_len; j++) 
{ + ptr[seq_len * i + j] = (j > i) * std::numeric_limits::lowest(); + } + } return attention_mask_; - } - auto ptr = attention_mask_->writeMap(); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = j <= i; + } else { + bool is_glm = config_->attention_mask() == "glm"; + if (needNewVar(attention_mask_, 2, seq_len)) { + attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); + } else { + return attention_mask_; } + auto ptr = attention_mask_->writeMap(); + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < seq_len; j++) { + ptr[seq_len * i + j] = is_glm ? j > i : j <= i; + } + } + return attention_mask_; } - return attention_mask_; } VARP Llm::gen_position_ids(int seq_len) { - if (needNewVar(position_ids_, 0, seq_len) || 0) { + bool is_glm = config_->attention_mask() == "glm"; + if (needNewVar(position_ids_, 0, seq_len)) { position_ids_ = _Input({seq_len}, NCHW, halide_type_of()); } auto ptr = position_ids_->writeMap(); if (seq_len == 1) { - ptr[0] = all_seq_len_; + ptr[0] = is_glm ? gen_seq_len_ : all_seq_len_; } else { for (int i = 0; i < seq_len; i++) { ptr[i] = i; @@ -361,8 +346,7 @@ VARP Llm::gen_position_ids(int seq_len) { } bool Llm::is_stop(int token_id) { - // <|endoftext|> <|im_end|> - return token_id == 151643 || token_id == 151645; + return tokenizer_->is_stop(token_id); } #if 0 @@ -409,49 +393,6 @@ bool Chatglm_6b::is_stop(int token_id) { return token_id == 130005; } -// Chatglm2_6b -std::vector Chatglm2_6b::tokenizer(const std::string& query) { - auto prompt = "问:" + query + "\n答:"; - auto ids = tokenizer_encode(prompt); - if (history_.empty()) { - ids.insert(ids.begin(), 64792); - ids.insert(ids.begin(), 64790); - } - return ids; -} - -VARP Chatglm2_6b::gen_attention_mask(int seq_len) { - auto attention_mask = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - auto ptr = attention_mask->writeMap(); - if (seq_len > 1) { - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = j > i; - } - } - } else { - ptr[0] = 0; - } - return attention_mask; -} - -VARP Chatglm2_6b::gen_position_ids(int seq_len) { - auto position_ids = _Input({seq_len}, NCHW, halide_type_of()); - auto ptr = position_ids->writeMap(); - if (seq_len == 1) { - ptr[0] = gen_seq_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; - } - } - return position_ids; -} - -bool Chatglm2_6b::is_stop(int token_id) { - return token_id <= 2; -} - // Phi_2 std::vector Phi_2::tokenizer(const std::string& query) { auto prompt = query; @@ -463,50 +404,6 @@ bool Phi_2::is_stop(int token_id) { return token_id == 50256; } -// Qwen_7b -std::vector Qwen_7b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {198, 151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - -VARP Qwen_7b::gen_attention_mask(int seq_len) { - if (needNewVar(attention_mask_, 2, seq_len)) { - attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - } else { - return attention_mask_; - } - auto ptr = attention_mask_->writeMap(); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = j <= i; - } - } - return attention_mask_; -} - -VARP Qwen_7b::gen_position_ids(int seq_len) { - if (needNewVar(position_ids_, 0, seq_len) || 0) { - position_ids_ = _Input({seq_len}, NCHW, 
halide_type_of()); - } - auto ptr = position_ids_->writeMap(); - if (seq_len == 1) { - ptr[0] = all_seq_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; - } - } - return position_ids_; -} - -bool Qwen_7b::is_stop(int token_id) { - // <|endoftext|> <|im_end|> - return token_id == 151643 || token_id == 151645; -} - // Qwen_vl std::vector Qwen_vl::url_encode(const std::string& url) { std::vector ascii_values(imgpad_len_, img_pad_); @@ -655,7 +552,7 @@ std::vector Llama2_7b::tokenizer(const std::string& query) { ids.insert(ids.end(), {185, 185, 77398, 25}); return ids; } - // llama2: [INST]{query}[/INST]: 1, 5539, 25580, 29962, query, 12452, 25580, 29962 + // llama2: [INST]{query}[/INST]: 1, 5539, 25580, 29962, query, 12452, 25580, 29962 ids.insert(ids.begin(), {1, 5539, 25580, 29962}); ids.insert(ids.end(), {12452, 25580, 29962}); return ids; @@ -705,34 +602,6 @@ bool Llama2_7b::is_stop(int token_id) { return token_id == 2; } -std::vector Qwen2::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - -bool Qwen2::is_stop(int token_id) { - return token_id == 151645 || token_id == 151643; -} - -std::vector TinyLlama::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - /* - <|system|> - You are a friendly chatbot who always responds in the style of a pirate - <|user|> - {query} - <|assistant|> - */ - ids.insert(ids.begin(), {1, 529, 29989, 5205, 29989, 29958, 13, 3492, 526, 263, 19780, 13563, - 7451, 1058, 2337, 10049, 29879, 297, 278, 3114, 310, 263, 21625, - 403, 2, 29871, 13, 29966, 29989, 1792, 29989, 29958, 13}); - ids.insert(ids.end(), {2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13}); - return ids; -} - std::vector Yi_6b::tokenizer(const std::string& query) { auto prompt = "<|im_start|> user\n" + query + "<|im_end|>\n<|im_start|> assistant\n"; auto ids = tokenizer_encode(prompt); @@ -742,18 +611,6 @@ std::vector Yi_6b::tokenizer(const std::string& query) { bool Yi_6b::is_stop(int token_id) { return token_id == 7 || token_id == 64001; } - -std::vector Llama3_8b::tokenizer(const std::string& query) { - // <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n+query+<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n - auto ids = tokenizer_encode(query); - ids.insert(ids.begin(), {128000, 128006, 882, 128007, 271}); - ids.insert(ids.end(), {128009, 128006, 78191, 128007, 271}); - return ids; -} - -bool Llama3_8b::is_stop(int token_id) { - return token_id == 128001 || token_id == 128009; -} #endif // Llm end @@ -784,10 +641,6 @@ Embedding* Embedding::createEmbedding(const std::string& path, std::string model } void Embedding::load(const std::string& model_dir) { - if (model_dir_ == model_dir) { - return; - } - model_dir_ = model_dir; // init ScheduleConfig config; BackendConfig cpuBackendConfig; @@ -803,7 +656,6 @@ void Embedding::load(const std::string& model_dir) { size_t pos = model_dir.find_last_of("/\\"); std::string dir_path = (pos != std::string::npos) ? model_dir.substr(0, pos + 1) : ""; std::string tokenizer_path = dir_path + "/tokenizer.txt"; - tokenizer_->load(tokenizer_path); printf("load tokenizer Done\n"); // 2. 
load model Module::Config module_config; @@ -841,17 +693,12 @@ void Embedding::print_speed() { printf("##################################\n"); } -std::vector Embedding::tokenizer_encode(const std::string& input_str) { - auto ids = tokenizer_->encode(input_str); - return ids; -} - std::vector Bge::tokenizer(const std::string& query) { auto prompt = query; if (query.size() <= 256) { prompt = "为这个句子生成表示以用于检索相关文章:" + query; } - auto ids = tokenizer_encode(prompt); + auto ids = tokenizer_->encode(prompt); ids.insert(ids.begin(), 101); ids.push_back(102); return ids; @@ -1191,12 +1038,12 @@ void ChatMemory::summarize(std::shared_ptr llm) { auto chat_str = content.dump(); if (!summary.contains(date)) { auto summary_prompt = "请总结以下的对话内容,尽可能精炼,提取对话的主题和关键信息。如果有多个关键事件,可以分点总结。对话内容:\n" + chat_str + "\n总结:"; - auto sum = llm->response_nohistory(summary_prompt); + auto sum = llm->response(summary_prompt); summary[date] = sum; } if (!personality.contains(date)) { auto personality_prompt = "请根据以下的对话推测总结" + user + "的性格特点和心情,并根据你的推测制定回复策略。对话内容:\n" + chat_str + "\n总结:"; - auto pers = llm->response_nohistory(personality_prompt); + auto pers = llm->response(personality_prompt); personality[date] = pers; } } @@ -1262,7 +1109,7 @@ void Pipeline::invoke(const std::string& str) { auto prompt = build_prompt(str); std::cout << prompt; if (llm_) { - auto res = llm_->response_nohistory(prompt); + auto res = llm_->response(prompt); Prompt assistant_prompt {PROMPT_TYPE::ASSISTANT, res, {}}; prompts_.emplace_back(std::move(assistant_prompt)); } diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 641eaa2e..72e80c2f 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -78,34 +78,128 @@ static inline void to_lower_case(std::string& str) { } } -Tokenizer* Tokenizer::createTokenizer(const std::string& type) { - if (type == "sentencepiece") { - return new Sentencepiece(); +Tokenizer* Tokenizer::createTokenizer(const std::string& filename) { + Tokenizer* tokenizer = nullptr; + // check file + std::ifstream tok_file(filename); + if (!tok_file.good()) { + printf("Failed: can't load tokenzier from: %s.\n", filename.c_str()); + return tokenizer; } - if (type == "tiktoken") { - return new Tiktoken(); + // check tokenizer info + std::string line; + std::getline(tok_file, line); + std::istringstream line_str(line); + int magic_number, tokenizer_type; + line_str >> magic_number; + if (magic_number != MAGIC_NUMBER) { + printf("Failed: magic number is wrong from: %s.\n", filename.c_str()); + return tokenizer; + } + line_str >> tokenizer_type; + printf("tokenizer_type = %d\n", tokenizer_type); + // create tokenizer + switch (tokenizer_type) + { + case SENTENCEPIECE: + tokenizer = new Sentencepiece(); + break; + case TIKTOIKEN: + tokenizer = new Tiktoken(); + break; + case BERT: + tokenizer = new BertTokenizer(); + break; + case HUGGINGFACE: + tokenizer = new HuggingfaceTokenizer(); + break; + default: + return tokenizer; } - if (type == "bert") { - return new BertTokenizer(); + // load special tokens + tokenizer->load_special(tok_file); + // load vocabs + tokenizer->load_vocab(tok_file); + tok_file.close(); + return tokenizer; +} + +bool Tokenizer::is_stop(int token) { + return std::find(stop_tokens_.begin(), stop_tokens_.end(), token) != stop_tokens_.end(); +} + +void Tokenizer::load_special(std::ifstream& tok_file) { + std::string line; + std::getline(tok_file, line); + std::istringstream line_str(line); + int special_num, stop_num, prefix_num; + line_str >> special_num >> stop_num >> prefix_num; + std::getline(tok_file, line); 
+ std::istringstream specail_line(line); + if (special_num) { + // load special tokens + special_tokens_.resize(special_num); + for (int i = 0; i < special_num; i++) { + specail_line >> special_tokens_[i]; + } } - if (type == "huggingface") { - return new HuggingfaceTokenizer(); + if (stop_num) { + // load stop tokens + stop_tokens_.resize(stop_num); + for (int i = 0; i < stop_num; i++) { + specail_line >> stop_tokens_[i]; + } + } + if (prefix_num) { + // load prefix tokens + prefix_tokens_.resize(prefix_num); + for (int i = 0; i < prefix_num; i++) { + specail_line >> prefix_tokens_[i]; + } } - return nullptr; } -bool Sentencepiece::load(const std::string& filename) { - std::ifstream tok_file(filename); +std::vector Tokenizer::encode(const std::string& str) { + std::vector ids = prefix_tokens_; + if (!special_tokens_.empty()) { + std::string text = str; + while (true) { + bool contain_special = false; + for (auto special_id : special_tokens_) { + const auto& token = decode(special_id); + if (token.empty()) continue; + auto pos = text.find(token); + if (pos != std::string::npos) { + contain_special = true; + if (pos > 0) encode(text.substr(0, pos), ids); + ids.push_back(special_id); + text = text.substr(pos + token.size(), -1); + } + } + if (!contain_special) break; + } + encode(text, ids); + } else { + encode(str, ids); + } + return ids; +} + +bool Sentencepiece::load_vocab(std::ifstream& tok_file) { std::string line, token; + std::getline(tok_file, line); + int vocab_len = std::stoi(line); float score; - int index = 0, type; - while (std::getline(tok_file, line)) { + int type; + sentence_pieces_.resize(vocab_len); + for (int index = 0; index < vocab_len; index++) { + std::getline(tok_file, line); std::istringstream line_str(line); line_str >> token >> score >> type; token = base64_decode(token); auto piece_type = static_cast(type); SentencePiece piece {token, score, piece_type}; - sentence_pieces_.emplace_back(std::move(piece)); + sentence_pieces_[index] = std::move(piece); if (piece_type == PieceType::NORMAL) { pieces_.insert({token, index}); } else { @@ -114,9 +208,7 @@ bool Sentencepiece::load(const std::string& filename) { unk_id_ = index; } } - index++; } - tok_file.close(); return true; } @@ -286,8 +378,7 @@ Sentencepiece::EncodeResult Sentencepiece::bpe_encode(std::string_view normalize return output; } -std::vector Sentencepiece::encode(const std::string& str) { - std::vector ids; +void Sentencepiece::encode(const std::string& str, std::vector& ids) { auto result = bpe_encode(str); size_t consumed = 0; for (const auto &p : result) { @@ -307,7 +398,6 @@ std::vector Sentencepiece::encode(const std::string& str) { ids.push_back(id); } } - return ids; } std::string Sentencepiece::decode(int id) { @@ -331,26 +421,24 @@ bool Sentencepiece::is_control(int id) const { return sentence_pieces_[id].type == PieceType::CONTROL; } -bool Tiktoken::load(const std::string& filename) { - std::ifstream tok_file(filename); - if (!tok_file.good()) { - printf("Failed: can't load tokenzier from: %s.\n", filename.c_str()); - return false; - } - std::string token; - while (tok_file >> token) { - token = base64_decode(token); - encoder_[token] = static_cast(decoder_.size()); - decoder_.push_back(token); +bool Tiktoken::load_vocab(std::ifstream& tok_file) { + std::string line; + std::getline(tok_file, line); + int vocab_len = std::stoi(line); + // load vocab + decoder_.resize(vocab_len); + for (int i = 0; i < vocab_len; i++) { + std::getline(tok_file, line); + auto token = base64_decode(line); + 
encoder_.insert({token, i}); + decoder_[i] = token; } - tok_file.close(); return true; } -std::vector Tiktoken::encode(const std::string& str) { - std::vector ids; +void Tiktoken::encode(const std::string& str, std::vector& ids) { if (str.empty()) { - return ids; + return; } size_t i = 0; while (i < str.size()) { @@ -378,10 +466,9 @@ std::vector Tiktoken::encode(const std::string& str) { // If no matching symbol is found, this typically means an error in the encoding // or the input text contains characters that the encoder doesn't know how to handle std::cerr << "Error: No encoding found for the sequence starting at position " << i << std::endl; - return {}; + return; } } - return ids; } std::string Tiktoken::decode(int id) { @@ -425,8 +512,7 @@ std::vector BertTokenizer::word_piece(const std::string& token) { return ids; } -std::vector BertTokenizer::encode(const std::string& str) { - std::vector ids; +void BertTokenizer::encode(const std::string& str, std::vector& ids) { std::vector tokens; std::string current_token; size_t i = 0; @@ -476,7 +562,6 @@ std::vector BertTokenizer::encode(const std::string& str) { ids.push_back(id); } } - return ids; } std::wstring utf8_to_wstring(const std::string& str) { @@ -500,8 +585,7 @@ void byte_encode_token(const std::string& token, } } -bool HuggingfaceTokenizer::load(const std::string& filename) { - std::ifstream tok_file(filename); +bool HuggingfaceTokenizer::load_vocab(std::ifstream& tok_file) { std::string line, token; // get nums int vocab_len, merge_len; @@ -522,7 +606,6 @@ bool HuggingfaceTokenizer::load(const std::string& filename) { bpe_ranks_.insert({{utf8_to_wstring(line.substr(0, d)), utf8_to_wstring(line.substr(d + 1))}, i}); } - tok_file.close(); // bytes_to_unicode auto _insert_range = [=](int start, int end) { for (int c = start; c <= end; c++) { @@ -617,7 +700,7 @@ void HuggingfaceTokenizer::bpe(const std::wstring& token, const BPERanks& bpe_ra } } -std::vector HuggingfaceTokenizer::encode(const std::string& str) { +void HuggingfaceTokenizer::encode(const std::string& str, std::vector& ids) { std::regex re("('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\\s\\w]+|\\s+)"); std::string input = str; std::vector result; @@ -638,21 +721,22 @@ std::vector HuggingfaceTokenizer::encode(const std::string& str) { result.push_back(wstring_to_utf8(ws)); } } - std::vector ids; for (auto s : result) { ids.push_back(encoder_.at(s)); } - return ids; } std::string HuggingfaceTokenizer::decode(int id) { + // printf("decode id = %d, %lu, %s#\n", id, decoder_.size(), decoder_.at(id).c_str()); if (id >= decoder_.size()) { return ""; } std::wstring w = utf8_to_wstring(decoder_.at(id)); std::string r; for (wchar_t c : w) { - r.push_back(char(u2b_.at(c))); + if (u2b_.find(c) != u2b_.end()) { + r.push_back(char(u2b_.at(c))); + } } return r; }
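
The new LlmConfig getters above pull every model parameter from a single JSON config instead of hard-coded per-model subclasses. A rough sketch of what such a config could contain, with the key names taken from the config_.value(...) calls in include/llm.hpp and purely illustrative values (any key that is omitted falls back to the default shown in its getter):

    {
        "model_type": "qwen",
        "is_single": true,
        "llm_model": "llm.mnn",
        "llm_weight": "llm.mnn.weight",
        "block_model": "block_",
        "lm_model": "lm.mnn",
        "embedding_model": "embedding.mnn",
        "embedding_file": "embeddings_bf16.bin",
        "tokenizer_file": "tokenizer.txt",
        "hidden_size": 2048,
        "layer_nums": 24,
        "key_value_shape": [2, 1, 0, 16, 128],
        "attention_mask": "int",
        "prompt_template": "<|im_start|>user\n%s<|im_end|>\n<|im_start|>assistant\n",
        "max_new_tokens": 512
    }

When is_single is false, load() assembles block_model(0) .. block_model(layer_nums - 1) plus lm_model() and, if no embedding_file is found on disk, embedding_model(); the attention_mask key selects between the "float", "glm" and default integer mask layouts in gen_attention_mask().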
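
Tokenizer::createTokenizer() now infers the concrete tokenizer class from the file itself rather than from a config string. Reading the loader above, the expected layout is: a header line carrying the magic number 430 and the TokenizerType value, a line with the counts of special / stop / prefix tokens, a line listing those token ids in that order, and then the vocabulary (for Tiktoken, a count followed by one base64-encoded token per line). A hypothetical Tiktoken file, with made-up ids and sizes, would start like this:

    430 1
    3 2 0
    151643 151644 151645 151643 151645
    151936
    IQ==
    Ig==
    ...

is_stop() is now answered by membership in that stop-token list, which is what allows the hard-coded per-model is_stop() overrides (Qwen, Llama3, Yi, ...) to be deleted, and Tokenizer::encode() prepends the prefix tokens and splits the input around any special-token strings before handing the remaining pieces to the subclass encoders.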
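
gen_attention_mask() now keys off config_->attention_mask(): for "float" it writes an additive causal mask (zero on and below the diagonal, the lowest float above it), for "glm" it marks positions strictly above the diagonal, and otherwise it writes the 0/1 causal pattern (j <= i). Schematically, for seq_len = 3 the "float" variant is:

    0      -inf   -inf
    0      0      -inf
    0      0      0

where -inf stands for std::numeric_limits<float>::lowest().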
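
With the per-model subclasses moved behind #if 0 and the history_ / response_nohistory() pair removed, the whole flow is driven by the config and the prompt template. A minimal caller sketch, assuming the patched include/llm.hpp above (the command-line handling is illustrative):

    #include <iostream>
    #include <memory>
    #include "llm.hpp"

    int main(int argc, const char* argv[]) {
        if (argc < 2) {
            std::cout << "Usage: " << argv[0] << " <model_dir>" << std::endl;
            return 0;
        }
        // createLLM() now only builds the shared LlmConfig; tokenizer and model
        // selection happen later, inside load(), from that config.
        std::unique_ptr<Llm> llm(Llm::createLLM(argv[1]));
        llm->load();
        // response() wraps the raw query with config_->prompt_template() via
        // apply_chat_template(): with an illustrative template of
        // "<|im_start|>user\n%s<|im_end|>\n<|im_start|>assistant\n", the query
        // "Hello" is substituted into the %s placeholder, and the result is
        // encoded with the tokenizer loaded from tokenizer_file(). Each call
        // starts from a fresh context, since the old history_ state is gone.
        llm->response("Hello");
        return 0;
    }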