diff --git a/demo/cli_demo.cpp b/demo/cli_demo.cpp index 450c1313..eb51a249 100644 --- a/demo/cli_demo.cpp +++ b/demo/cli_demo.cpp @@ -25,14 +25,12 @@ void benchmark(Llm* llm, std::string prompt_file) { int decode_len = 0; int64_t prefill_time = 0; int64_t decode_time = 0; - llm->warmup(); for (int i = 0; i < prompts.size(); i++) { llm->response(prompts[i]); prompt_len += llm->prompt_len_; decode_len += llm->gen_seq_len_; prefill_time += llm->prefill_us_; decode_time += llm->decode_us_; - llm->reset(); } float prefill_s = prefill_time / 1e6; float decode_s = decode_time / 1e6; diff --git a/demo/tokenizer_demo.cpp b/demo/tokenizer_demo.cpp index 6b13d466..56ae1601 100644 --- a/demo/tokenizer_demo.cpp +++ b/demo/tokenizer_demo.cpp @@ -13,20 +13,20 @@ int main(int argc, const char* argv[]) { return 0; } std::string tokenizer_path = argv[1]; - std::unique_ptr tokenizer_(new Tiktoken); - tokenizer_->load(tokenizer_path); + std::unique_ptr tokenizer(Tokenizer::createTokenizer(tokenizer_path)); const std::string system_str = "Youare a helpful assistant."; const std::string user_str = "Hello"; // const std::string query = "\n<|im_start|>system\n" + system_str + "<|im_end|>\n<|im_start|>\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n"; const std::string query = "\n<|im_start|>user\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n"; + // const std::string query = "<|im_start|>user\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n"; // const std::string query = system_str + "\n" + user_str; - auto tokens = tokenizer_->encode(query); + auto tokens = tokenizer->encode(query); std::string decode_str; printf("encode tokens = [ "); for (auto token : tokens) { printf("%d, ", token); - decode_str += tokenizer_->decode(token); + decode_str += tokenizer->decode(token); } printf("]\n"); printf("decode str = %s\n", decode_str.c_str()); diff --git a/include/llm.hpp b/include/llm.hpp index 95b4b59e..412d5893 100644 --- a/include/llm.hpp +++ b/include/llm.hpp @@ -81,10 +81,6 @@ class LlmConfig { return config_.value("model_type", "unknow"); } - std::string tokenizer_type() const { - return config_.value("tokenizer_type", "tiktoken"); - } - std::string llm_model() const { return base_dir_ + config_.value("llm_model", "llm.mnn"); } @@ -93,6 +89,18 @@ class LlmConfig { return base_dir_ + config_.value("llm_weight", "llm.mnn.weight"); } + std::string block_model(int index) const { + return base_dir_ + config_.value("block_model", "block_") + std::to_string(index) + ".mnn"; + } + + std::string lm_model() const { + return base_dir_ + config_.value("lm_model", "lm.mnn"); + } + + std::string embedding_model() const { + return base_dir_ + config_.value("embedding_model", "embedding.mnn"); + } + std::string embedding_file() const { return base_dir_ + config_.value("embedding_file", "embeddings_bf16.bin"); } @@ -101,16 +109,28 @@ class LlmConfig { return base_dir_ + config_.value("tokenizer_file", "tokenizer.txt"); } + bool is_single() const { + return config_.value("is_single", true); + } + + int max_new_tokens() const { + return config_.value("max_new_tokens", 512); + } + int hidden_size() const { return config_.value("hidden_size", 4096); } + int layer_nums() const { + return config_.value("layer_nums", 32); + } + std::vector key_value_shape() const { return config_.value("key_value_shape", std::vector{}); } - std::vector stop_ids() const { - return config_.value("stop_ids", std::vector{}); + std::string attention_mask() const { + return config_.value("attention_mask", "int"); } std::string prompt_template() 
const { @@ -139,10 +159,7 @@ class LlmConfig { class Llm { public: - Llm() { - // default tokenier is senrencepiece - tokenizer_.reset(new Sentencepiece); - } + Llm(std::shared_ptr config) : config_(config) {} virtual ~Llm() { modules_.clear(); visual_module_.reset(); @@ -151,26 +168,25 @@ class Llm { static Llm* createLLM(const std::string& path, std::string model_type = "auto"); void load(); void chat(); - void warmup(); + int forward(const std::vector& input_ids); + std::string apply_chat_template(const std::string& input_str) const; std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); - std::string response_nohistory(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); void generate_init(); std::string generate(const std::vector& input_ids, std::ostream* os, const char* end_with); - std::vector generate(const std::vector& input_ids); - int forward(const std::vector& input_ids); - float load_progress() { return load_progress_; } - void reset(); + std::vector generate(const std::vector& input_ids, int max_new_tokens = -1); void print_speed(); friend class Pipeline; public: - std::vector history_; + // TODO std::string model_name_ = ""; + bool is_single_ = true; + bool is_disk_embedding_ = true; + bool is_visual_ = false; + int layer_nums_ = 0; + int hidden_size_ = 4096; // config int max_new_tokens_ = 1024; int backend_type_ = 0; - int thread_num_ = 4; - bool low_precision_ = true; - bool chatml_ = true; // forward info int prompt_len_ = 0; int gen_seq_len_ = 0; @@ -178,41 +194,27 @@ class Llm { // time int64_t prefill_us_ = 0; int64_t decode_us_ = 0; - LlmConfig config_; + std::shared_ptr config_; + std::unique_ptr tokenizer_; protected: VARP embedding(const std::vector& input_ids); VARP txt_embedding(const std::vector& input_ids); - std::vector tokenizer_encode(const std::string& input_str); std::string decode(int id); protected: - VARP inputs_embeds_, attention_mask_, position_ids_; - // model configs - bool is_single_ = true; - bool is_disk_embedding_ = true; - bool is_visual_ = false; - int layer_nums_ = 0; - int hidden_size_ = 4096; std::vector key_value_shape_ = {}; - // gen info - float load_progress_ = 0.f; - // tokenizer - std::unique_ptr tokenizer_; + VARP inputs_embeds_, attention_mask_, position_ids_; std::shared_ptr visual_module_; + std::shared_ptr runtime_manager_; + std::vector> modules_; + std::vector past_key_values_; private: virtual VARP visual_embedding(const std::vector& input_ids) { return nullptr; } - virtual std::vector tokenizer(const std::string& query); virtual VARP gen_attention_mask(int seq_len); virtual VARP gen_position_ids(int seq_len); virtual bool is_stop(int token_id); -private: - // MNN Modules - std::shared_ptr runtime_manager_; - std::vector> modules_; - std::vector past_key_values_; - // model dir - std::string model_dir_; }; +#if 0 // some llm models class Chatglm_6b : public Llm { public: @@ -222,27 +224,12 @@ class Chatglm_6b : public Llm { key_value_shape_ = {2, 0, 1, 32, 128}; } private: - virtual std::vector tokenizer(const std::string& query) override; virtual VARP gen_attention_mask(int seq_len) override; virtual VARP gen_position_ids(int seq_len) override; virtual bool is_stop(int token_id) override; int context_len_ = 0; }; - -class Chatglm2_6b : public Llm { -public: - Chatglm2_6b() { - model_name_ = "Chatglm2_6b"; - layer_nums_ = 28; - key_value_shape_ = {2, 0, 1, 2, 128}; - } -private: - virtual std::vector tokenizer(const 
std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; -}; - +/* class Phi_2 : public Chatglm2_6b { public: Phi_2() { @@ -256,24 +243,9 @@ class Phi_2 : public Chatglm2_6b { virtual std::vector tokenizer(const std::string& query) override; virtual bool is_stop(int token_id) override; }; +*/ -class Qwen_7b : public Llm { -public: - Qwen_7b() { - model_name_ = "Qwen_7b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 0, 32, 128}; - hidden_size_ = 4096; - tokenizer_.reset(new Tiktoken); - } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; -}; - -class Qwen_vl : public Qwen_7b { +class Qwen_vl : public Llm { public: Qwen_vl() { model_name_ = "Qwen_vl"; @@ -292,21 +264,9 @@ class Qwen_vl : public Qwen_7b { private: std::vector url_encode(const std::string& url); virtual VARP visual_embedding(const std::vector& input_ids) override; - virtual std::vector tokenizer(const std::string& query) override; virtual VARP gen_attention_mask(int seq_len) override; }; -class Qwen_1_8b : public Qwen_7b { -public: - Qwen_1_8b() { - model_name_ = "Qwen_1.8b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 0, 16, 128}; - hidden_size_ = 2048; - tokenizer_.reset(new Tiktoken); - } -}; - class Llama2_7b : public Llm { public: Llama2_7b() { @@ -315,74 +275,11 @@ class Llama2_7b : public Llm { key_value_shape_ = {2, 1, 32, 0, 128}; } private: - virtual std::vector tokenizer(const std::string& query) override; virtual VARP gen_attention_mask(int seq_len) override; virtual VARP gen_position_ids(int seq_len) override; virtual bool is_stop(int token_id) override; }; -class Qwen2 : public Llama2_7b { -public: - Qwen2() { - model_name_ = "Qwen2"; - tokenizer_.reset(new HuggingfaceTokenizer); - } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool is_stop(int token_id) override; -}; - -class Qwen2_0_5b : public Qwen2 { -public: - Qwen2_0_5b() { - model_name_ = "Qwen2_0.5b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 16, 0, 64}; - hidden_size_ = 1024; - } -}; - -class Qwen2_1_8b : public Qwen2 { -public: - Qwen2_1_8b() { - model_name_ = "Qwen2_1.8b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 16, 0, 128}; - hidden_size_ = 2048; - } -}; - -class Qwen2_4b : public Qwen2 { -public: - Qwen2_4b() { - model_name_ = "Qwen2_4b"; - layer_nums_ = 40; - key_value_shape_ = {2, 1, 20, 0, 128}; - hidden_size_ = 2560; - } -}; - -class Qwen2_7b : public Qwen2 { -public: - Qwen2_7b() { - model_name_ = "Qwen2_7b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 32, 0, 128}; - hidden_size_ = 4096; - } -}; - -class TinyLlama : public Llama2_7b { -public: - TinyLlama() { - model_name_ = "TinyLlama"; - layer_nums_ = 22; - key_value_shape_ = {2, 1, 4, 0, 64}; - } -private: - virtual std::vector tokenizer(const std::string& query) override; -}; - class Yi_6b : public Llama2_7b { public: Yi_6b() { @@ -390,22 +287,9 @@ class Yi_6b : public Llama2_7b { key_value_shape_ = {2, 1, 4, 0, 128}; } private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool is_stop(int token_id) override; -}; - -class Llama3_8b : public Llama2_7b { -public: - Llama3_8b() { - model_name_ = "Llama3_8b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 8, 0, 128}; - hidden_size_ 
= 4096; - } -private: - virtual std::vector tokenizer(const std::string& query) override; virtual bool is_stop(int token_id) override; }; +#endif // Llm end // Embedding start @@ -429,8 +313,6 @@ class Embedding { // time int64_t embedding_us_ = 0; int prompt_len_ = 0; -protected: - std::vector tokenizer_encode(const std::string& input_str); protected: // model configs int layer_nums_ = 0; diff --git a/include/tokenizer.hpp b/include/tokenizer.hpp index cb81cff2..16d10861 100644 --- a/include/tokenizer.hpp +++ b/include/tokenizer.hpp @@ -17,20 +17,35 @@ class Tokenizer { public: + static constexpr int MAGIC_NUMBER = 430; + enum TokenizerType { + SENTENCEPIECE = 0, + TIKTOIKEN = 1, + BERT = 2, + HUGGINGFACE = 3 + }; Tokenizer() = default; virtual ~Tokenizer() = default; - static Tokenizer* createTokenizer(const std::string& type); - virtual bool load(const std::string& filename) = 0; - virtual std::vector encode(const std::string& str) = 0; + static Tokenizer* createTokenizer(const std::string& filename); + bool is_stop(int token); + std::vector encode(const std::string& str); virtual std::string decode(int id) = 0; +protected: + virtual void load_special(std::ifstream& file); + virtual bool load_vocab(std::ifstream& file) = 0; + virtual void encode(const std::string& str, std::vector& ids) = 0; + std::vector special_tokens_; + std::vector stop_tokens_; + std::vector prefix_tokens_; }; class Sentencepiece : public Tokenizer { public: Sentencepiece() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; +protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; private: enum ModelType { UNIGRAM = 1, @@ -77,10 +92,10 @@ class Sentencepiece : public Tokenizer { class Tiktoken : public Tokenizer { public: Tiktoken() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; std::unordered_map encoder_; std::vector decoder_; }; @@ -88,7 +103,8 @@ class Tiktoken : public Tokenizer { class BertTokenizer : public Tiktoken { public: BertTokenizer() = default; - virtual std::vector encode(const std::string& str) override; +protected: + virtual void encode(const std::string& str, std::vector& ids) override; private: std::vector word_piece(const std::string& token); }; @@ -105,9 +121,10 @@ struct hash_pair_wstring { using BPERanks = std::unordered_map, int, hash_pair_wstring>; public: HuggingfaceTokenizer() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; +protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; private: void bpe(const std::wstring& token, const BPERanks& bpe_ranks, std::vector* result); BPERanks bpe_ranks_; diff --git a/src/llm.cpp b/src/llm.cpp index e3cfc31c..6d19bcbf 100644 --- a/src/llm.cpp +++ b/src/llm.cpp @@ -23,53 +23,116 @@ // Llm start Llm* Llm::createLLM(const std::string& model_dir, std::string model_type) { - Llm* llm = new Llm; - llm->config_ = LlmConfig(model_dir); - 
llm->tokenizer_.reset(Tokenizer::createTokenizer(llm->config_.tokenizer_type())); - // llm->load(); - llm->key_value_shape_ = llm->config_.key_value_shape(); - llm->layer_nums_ = 24; + std::shared_ptr config(new LlmConfig(model_dir)); + Llm* llm = new Llm(config); return llm; } void Llm::load() { + // init module status + key_value_shape_ = config_->key_value_shape(); + layer_nums_ = config_->layer_nums(); + is_single_ = config_->is_single(); + { + std::ifstream embedding_bin(config_->embedding_file()); + is_disk_embedding_ = embedding_bin.good(); + embedding_bin.close(); + } + MNN_PRINT("### is_single_ = %d, is_disk_embedding_ = %d\n", is_single_, is_disk_embedding_); // init runtime ScheduleConfig config; BackendConfig cpuBackendConfig; config.type = static_cast(backend_type_);; - config.numThread = config_.thread_num(); + config.numThread = config_->thread_num(); cpuBackendConfig.precision = BackendConfig::Precision_Low; cpuBackendConfig.memory = BackendConfig::Memory_Low; config.backendConfig = &cpuBackendConfig; runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0); - load_progress_ = 0.f; - printf("load tokenizer\n"); // 1. load vocab - tokenizer_->load(config_.tokenizer_file()); - printf("load tokenizer Done\n"); - { - std::ifstream embedding_bin(config_.embedding_file()); - is_disk_embedding_ = embedding_bin.good(); - MNN_PRINT("### disk embedding is %d\n", is_disk_embedding_); - embedding_bin.close(); - } - // 2. load model + MNN_PRINT("load tokenizer\n"); + tokenizer_.reset(Tokenizer::createTokenizer(config_->tokenizer_file())); + MNN_PRINT("load tokenizer Done\n"); + // 3. load model Module::Config module_config; module_config.shapeMutable = true; module_config.rearrange = true; + if (is_single_) { + // load single model + key_value_shape_.insert(key_value_shape_.begin(), layer_nums_); + modules_.resize(1); + std::string model_path = config_->llm_model(); + MNN_PRINT("load %s ... ", model_path.c_str()); + runtime_manager_->setExternalFile(config_->llm_weight()); + modules_[0].reset(Module::load( + {"input_ids", "attention_mask", "position_ids", "past_key_values"}, + {"token_id", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); + MNN_PRINT("Done!\n"); + } else { + // load split models + modules_.resize(layer_nums_ + 2); + // load lm model + modules_[layer_nums_].reset(Module::load({}, {}, config_->lm_model().c_str(), runtime_manager_, &module_config)); + if (!is_disk_embedding_) { + modules_[layer_nums_ + 1].reset(Module::load({}, {}, config_->embedding_model().c_str(), runtime_manager_, &module_config)); + } + // load block models + for (int i = 0; i < layer_nums_; i++) { + std::string model_path = config_->block_model(i); + MNN_PRINT("load %s ... ", model_path.c_str()); + modules_[i].reset(Module::load( + {"inputs_embeds", "attention_mask", "position_ids", "past_key_values"}, + {"hidden_states", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); + MNN_PRINT("Done!\n"); + } + } +} - key_value_shape_.insert(key_value_shape_.begin(), layer_nums_); - modules_.resize(1); - std::string model_path = config_.llm_model(); - std::string external_path = config_.llm_weight(); - MNN_PRINT("load %s ... 
", model_path.c_str()); - runtime_manager_->setExternalFile(external_path); - modules_[0].reset(Module::load( - {"input_ids", "attention_mask", "position_ids", "past_key_values"}, - {"token_id", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += 90.f; +int Llm::forward(const std::vector& input_ids) { + int seq_len = input_ids.size(); + auto attention_mask = gen_attention_mask(seq_len); + auto position_ids = gen_position_ids(seq_len); + int id = -1; + if (is_single_) { + // single model + auto hidden_states = _Const(input_ids.data(), {seq_len}, NCHW, halide_type_of()); + if (is_disk_embedding_) { + hidden_states = embedding(input_ids); + } + auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]}); + ExecutorScope::Current()->gc(Executor::FULL); + id = outputs[0]->readMap()[0]; + past_key_values_[0] = outputs[1]; + } else { + // split block models + auto hidden_states = embedding(input_ids); + ExecutorScope::Current()->gc(Executor::FULL); + for (int i = 0; i < layer_nums_; i++) { + AUTOTIME; + auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]}); + hidden_states = outputs[0]; + past_key_values_[i] = outputs[1]; + } + ExecutorScope::Current()->gc(Executor::FULL); + { + AUTOTIME; + auto outputs = modules_[layer_nums_]->onForward({hidden_states}); + id = outputs[0]->readMap()[0]; + } + } + all_seq_len_ += seq_len; + gen_seq_len_++; + return id; +} + +std::string Llm::apply_chat_template(const std::string& input_str) const { + auto prompt = config_->prompt_template(); + if (prompt.empty()) return input_str; + const std::string placeholder = "%s"; + size_t start_pos = prompt.find(placeholder); + if (start_pos == std::string::npos) return input_str; + prompt.replace(start_pos, placeholder.length(), input_str); + return prompt; } void Llm::chat() { @@ -81,7 +144,7 @@ void Llm::chat() { break; } if (input_str == "/reset") { - reset(); + // reset(); std::cout << "\nA: reset done." 
<< std::endl; continue; } @@ -89,7 +152,6 @@ void Llm::chat() { response(input_str); std::cout << std::endl; } - reset(); } void Llm::generate_init() { @@ -108,19 +170,18 @@ void Llm::generate_init() { } } -std::vector Llm::generate(const std::vector& input_ids) { +std::vector Llm::generate(const std::vector& input_ids, int max_new_tokens) { generate_init(); std::vector output_ids; prompt_len_ = static_cast(input_ids.size()); + if (max_new_tokens < 0) { max_new_tokens = config_->max_new_tokens(); } // prefill int token = forward(input_ids); output_ids.push_back(token); // decode - while (gen_seq_len_ < max_new_tokens_) { + while (gen_seq_len_ < max_new_tokens) { token = forward({token}); - if (is_stop(token)) { - break; - } + if (is_stop(token)) { break; } output_ids.push_back(token); } return output_ids; @@ -131,11 +192,10 @@ std::string Llm::generate(const std::vector& input_ids, std::ostream* os, c auto st = std::chrono::system_clock::now(); int token = forward(input_ids); auto et = std::chrono::system_clock::now(); - history_.push_back(token); std::string output_str = decode(token); prefill_us_ = std::chrono::duration_cast(et - st).count(); *os << output_str << std::flush; - while (gen_seq_len_ < max_new_tokens_) { + while (gen_seq_len_ < config_->max_new_tokens()) { st = std::chrono::system_clock::now(); token = forward({token}); et = std::chrono::system_clock::now(); @@ -144,7 +204,6 @@ std::string Llm::generate(const std::vector& input_ids, std::ostream* os, c *os << end_with << std::flush; break; } - history_.push_back(token); auto word = decode(token); *os << word << std::flush; output_str += word; @@ -157,28 +216,9 @@ std::string Llm::generate(const std::vector& input_ids, std::ostream* os, c std::string Llm::response(const std::string& query, std::ostream* os, const char* end_with) { generate_init(); - if (!end_with) { - end_with = "\n"; - } - // response - auto input_ids = chatml_ ? tokenizer(query) : tokenizer_encode(query); - // printf("ids = "); for (int id : input_ids) printf("%d, ", id); printf("\n"); - if (!history_.empty()) { - std::copy(input_ids.begin(), input_ids.end(), std::back_inserter(history_)); - input_ids = history_; - } else { - history_ = input_ids; - } - return generate(input_ids, os, end_with); -} - -std::string Llm::response_nohistory(const std::string& query, std::ostream* os, const char* end_with) { - generate_init(); - if (!end_with) { - end_with = "\n"; - } - // response - auto input_ids = chatml_ ? tokenizer(query) : tokenizer_encode(query); + if (!end_with) { end_with = "\n"; } + auto prompt = apply_chat_template(query); + auto input_ids = tokenizer_->encode(prompt); return generate(input_ids, os, end_with); } @@ -200,64 +240,6 @@ void Llm::print_speed() { printf("##################################\n"); } -void Llm::reset() { - history_.clear(); -} - -void Llm::warmup() { - // warmup - MNN_PRINT("### warmup ... 
"); - if (is_single_) { - past_key_values_.push_back(_Input(key_value_shape_, NCHW)); - } else { - for (int i = 0; i < layer_nums_; i++) { - past_key_values_.push_back(_Input(key_value_shape_, NCHW)); - } - } - std::vector tmp(1, 0); - forward(tmp); - all_seq_len_ = 0; - gen_seq_len_ = 0; - printf("Done\n"); -} - -int Llm::forward(const std::vector& input_ids) { - int seq_len = input_ids.size(); - auto attention_mask = gen_attention_mask(seq_len); - auto position_ids = gen_position_ids(seq_len); - int id = -1; - if (is_single_) { - // single model - auto hidden_states = _Const(input_ids.data(), {seq_len}, NCHW, halide_type_of()); - if (is_disk_embedding_) { - hidden_states = embedding(input_ids); - } - auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]}); - ExecutorScope::Current()->gc(Executor::FULL); - id = outputs[0]->readMap()[0]; - past_key_values_[0] = outputs[1]; - } else { - // split block models - auto hidden_states = embedding(input_ids); - ExecutorScope::Current()->gc(Executor::FULL); - for (int i = 0; i < layer_nums_; i++) { - AUTOTIME; - auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]}); - hidden_states = outputs[0]; - past_key_values_[i] = outputs[1]; - } - ExecutorScope::Current()->gc(Executor::FULL); - { - AUTOTIME; - auto outputs = modules_[layer_nums_]->onForward({hidden_states}); - id = outputs[0]->readMap()[0]; - } - } - all_seq_len_ += seq_len; - gen_seq_len_++; - return id; -} - static inline bool needNewVar(VARP var, int axis, int seq_len) { if (var == nullptr) { return true; @@ -277,14 +259,14 @@ VARP Llm::txt_embedding(const std::vector& input_ids) { } AUTOTIME; // disk embedding to save memory - int hidden_size = config_.hidden_size(); + int hidden_size = config_->hidden_size(); int seq_len = static_cast(input_ids.size()); if (needNewVar(inputs_embeds_, 0, seq_len)) { inputs_embeds_ = _Input({seq_len, 1, hidden_size}, NCHW); } size_t size = hidden_size * sizeof(int16_t); - FILE* file = fopen(config_.embedding_file().c_str(), "rb"); + FILE* file = fopen(config_->embedding_file().c_str(), "rb"); std::unique_ptr buffer(new int16_t[hidden_size]); for (size_t i = 0; i < seq_len; i++) { fseek(file, input_ids[i] * size, SEEK_SET); @@ -306,11 +288,6 @@ VARP Llm::embedding(const std::vector& input_ids) { return txt_embedding(input_ids); } -std::vector Llm::tokenizer_encode(const std::string& input_str) { - auto ids = tokenizer_->encode(input_str); - return ids; -} - std::string Llm::decode(int id) { std::string word = tokenizer_->decode(id); // Fix utf-8 garbled characters @@ -321,37 +298,45 @@ std::string Llm::decode(int id) { return word; } -// Llm -std::vector Llm::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {198, 151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - VARP Llm::gen_attention_mask(int seq_len) { - if (needNewVar(attention_mask_, 2, seq_len)) { - attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - } else { + if (config_->attention_mask() == "float") { + if (needNewVar(attention_mask_, 2, seq_len)) { + attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); + } else { + return attention_mask_; + } + auto ptr = attention_mask_->writeMap(); + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < seq_len; j++) 
{ + ptr[seq_len * i + j] = (j > i) * std::numeric_limits::lowest(); + } + } return attention_mask_; - } - auto ptr = attention_mask_->writeMap(); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = j <= i; + } else { + bool is_glm = config_->attention_mask() == "glm"; + if (needNewVar(attention_mask_, 2, seq_len)) { + attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); + } else { + return attention_mask_; } + auto ptr = attention_mask_->writeMap(); + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < seq_len; j++) { + ptr[seq_len * i + j] = is_glm ? j > i : j <= i; + } + } + return attention_mask_; } - return attention_mask_; } VARP Llm::gen_position_ids(int seq_len) { - if (needNewVar(position_ids_, 0, seq_len) || 0) { + bool is_glm = config_->attention_mask() == "glm"; + if (needNewVar(position_ids_, 0, seq_len)) { position_ids_ = _Input({seq_len}, NCHW, halide_type_of()); } auto ptr = position_ids_->writeMap(); if (seq_len == 1) { - ptr[0] = all_seq_len_; + ptr[0] = is_glm ? gen_seq_len_ : all_seq_len_; } else { for (int i = 0; i < seq_len; i++) { ptr[i] = i; @@ -361,8 +346,7 @@ VARP Llm::gen_position_ids(int seq_len) { } bool Llm::is_stop(int token_id) { - // <|endoftext|> <|im_end|> - return token_id == 151643 || token_id == 151645; + return tokenizer_->is_stop(token_id); } #if 0 @@ -409,49 +393,6 @@ bool Chatglm_6b::is_stop(int token_id) { return token_id == 130005; } -// Chatglm2_6b -std::vector Chatglm2_6b::tokenizer(const std::string& query) { - auto prompt = "问:" + query + "\n答:"; - auto ids = tokenizer_encode(prompt); - if (history_.empty()) { - ids.insert(ids.begin(), 64792); - ids.insert(ids.begin(), 64790); - } - return ids; -} - -VARP Chatglm2_6b::gen_attention_mask(int seq_len) { - auto attention_mask = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - auto ptr = attention_mask->writeMap(); - if (seq_len > 1) { - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = j > i; - } - } - } else { - ptr[0] = 0; - } - return attention_mask; -} - -VARP Chatglm2_6b::gen_position_ids(int seq_len) { - auto position_ids = _Input({seq_len}, NCHW, halide_type_of()); - auto ptr = position_ids->writeMap(); - if (seq_len == 1) { - ptr[0] = gen_seq_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; - } - } - return position_ids; -} - -bool Chatglm2_6b::is_stop(int token_id) { - return token_id <= 2; -} - // Phi_2 std::vector Phi_2::tokenizer(const std::string& query) { auto prompt = query; @@ -463,50 +404,6 @@ bool Phi_2::is_stop(int token_id) { return token_id == 50256; } -// Qwen_7b -std::vector Qwen_7b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {198, 151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - -VARP Qwen_7b::gen_attention_mask(int seq_len) { - if (needNewVar(attention_mask_, 2, seq_len)) { - attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - } else { - return attention_mask_; - } - auto ptr = attention_mask_->writeMap(); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = j <= i; - } - } - return attention_mask_; -} - -VARP Qwen_7b::gen_position_ids(int seq_len) { - if (needNewVar(position_ids_, 0, seq_len) || 0) { - position_ids_ = _Input({seq_len}, NCHW, 
halide_type_of()); - } - auto ptr = position_ids_->writeMap(); - if (seq_len == 1) { - ptr[0] = all_seq_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; - } - } - return position_ids_; -} - -bool Qwen_7b::is_stop(int token_id) { - // <|endoftext|> <|im_end|> - return token_id == 151643 || token_id == 151645; -} - // Qwen_vl std::vector Qwen_vl::url_encode(const std::string& url) { std::vector ascii_values(imgpad_len_, img_pad_); @@ -655,7 +552,7 @@ std::vector Llama2_7b::tokenizer(const std::string& query) { ids.insert(ids.end(), {185, 185, 77398, 25}); return ids; } - // llama2: [INST]{query}[/INST]: 1, 5539, 25580, 29962, query, 12452, 25580, 29962 + // llama2: [INST]{query}[/INST]: 1, 5539, 25580, 29962, query, 12452, 25580, 29962 ids.insert(ids.begin(), {1, 5539, 25580, 29962}); ids.insert(ids.end(), {12452, 25580, 29962}); return ids; @@ -705,34 +602,6 @@ bool Llama2_7b::is_stop(int token_id) { return token_id == 2; } -std::vector Qwen2::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - -bool Qwen2::is_stop(int token_id) { - return token_id == 151645 || token_id == 151643; -} - -std::vector TinyLlama::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - /* - <|system|> - You are a friendly chatbot who always responds in the style of a pirate - <|user|> - {query} - <|assistant|> - */ - ids.insert(ids.begin(), {1, 529, 29989, 5205, 29989, 29958, 13, 3492, 526, 263, 19780, 13563, - 7451, 1058, 2337, 10049, 29879, 297, 278, 3114, 310, 263, 21625, - 403, 2, 29871, 13, 29966, 29989, 1792, 29989, 29958, 13}); - ids.insert(ids.end(), {2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13}); - return ids; -} - std::vector Yi_6b::tokenizer(const std::string& query) { auto prompt = "<|im_start|> user\n" + query + "<|im_end|>\n<|im_start|> assistant\n"; auto ids = tokenizer_encode(prompt); @@ -742,18 +611,6 @@ std::vector Yi_6b::tokenizer(const std::string& query) { bool Yi_6b::is_stop(int token_id) { return token_id == 7 || token_id == 64001; } - -std::vector Llama3_8b::tokenizer(const std::string& query) { - // <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n+query+<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n - auto ids = tokenizer_encode(query); - ids.insert(ids.begin(), {128000, 128006, 882, 128007, 271}); - ids.insert(ids.end(), {128009, 128006, 78191, 128007, 271}); - return ids; -} - -bool Llama3_8b::is_stop(int token_id) { - return token_id == 128001 || token_id == 128009; -} #endif // Llm end @@ -784,10 +641,6 @@ Embedding* Embedding::createEmbedding(const std::string& path, std::string model } void Embedding::load(const std::string& model_dir) { - if (model_dir_ == model_dir) { - return; - } - model_dir_ = model_dir; // init ScheduleConfig config; BackendConfig cpuBackendConfig; @@ -803,7 +656,6 @@ void Embedding::load(const std::string& model_dir) { size_t pos = model_dir.find_last_of("/\\"); std::string dir_path = (pos != std::string::npos) ? model_dir.substr(0, pos + 1) : ""; std::string tokenizer_path = dir_path + "/tokenizer.txt"; - tokenizer_->load(tokenizer_path); printf("load tokenizer Done\n"); // 2. 
load model Module::Config module_config; @@ -841,17 +693,12 @@ void Embedding::print_speed() { printf("##################################\n"); } -std::vector Embedding::tokenizer_encode(const std::string& input_str) { - auto ids = tokenizer_->encode(input_str); - return ids; -} - std::vector Bge::tokenizer(const std::string& query) { auto prompt = query; if (query.size() <= 256) { prompt = "为这个句子生成表示以用于检索相关文章:" + query; } - auto ids = tokenizer_encode(prompt); + auto ids = tokenizer_->encode(prompt); ids.insert(ids.begin(), 101); ids.push_back(102); return ids; @@ -1191,12 +1038,12 @@ void ChatMemory::summarize(std::shared_ptr llm) { auto chat_str = content.dump(); if (!summary.contains(date)) { auto summary_prompt = "请总结以下的对话内容,尽可能精炼,提取对话的主题和关键信息。如果有多个关键事件,可以分点总结。对话内容:\n" + chat_str + "\n总结:"; - auto sum = llm->response_nohistory(summary_prompt); + auto sum = llm->response(summary_prompt); summary[date] = sum; } if (!personality.contains(date)) { auto personality_prompt = "请根据以下的对话推测总结" + user + "的性格特点和心情,并根据你的推测制定回复策略。对话内容:\n" + chat_str + "\n总结:"; - auto pers = llm->response_nohistory(personality_prompt); + auto pers = llm->response(personality_prompt); personality[date] = pers; } } @@ -1262,7 +1109,7 @@ void Pipeline::invoke(const std::string& str) { auto prompt = build_prompt(str); std::cout << prompt; if (llm_) { - auto res = llm_->response_nohistory(prompt); + auto res = llm_->response(prompt); Prompt assistant_prompt {PROMPT_TYPE::ASSISTANT, res, {}}; prompts_.emplace_back(std::move(assistant_prompt)); } diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 641eaa2e..72e80c2f 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -78,34 +78,128 @@ static inline void to_lower_case(std::string& str) { } } -Tokenizer* Tokenizer::createTokenizer(const std::string& type) { - if (type == "sentencepiece") { - return new Sentencepiece(); +Tokenizer* Tokenizer::createTokenizer(const std::string& filename) { + Tokenizer* tokenizer = nullptr; + // check file + std::ifstream tok_file(filename); + if (!tok_file.good()) { + printf("Failed: can't load tokenzier from: %s.\n", filename.c_str()); + return tokenizer; } - if (type == "tiktoken") { - return new Tiktoken(); + // check tokenizer info + std::string line; + std::getline(tok_file, line); + std::istringstream line_str(line); + int magic_number, tokenizer_type; + line_str >> magic_number; + if (magic_number != MAGIC_NUMBER) { + printf("Failed: magic number is wrong from: %s.\n", filename.c_str()); + return tokenizer; + } + line_str >> tokenizer_type; + printf("tokenizer_type = %d\n", tokenizer_type); + // create tokenizer + switch (tokenizer_type) + { + case SENTENCEPIECE: + tokenizer = new Sentencepiece(); + break; + case TIKTOIKEN: + tokenizer = new Tiktoken(); + break; + case BERT: + tokenizer = new BertTokenizer(); + break; + case HUGGINGFACE: + tokenizer = new HuggingfaceTokenizer(); + break; + default: + return tokenizer; } - if (type == "bert") { - return new BertTokenizer(); + // load special tokens + tokenizer->load_special(tok_file); + // load vocabs + tokenizer->load_vocab(tok_file); + tok_file.close(); + return tokenizer; +} + +bool Tokenizer::is_stop(int token) { + return std::find(stop_tokens_.begin(), stop_tokens_.end(), token) != stop_tokens_.end(); +} + +void Tokenizer::load_special(std::ifstream& tok_file) { + std::string line; + std::getline(tok_file, line); + std::istringstream line_str(line); + int special_num, stop_num, prefix_num; + line_str >> special_num >> stop_num >> prefix_num; + std::getline(tok_file, line); 
+ std::istringstream specail_line(line); + if (special_num) { + // load special tokens + special_tokens_.resize(special_num); + for (int i = 0; i < special_num; i++) { + specail_line >> special_tokens_[i]; + } } - if (type == "huggingface") { - return new HuggingfaceTokenizer(); + if (stop_num) { + // load stop tokens + stop_tokens_.resize(stop_num); + for (int i = 0; i < stop_num; i++) { + specail_line >> stop_tokens_[i]; + } + } + if (prefix_num) { + // load prefix tokens + prefix_tokens_.resize(prefix_num); + for (int i = 0; i < prefix_num; i++) { + specail_line >> prefix_tokens_[i]; + } } - return nullptr; } -bool Sentencepiece::load(const std::string& filename) { - std::ifstream tok_file(filename); +std::vector Tokenizer::encode(const std::string& str) { + std::vector ids = prefix_tokens_; + if (!special_tokens_.empty()) { + std::string text = str; + while (true) { + bool contain_special = false; + for (auto special_id : special_tokens_) { + const auto& token = decode(special_id); + if (token.empty()) continue; + auto pos = text.find(token); + if (pos != std::string::npos) { + contain_special = true; + if (pos > 0) encode(text.substr(0, pos), ids); + ids.push_back(special_id); + text = text.substr(pos + token.size(), -1); + } + } + if (!contain_special) break; + } + encode(text, ids); + } else { + encode(str, ids); + } + return ids; +} + +bool Sentencepiece::load_vocab(std::ifstream& tok_file) { std::string line, token; + std::getline(tok_file, line); + int vocab_len = std::stoi(line); float score; - int index = 0, type; - while (std::getline(tok_file, line)) { + int type; + sentence_pieces_.resize(vocab_len); + for (int index = 0; index < vocab_len; index++) { + std::getline(tok_file, line); std::istringstream line_str(line); line_str >> token >> score >> type; token = base64_decode(token); auto piece_type = static_cast(type); SentencePiece piece {token, score, piece_type}; - sentence_pieces_.emplace_back(std::move(piece)); + sentence_pieces_[index] = std::move(piece); if (piece_type == PieceType::NORMAL) { pieces_.insert({token, index}); } else { @@ -114,9 +208,7 @@ bool Sentencepiece::load(const std::string& filename) { unk_id_ = index; } } - index++; } - tok_file.close(); return true; } @@ -286,8 +378,7 @@ Sentencepiece::EncodeResult Sentencepiece::bpe_encode(std::string_view normalize return output; } -std::vector Sentencepiece::encode(const std::string& str) { - std::vector ids; +void Sentencepiece::encode(const std::string& str, std::vector& ids) { auto result = bpe_encode(str); size_t consumed = 0; for (const auto &p : result) { @@ -307,7 +398,6 @@ std::vector Sentencepiece::encode(const std::string& str) { ids.push_back(id); } } - return ids; } std::string Sentencepiece::decode(int id) { @@ -331,26 +421,24 @@ bool Sentencepiece::is_control(int id) const { return sentence_pieces_[id].type == PieceType::CONTROL; } -bool Tiktoken::load(const std::string& filename) { - std::ifstream tok_file(filename); - if (!tok_file.good()) { - printf("Failed: can't load tokenzier from: %s.\n", filename.c_str()); - return false; - } - std::string token; - while (tok_file >> token) { - token = base64_decode(token); - encoder_[token] = static_cast(decoder_.size()); - decoder_.push_back(token); +bool Tiktoken::load_vocab(std::ifstream& tok_file) { + std::string line; + std::getline(tok_file, line); + int vocab_len = std::stoi(line); + // load vocab + decoder_.resize(vocab_len); + for (int i = 0; i < vocab_len; i++) { + std::getline(tok_file, line); + auto token = base64_decode(line); + 
encoder_.insert({token, i}); + decoder_[i] = token; } - tok_file.close(); return true; } -std::vector Tiktoken::encode(const std::string& str) { - std::vector ids; +void Tiktoken::encode(const std::string& str, std::vector& ids) { if (str.empty()) { - return ids; + return; } size_t i = 0; while (i < str.size()) { @@ -378,10 +466,9 @@ std::vector Tiktoken::encode(const std::string& str) { // If no matching symbol is found, this typically means an error in the encoding // or the input text contains characters that the encoder doesn't know how to handle std::cerr << "Error: No encoding found for the sequence starting at position " << i << std::endl; - return {}; + return; } } - return ids; } std::string Tiktoken::decode(int id) { @@ -425,8 +512,7 @@ std::vector BertTokenizer::word_piece(const std::string& token) { return ids; } -std::vector BertTokenizer::encode(const std::string& str) { - std::vector ids; +void BertTokenizer::encode(const std::string& str, std::vector& ids) { std::vector tokens; std::string current_token; size_t i = 0; @@ -476,7 +562,6 @@ std::vector BertTokenizer::encode(const std::string& str) { ids.push_back(id); } } - return ids; } std::wstring utf8_to_wstring(const std::string& str) { @@ -500,8 +585,7 @@ void byte_encode_token(const std::string& token, } } -bool HuggingfaceTokenizer::load(const std::string& filename) { - std::ifstream tok_file(filename); +bool HuggingfaceTokenizer::load_vocab(std::ifstream& tok_file) { std::string line, token; // get nums int vocab_len, merge_len; @@ -522,7 +606,6 @@ bool HuggingfaceTokenizer::load(const std::string& filename) { bpe_ranks_.insert({{utf8_to_wstring(line.substr(0, d)), utf8_to_wstring(line.substr(d + 1))}, i}); } - tok_file.close(); // bytes_to_unicode auto _insert_range = [=](int start, int end) { for (int c = start; c <= end; c++) { @@ -617,7 +700,7 @@ void HuggingfaceTokenizer::bpe(const std::wstring& token, const BPERanks& bpe_ra } } -std::vector HuggingfaceTokenizer::encode(const std::string& str) { +void HuggingfaceTokenizer::encode(const std::string& str, std::vector& ids) { std::regex re("('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\\s\\w]+|\\s+)"); std::string input = str; std::vector result; @@ -638,21 +721,22 @@ std::vector HuggingfaceTokenizer::encode(const std::string& str) { result.push_back(wstring_to_utf8(ws)); } } - std::vector ids; for (auto s : result) { ids.push_back(encoder_.at(s)); } - return ids; } std::string HuggingfaceTokenizer::decode(int id) { + // printf("decode id = %d, %lu, %s#\n", id, decoder_.size(), decoder_.at(id).c_str()); if (id >= decoder_.size()) { return ""; } std::wstring w = utf8_to_wstring(decoder_.at(id)); std::string r; for (wchar_t c : w) { - r.push_back(char(u2b_.at(c))); + if (u2b_.find(c) != u2b_.end()) { + r.push_back(char(u2b_.at(c))); + } } return r; }
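
The new LlmConfig getters above pull every model parameter from a single JSON config instead of hard-coded per-model subclasses. A rough sketch of what such a config could contain, with the key names taken from the config_.value(...) calls in include/llm.hpp and purely illustrative values (any key that is omitted falls back to the default shown in its getter):

    {
        "model_type": "qwen",
        "is_single": true,
        "llm_model": "llm.mnn",
        "llm_weight": "llm.mnn.weight",
        "block_model": "block_",
        "lm_model": "lm.mnn",
        "embedding_model": "embedding.mnn",
        "embedding_file": "embeddings_bf16.bin",
        "tokenizer_file": "tokenizer.txt",
        "hidden_size": 2048,
        "layer_nums": 24,
        "key_value_shape": [2, 1, 0, 16, 128],
        "attention_mask": "int",
        "prompt_template": "<|im_start|>user\n%s<|im_end|>\n<|im_start|>assistant\n",
        "max_new_tokens": 512
    }

When is_single is false, load() assembles block_model(0) .. block_model(layer_nums - 1) plus lm_model() and, if no embedding_file is found on disk, embedding_model(); the attention_mask key selects between the "float", "glm" and default integer mask layouts in gen_attention_mask().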
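
Tokenizer::createTokenizer() now infers the concrete tokenizer class from the file itself rather than from a config string. Reading the loader above, the expected layout is: a header line carrying the magic number 430 and the TokenizerType value, a line with the counts of special / stop / prefix tokens, a line listing those token ids in that order, and then the vocabulary (for Tiktoken, a count followed by one base64-encoded token per line). A hypothetical Tiktoken file, with made-up ids and sizes, would start like this:

    430 1
    3 2 0
    151643 151644 151645 151643 151645
    151936
    IQ==
    Ig==
    ...

is_stop() is now answered by membership in that stop-token list, which is what allows the hard-coded per-model is_stop() overrides (Qwen, Llama3, Yi, ...) to be deleted, and Tokenizer::encode() prepends the prefix tokens and splits the input around any special-token strings before handing the remaining pieces to the subclass encoders.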
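
gen_attention_mask() now keys off config_->attention_mask(): for "float" it writes an additive causal mask (zero on and below the diagonal, the lowest float above it), for "glm" it marks positions strictly above the diagonal, and otherwise it writes the 0/1 causal pattern (j <= i). Schematically, for seq_len = 3 the "float" variant is:

    0      -inf   -inf
    0      0      -inf
    0      0      0

where -inf stands for std::numeric_limits<float>::lowest().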
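
With the per-model subclasses moved behind #if 0 and the history_ / response_nohistory() pair removed, the whole flow is driven by the config and the prompt template. A minimal caller sketch, assuming the patched include/llm.hpp above (the command-line handling is illustrative):

    #include <iostream>
    #include <memory>
    #include "llm.hpp"

    int main(int argc, const char* argv[]) {
        if (argc < 2) {
            std::cout << "Usage: " << argv[0] << " <model_dir>" << std::endl;
            return 0;
        }
        // createLLM() now only builds the shared LlmConfig; tokenizer and model
        // selection happen later, inside load(), from that config.
        std::unique_ptr<Llm> llm(Llm::createLLM(argv[1]));
        llm->load();
        // response() wraps the raw query with config_->prompt_template() via
        // apply_chat_template(): with an illustrative template of
        // "<|im_start|>user\n%s<|im_end|>\n<|im_start|>assistant\n", the query
        // "Hello" is substituted into the %s placeholder, and the result is
        // encoded with the tokenizer loaded from tokenizer_file(). Each call
        // starts from a fresh context, since the old history_ state is gone.
        llm->response("Hello");
        return 0;
    }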