Sync with upstream source code, add demo for DeepSeek-R1 (#150)
* sync with upstream source code

* demo: add qwen and deepseek models

* fix truncated chat template
ngxson authored Jan 22, 2025
1 parent 30adc2a commit e05af9e
Showing 14 changed files with 131 additions and 49 deletions.
115 changes: 83 additions & 32 deletions actions.hpp
@@ -140,7 +140,7 @@ json dump_metadata(app_t &app)
continue;
if (res > buf.size())
{
buf.resize(res);
buf.resize(res + 1);
res = llama_model_meta_val_str_by_index(app.model, i, buf.data(), buf.size());
}
val = std::string(buf.data(), res);
@@ -149,7 +149,7 @@ json dump_metadata(app_t &app)
continue;
if (res > buf.size())
{
buf.resize(res);
buf.resize(res + 1);
res = llama_model_meta_key_by_index(app.model, i, buf.data(), buf.size());
}
key = std::string(buf.data(), res);
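The resize to res + 1 is the actual fix here: llama_model_meta_key_by_index and llama_model_meta_val_str_by_index follow snprintf semantics, returning the string length excluding the terminating null, so a buffer of exactly res bytes would silently drop the final character.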
@@ -250,8 +250,10 @@ json action_load(app_t &app, json &body)
}
int n_vocab = llama_vocab_n_tokens(app.vocab);
llama_tokens list_tokens_eog;
for (int i = 0; i < n_vocab; i++) {
if (llama_vocab_is_eog(app.vocab, i)) {
for (int i = 0; i < n_vocab; i++)
{
if (llama_vocab_is_eog(app.vocab, i))
{
list_tokens_eog.push_back(i);
}
}
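Collecting every token for which llama_vocab_is_eog() returns true (end-of-sequence, end-of-turn, and similar markers) lets generation stop on any of them instead of on a single hard-coded EOS token.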
@@ -595,34 +597,6 @@ json action_embeddings(app_t &app, json &body)
};
}

// apply chat template
json action_chat_format(app_t &app, json &body)
{
std::string tmpl = body.contains("tmpl") ? body["tmpl"] : "";
bool add_ass = body.contains("add_ass") ? body.at("add_ass").get<bool>() : false;
if (!body.contains("messages"))
{
return json{{"error", "messages is required"}};
}
std::vector<common_chat_msg> chat;
for (auto &item : body["messages"])
{
chat.push_back({item["role"], item["content"]});
}
try
{
std::string formatted_chat = common_chat_apply_template(app.model, tmpl, chat, add_ass);
return json{
{"success", true},
{"formatted_chat", formatted_chat},
};
}
catch (const std::exception &e)
{
return json{{"error", e.what()}};
}
}

// remove tokens in kv, for context-shifting
json action_kv_remove(app_t &app, json &body)
{
@@ -709,3 +683,80 @@ json action_current_status(app_t &app, json &body)
{"tokens", app.tokens},
};
}

//////////////////////////////////////////

// because we can't support jinja for now, we temporarily use an old version of common_chat_apply_template
// TODO: support jinja
std::string common_chat_apply_template_old(const struct llama_model *model,
const std::string &tmpl,
const std::vector<common_chat_msg> &msgs,
bool add_ass)
{
int alloc_size = 0;
    bool fallback = false; // indicates whether we must fall back to the default chatml template
std::vector<llama_chat_message> chat;
for (const auto &msg : msgs)
{
chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
}

const char *ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model, nullptr) : tmpl.c_str();
std::vector<char> buf(alloc_size);

// run the first time to get the total output length
int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

// error: chat template is not supported
if (res < 0)
{
if (ptr_tmpl != nullptr)
{
throw std::runtime_error("this custom template is not supported");
}
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}

// if it turns out that our buffer is too small, we resize it
if ((size_t)res > buf.size())
{
buf.resize(res);
res = llama_chat_apply_template(
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}

std::string formatted_chat(buf.data(), res);
return formatted_chat;
}

// apply chat template
json action_chat_format(app_t &app, json &body)
{
std::string tmpl = body.contains("tmpl") ? body["tmpl"] : "";
bool add_ass = body.contains("add_ass") ? body.at("add_ass").get<bool>() : false;
if (!body.contains("messages"))
{
return json{{"error", "messages is required"}};
}
std::vector<common_chat_msg> chat;
for (auto &item : body["messages"])
{
chat.push_back({item["role"], item["content"]});
}
try
{
std::string formatted_chat = common_chat_apply_template_old(app.model, tmpl, chat, add_ass);
return json{
{"success", true},
{"formatted_chat", formatted_chat},
};
}
catch (const std::exception &e)
{
return json{{"error", e.what()}};
}
}
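For reference, the JSON contract implied by action_chat_format above looks roughly like this (a sketch only: the field names and response shapes are taken from the handler, but the transport around it is wllama-internal):

// Request body read by action_chat_format; tmpl and add_ass are optional.
const body = {
  messages: [
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: 'Hello!' },
  ],
  tmpl: '',       // empty string falls back to the model's built-in template
  add_ass: true,  // append the assistant prefix so the model answers next
};
// Success: { success: true, formatted_chat: '<rendered prompt>' }
// Failure: { error: '<exception message>' }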
14 changes: 10 additions & 4 deletions examples/main/src/components/ChatScreen.tsx
@@ -67,10 +67,16 @@ export default function ChatScreen() {
if (!loadedModel) {
throw new Error('loadedModel is null');
}
const formattedChat = await formatChat(getWllamaInstance(), [
...currHistory,
userMsg,
]);
let formattedChat: string;
try {
formattedChat = await formatChat(getWllamaInstance(), [
...currHistory,
userMsg,
]);
} catch (e) {
alert(`Error while formatting chat: ${(e as any)?.message ?? 'unknown'}`);
throw e;
}
console.log({ formattedChat });
await createCompletion(formattedChat, (newContent) => {
editMessageInConversation(convId, assistantMsg.id, newContent);
8 changes: 8 additions & 0 deletions examples/main/src/config.ts
@@ -18,10 +18,18 @@ export const LIST_MODELS = [
url: 'https://huggingface.co/ngxson/SmolLM2-360M-Instruct-Q8_0-GGUF/resolve/main/smollm2-360m-instruct-q8_0.gguf',
size: 386404992,
},
{
url: 'https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf',
size: 675710816,
},
{
url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf',
size: 807690656,
},
{
url: 'https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf',
size: 924456032,
},
{
url: 'https://huggingface.co/ngxson/wllama-split-models/resolve/main/qwen2-1_5b-instruct-q4_k_m-00001-of-00004.gguf',
size: 986046272,
2 changes: 1 addition & 1 deletion examples/main/src/utils/types.ts
@@ -29,7 +29,7 @@ export interface InferenceParams {
export interface Message {
id: number;
content: string;
role: 'user' | 'assistant';
role: 'system' | 'user' | 'assistant';
}

export interface Conversation {
23 changes: 20 additions & 3 deletions examples/main/src/utils/utils.ts
@@ -38,9 +38,26 @@ export const formatChat = async (
modelWllama: Wllama,
messages: Message[]
): Promise<string> => {
const template = new Template(
modelWllama.getChatTemplate() ?? DEFAULT_CHAT_TEMPLATE
);
const templateStr = modelWllama.getChatTemplate() ?? DEFAULT_CHAT_TEMPLATE;
// dirty patch for DeepSeek model (crash on @huggingface/jinja)
const isDeepSeekR1 =
templateStr.match(/<Assistant>/) &&
templateStr.match(/<User>/) &&
templateStr.match(/<\/think>/);
if (isDeepSeekR1) {
let result = '';
for (const message of messages) {
if (message.role === 'system') {
result += `${message.content}\n\n`;
} else if (message.role === 'user') {
result += `<|User|>${message.content}`;
} else {
result += `<|Assistant|>${message.content.split('</think>').pop()}<|end▁of▁sentence|>`;
}
}
return result + '<|Assistant|>';
}
const template = new Template(templateStr);
const bos_token: string = textDecoder.decode(
await modelWllama.detokenize([modelWllama.getBOS()])
);
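As an illustration of the DeepSeek-R1 branch above, here is roughly what formatChat produces for a short conversation (the message contents are invented; the expected string follows directly from the branch logic):

// Hypothetical input; fields match the Message interface from utils/types.ts.
const messages = [
  { id: 1, role: 'system', content: 'Be brief.' },
  { id: 2, role: 'user', content: 'Hi' },
  { id: 3, role: 'assistant', content: '<think>greet back</think>Hello!' },
  { id: 4, role: 'user', content: 'How are you?' },
];
// Assistant turns keep only the text after </think>, and a trailing
// <|Assistant|> prompts the next completion. Expected result:
// 'Be brief.\n\n<|User|>Hi<|Assistant|>Hello!<|end▁of▁sentence|><|User|>How are you?<|Assistant|>'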
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 126 files
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "@wllama/wllama",
"version": "2.1.2",
"version": "2.1.3",
"description": "WebAssembly binding for llama.cpp - Enabling on-browser LLM inference",
"main": "index.js",
"type": "module",
Expand Down
2 changes: 1 addition & 1 deletion scripts/docker-compose.yml
@@ -19,7 +19,7 @@ services:
mkdir -p wasm/single-thread
cd wasm/single-thread
export SHARED_EMCC_CFLAGS="--no-entry -O3 -msimd128 -fno-rtti -DNDEBUG -flto=full -fwasm-exceptions -sEXPORT_ALL=1 -sEXPORT_ES6=0 -sMODULARIZE=0 -sINITIAL_MEMORY=128MB -sMAXIMUM_MEMORY=4096MB -sALLOW_MEMORY_GROWTH=1 -sFORCE_FILESYSTEM=1 -sEXPORTED_FUNCTIONS=_main,_wllama_start,_wllama_action,_wllama_exit,_wllama_debug -sEXPORTED_RUNTIME_METHODS=ccall,cwrap -sNO_EXIT_RUNTIME=1"
export SHARED_EMCC_CFLAGS="--no-entry -O3 -msimd128 -fno-rtti -DNDEBUG -flto=full -frtti -fwasm-exceptions -sEXPORT_ALL=1 -sEXPORT_ES6=0 -sMODULARIZE=0 -sINITIAL_MEMORY=128MB -sMAXIMUM_MEMORY=4096MB -sALLOW_MEMORY_GROWTH=1 -sFORCE_FILESYSTEM=1 -sEXPORTED_FUNCTIONS=_main,_wllama_start,_wllama_action,_wllama_exit,_wllama_debug -sEXPORTED_RUNTIME_METHODS=ccall,cwrap -sNO_EXIT_RUNTIME=1"
# emcc --clear-cache
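Note the appended -frtti in the updated flag line: the earlier -fno-rtti is still present, and since the last flag wins in clang, the net effect is to re-enable RTTI for the WASM build.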
2 changes: 1 addition & 1 deletion src/multi-thread/wllama.js

Large diffs are not rendered by default.

Binary file modified src/multi-thread/wllama.wasm
2 changes: 1 addition & 1 deletion src/single-thread/wllama.js

Large diffs are not rendered by default.

Binary file modified src/single-thread/wllama.wasm
4 changes: 2 additions & 2 deletions src/wasm-from-cdn.ts
Original file line number Diff line number Diff line change
@@ -2,8 +2,8 @@
// Do not edit this file directly

const WasmFromCDN = {
'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.2/src/single-thread/wllama.wasm',
'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.2/src/multi-thread/wllama.wasm',
'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.3/src/single-thread/wllama.wasm',
'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.3/src/multi-thread/wllama.wasm',
};

export default WasmFromCDN;
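WasmFromCDN is meant to be passed to the Wllama constructor so the .wasm binaries are fetched from jsDelivr instead of being bundled locally. A minimal sketch of that usage (the exact import path may vary with your bundler setup):

import { Wllama } from '@wllama/wllama';
import WasmFromCDN from '@wllama/wllama/esm/wasm-from-cdn.js';

// The constructor picks single-thread or multi-thread wllama.wasm
// from the map above and loads it from the CDN at runtime.
const wllama = new Wllama(WasmFromCDN);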
4 changes: 2 additions & 2 deletions src/workers-code/generated.ts

Large diffs are not rendered by default.
