Sync with upstream source code, add demo for DeepSeek-R1 (#150)
* sync with upstream source code

* demo: add qwen and deepseek models

* fix truncated chat template
ngxson authored Jan 22, 2025
1 parent 30adc2a commit e05af9e
Showing 14 changed files with 131 additions and 49 deletions.
115 changes: 83 additions & 32 deletions actions.hpp
@@ -140,7 +140,7 @@ json dump_metadata(app_t &app)
continue;
if (res > buf.size())
{
buf.resize(res);
buf.resize(res + 1);
res = llama_model_meta_val_str_by_index(app.model, i, buf.data(), buf.size());
}
val = std::string(buf.data(), res);
@@ -149,7 +149,7 @@ json dump_metadata(app_t &app)
continue;
if (res > buf.size())
{
buf.resize(res);
buf.resize(res + 1);
res = llama_model_meta_key_by_index(app.model, i, buf.data(), buf.size());
}
key = std::string(buf.data(), res);
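The resize to res + 1 is the actual fix here: llama_model_meta_key_by_index and llama_model_meta_val_str_by_index follow snprintf semantics, returning the string length excluding the terminating null, so a buffer of exactly res bytes would silently drop the final character.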
@@ -250,8 +250,10 @@ json action_load(app_t &app, json &body)
}
int n_vocab = llama_vocab_n_tokens(app.vocab);
llama_tokens list_tokens_eog;
for (int i = 0; i < n_vocab; i++) {
if (llama_vocab_is_eog(app.vocab, i)) {
for (int i = 0; i < n_vocab; i++)
{
if (llama_vocab_is_eog(app.vocab, i))
{
list_tokens_eog.push_back(i);
}
}
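Collecting every token for which llama_vocab_is_eog() returns true (end-of-sequence, end-of-turn, and similar markers) lets generation stop on any of them instead of on a single hard-coded EOS token.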
@@ -595,34 +597,6 @@ json action_embeddings(app_t &app, json &body)
};
}

// apply chat template
json action_chat_format(app_t &app, json &body)
{
std::string tmpl = body.contains("tmpl") ? body["tmpl"] : "";
bool add_ass = body.contains("add_ass") ? body.at("add_ass").get<bool>() : false;
if (!body.contains("messages"))
{
return json{{"error", "messages is required"}};
}
std::vector<common_chat_msg> chat;
for (auto &item : body["messages"])
{
chat.push_back({item["role"], item["content"]});
}
try
{
std::string formatted_chat = common_chat_apply_template(app.model, tmpl, chat, add_ass);
return json{
{"success", true},
{"formatted_chat", formatted_chat},
};
}
catch (const std::exception &e)
{
return json{{"error", e.what()}};
}
}

// remove tokens in kv, for context-shifting
json action_kv_remove(app_t &app, json &body)
{
@@ -709,3 +683,80 @@ json action_current_status(app_t &app, json &body)
{"tokens", app.tokens},
};
}

//////////////////////////////////////////

// because we can't support jinja for now, we temporarily use an old version of common_chat_apply_template
// TODO: support jinja
std::string common_chat_apply_template_old(const struct llama_model *model,
const std::string &tmpl,
const std::vector<common_chat_msg> &msgs,
bool add_ass)
{
int alloc_size = 0;
    bool fallback = false; // indicates whether we must fall back to the default chatml template
std::vector<llama_chat_message> chat;
for (const auto &msg : msgs)
{
chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
}

const char *ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model, nullptr) : tmpl.c_str();
std::vector<char> buf(alloc_size);

// run the first time to get the total output length
int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

// error: chat template is not supported
if (res < 0)
{
if (ptr_tmpl != nullptr)
{
throw std::runtime_error("this custom template is not supported");
}
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}

// if it turns out that our buffer is too small, we resize it
if ((size_t)res > buf.size())
{
buf.resize(res);
res = llama_chat_apply_template(
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}

std::string formatted_chat(buf.data(), res);
return formatted_chat;
}

// apply chat template
json action_chat_format(app_t &app, json &body)
{
std::string tmpl = body.contains("tmpl") ? body["tmpl"] : "";
bool add_ass = body.contains("add_ass") ? body.at("add_ass").get<bool>() : false;
if (!body.contains("messages"))
{
return json{{"error", "messages is required"}};
}
std::vector<common_chat_msg> chat;
for (auto &item : body["messages"])
{
chat.push_back({item["role"], item["content"]});
}
try
{
std::string formatted_chat = common_chat_apply_template_old(app.model, tmpl, chat, add_ass);
return json{
{"success", true},
{"formatted_chat", formatted_chat},
};
}
catch (const std::exception &e)
{
return json{{"error", e.what()}};
}
}
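For reference, the JSON contract implied by action_chat_format above looks roughly like this (a sketch only: the field names and response shapes are taken from the handler, but the transport around it is wllama-internal):

// Request body read by action_chat_format; tmpl and add_ass are optional.
const body = {
  messages: [
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: 'Hello!' },
  ],
  tmpl: '',       // empty string falls back to the model's built-in template
  add_ass: true,  // append the assistant prefix so the model answers next
};
// Success: { success: true, formatted_chat: '<rendered prompt>' }
// Failure: { error: '<exception message>' }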
14 changes: 10 additions & 4 deletions examples/main/src/components/ChatScreen.tsx
@@ -67,10 +67,16 @@ export default function ChatScreen() {
if (!loadedModel) {
throw new Error('loadedModel is null');
}
const formattedChat = await formatChat(getWllamaInstance(), [
...currHistory,
userMsg,
]);
let formattedChat: string;
try {
formattedChat = await formatChat(getWllamaInstance(), [
...currHistory,
userMsg,
]);
} catch (e) {
alert(`Error while formatting chat: ${(e as any)?.message ?? 'unknown'}`);
throw e;
}
console.log({ formattedChat });
await createCompletion(formattedChat, (newContent) => {
editMessageInConversation(convId, assistantMsg.id, newContent);
8 changes: 8 additions & 0 deletions examples/main/src/config.ts
@@ -18,10 +18,18 @@ export const LIST_MODELS = [
url: 'https://huggingface.co/ngxson/SmolLM2-360M-Instruct-Q8_0-GGUF/resolve/main/smollm2-360m-instruct-q8_0.gguf',
size: 386404992,
},
{
url: 'https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf',
size: 675710816,
},
{
url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf',
size: 807690656,
},
{
url: 'https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf',
size: 924456032,
},
{
url: 'https://huggingface.co/ngxson/wllama-split-models/resolve/main/qwen2-1_5b-instruct-q4_k_m-00001-of-00004.gguf',
size: 986046272,
2 changes: 1 addition & 1 deletion examples/main/src/utils/types.ts
@@ -29,7 +29,7 @@ export interface InferenceParams {
export interface Message {
id: number;
content: string;
role: 'user' | 'assistant';
role: 'system' | 'user' | 'assistant';
}

export interface Conversation {
23 changes: 20 additions & 3 deletions examples/main/src/utils/utils.ts
@@ -38,9 +38,26 @@ export const formatChat = async (
modelWllama: Wllama,
messages: Message[]
): Promise<string> => {
const template = new Template(
modelWllama.getChatTemplate() ?? DEFAULT_CHAT_TEMPLATE
);
const templateStr = modelWllama.getChatTemplate() ?? DEFAULT_CHAT_TEMPLATE;
// dirty patch for DeepSeek model (crash on @huggingface/jinja)
const isDeepSeekR1 =
templateStr.match(/<Assistant>/) &&
templateStr.match(/<User>/) &&
templateStr.match(/<\/think>/);
if (isDeepSeekR1) {
let result = '';
for (const message of messages) {
if (message.role === 'system') {
result += `${message.content}\n\n`;
} else if (message.role === 'user') {
result += `<|User|>${message.content}`;
} else {
result += `<|Assistant|>${message.content.split('</think>').pop()}<|end▁of▁sentence|>`;
}
}
return result + '<|Assistant|>';
}
const template = new Template(templateStr);
const bos_token: string = textDecoder.decode(
await modelWllama.detokenize([modelWllama.getBOS()])
);
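As an illustration of the DeepSeek-R1 branch above, here is roughly what formatChat produces for a short conversation (the message contents are invented; the expected string follows directly from the branch logic):

// Hypothetical input; fields match the Message interface from utils/types.ts.
const messages = [
  { id: 1, role: 'system', content: 'Be brief.' },
  { id: 2, role: 'user', content: 'Hi' },
  { id: 3, role: 'assistant', content: '<think>greet back</think>Hello!' },
  { id: 4, role: 'user', content: 'How are you?' },
];
// Assistant turns keep only the text after </think>, and a trailing
// <|Assistant|> prompts the next completion. Expected result:
// 'Be brief.\n\n<|User|>Hi<|Assistant|>Hello!<|end▁of▁sentence|><|User|>How are you?<|Assistant|>'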
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 126 files
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "@wllama/wllama",
"version": "2.1.2",
"version": "2.1.3",
"description": "WebAssembly binding for llama.cpp - Enabling on-browser LLM inference",
"main": "index.js",
"type": "module",
Expand Down
2 changes: 1 addition & 1 deletion scripts/docker-compose.yml
@@ -19,7 +19,7 @@ services:
mkdir -p wasm/single-thread
cd wasm/single-thread
export SHARED_EMCC_CFLAGS="--no-entry -O3 -msimd128 -fno-rtti -DNDEBUG -flto=full -fwasm-exceptions -sEXPORT_ALL=1 -sEXPORT_ES6=0 -sMODULARIZE=0 -sINITIAL_MEMORY=128MB -sMAXIMUM_MEMORY=4096MB -sALLOW_MEMORY_GROWTH=1 -sFORCE_FILESYSTEM=1 -sEXPORTED_FUNCTIONS=_main,_wllama_start,_wllama_action,_wllama_exit,_wllama_debug -sEXPORTED_RUNTIME_METHODS=ccall,cwrap -sNO_EXIT_RUNTIME=1"
export SHARED_EMCC_CFLAGS="--no-entry -O3 -msimd128 -fno-rtti -DNDEBUG -flto=full -frtti -fwasm-exceptions -sEXPORT_ALL=1 -sEXPORT_ES6=0 -sMODULARIZE=0 -sINITIAL_MEMORY=128MB -sMAXIMUM_MEMORY=4096MB -sALLOW_MEMORY_GROWTH=1 -sFORCE_FILESYSTEM=1 -sEXPORTED_FUNCTIONS=_main,_wllama_start,_wllama_action,_wllama_exit,_wllama_debug -sEXPORTED_RUNTIME_METHODS=ccall,cwrap -sNO_EXIT_RUNTIME=1"
# emcc --clear-cache
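Note the appended -frtti in the updated flag line: the earlier -fno-rtti is still present, and since the last flag wins in clang, the net effect is to re-enable RTTI for the WASM build.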
2 changes: 1 addition & 1 deletion src/multi-thread/wllama.js

Large diffs are not rendered by default.

Binary file modified src/multi-thread/wllama.wasm
2 changes: 1 addition & 1 deletion src/single-thread/wllama.js

Large diffs are not rendered by default.

Binary file modified src/single-thread/wllama.wasm
4 changes: 2 additions & 2 deletions src/wasm-from-cdn.ts
Original file line number Diff line number Diff line change
@@ -2,8 +2,8 @@
// Do not edit this file directly

const WasmFromCDN = {
'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.2/src/single-thread/wllama.wasm',
'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.2/src/multi-thread/wllama.wasm',
'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.3/src/single-thread/wllama.wasm',
'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.3/src/multi-thread/wllama.wasm',
};

export default WasmFromCDN;
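WasmFromCDN is meant to be passed to the Wllama constructor so the .wasm binaries are fetched from jsDelivr instead of being bundled locally. A minimal sketch of that usage (the exact import path may vary with your bundler setup):

import { Wllama } from '@wllama/wllama';
import WasmFromCDN from '@wllama/wllama/esm/wasm-from-cdn.js';

// The constructor picks single-thread or multi-thread wllama.wasm
// from the map above and loads it from the CDN at runtime.
const wllama = new Wllama(WasmFromCDN);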
4 changes: 2 additions & 2 deletions src/workers-code/generated.ts

Large diffs are not rendered by default.
