From e1ac2c68e0846bcf5fc96a23dfc2fb03e67bafe8 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sun, 7 Apr 2024 04:23:32 +0200
Subject: [PATCH] model: dbrx: convert fix tokenizer

---
 convert-hf-to-gguf.py | 49 +++++++++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 6bbe8a0a618f89..744c163a51dca7 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1448,35 +1448,48 @@ def set_gguf_parameters(self):
 
     def _set_vocab_gpt2(self):
         dir_model = self.dir_model
         tokens: list[str] = []
+        scores: list[float] = []
         toktypes: list[int] = []
 
+        # REVIEW: Not tested yet, need to deep dive this tiktoken
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = tokenizer.vocab_size
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
+        for token_id in range(tokenizer.vocab_size):
+            piece = tokenizer.id_to_piece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.get_score(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
         added_vocab = tokenizer.get_added_vocab()
+        for key in added_vocab:
+            key = key.encode("utf-8")
+            if key not in tokens:
+                tokens.append(key)
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 
-        # REVIEW: Not tested yet, need to deep dive this tiktoken
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
+        assert len(tokens) == tokenizer.vocab_size
 
-        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
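
Reviewer note (not part of the patch above): the new per-token loop calls id_to_piece, get_score, is_unknown, is_control, is_unused and is_byte on the object returned by AutoTokenizer.from_pretrained(...). Those methods belong to the SentencePiece API (sentencepiece.SentencePieceProcessor); a Hugging Face tokenizer does not normally expose them, and the patch's own "# REVIEW: Not tested yet, need to deep dive this tiktoken" comment flags that the model's tokenizer is tiktoken-based. The following is a minimal, self-contained sketch of the classification loop the patch is aiming for, run against a standalone SentencePiece model file. The "tokenizer.model" path and the local SentencePieceTokenTypes enum (a stand-in for the one defined in convert-hf-to-gguf.py) are assumptions for illustration, not a drop-in fix for DBRX.

    # Sketch only: SentencePiece-style token classification as in the patch.
    # Assumes a local "tokenizer.model" file (hypothetical path); DBRX itself
    # ships a tiktoken tokenizer, so this is illustrative rather than a fix.
    from enum import IntEnum
    from sentencepiece import SentencePieceProcessor

    class SentencePieceTokenTypes(IntEnum):
        # Stand-in for the enum used by convert-hf-to-gguf.py.
        NORMAL = 1
        UNKNOWN = 2
        CONTROL = 3
        USER_DEFINED = 4
        UNUSED = 5
        BYTE = 6

    sp = SentencePieceProcessor()
    sp.Load("tokenizer.model")  # hypothetical path to a SentencePiece model

    tokens: list[bytes] = []
    scores: list[float] = []
    toktypes: list[int] = []

    for token_id in range(sp.vocab_size()):
        piece = sp.id_to_piece(token_id)   # textual piece for this id
        score = sp.get_score(token_id)     # log-probability score from the model

        # Same priority order as the patch: unknown > control > unused > byte.
        toktype = SentencePieceTokenTypes.NORMAL
        if sp.is_unknown(token_id):
            toktype = SentencePieceTokenTypes.UNKNOWN
        elif sp.is_control(token_id):
            toktype = SentencePieceTokenTypes.CONTROL
        elif sp.is_unused(token_id):
            toktype = SentencePieceTokenTypes.UNUSED
        elif sp.is_byte(token_id):
            toktype = SentencePieceTokenTypes.BYTE

        tokens.append(piece.encode("utf-8"))
        scores.append(score)
        toktypes.append(toktype)

    assert len(tokens) == sp.vocab_size()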