
Commit

model: dbrx: convert fix tokenizer
phymbert committed Apr 7, 2024
1 parent 06a59ab commit e1ac2c6
Showing 1 changed file with 31 additions and 18 deletions.
49 changes: 31 additions & 18 deletions convert-hf-to-gguf.py
@@ -1448,35 +1448,48 @@ def set_gguf_parameters(self):
     def _set_vocab_gpt2(self):
         dir_model = self.dir_model
         tokens: list[str] = []
+        scores: list[float] = []
         toktypes: list[int] = []
 
+        # REVIEW: Not tested yet, need to deep dive this tiktoken
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
         vocab_size = tokenizer.vocab_size
 
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
+        for token_id in range(tokenizer.vocab_size):
+            piece = tokenizer.id_to_piece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.get_score(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        added_vocab = tokenizer.get_added_vocab()
+        for key in added_vocab:
+            key = key.encode("utf-8")
+            if key not in tokens:
+                tokens.append(key)
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 
-        # REVIEW: Not tested yet, need to deep dive this tiktoken
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-        assert len(tokens) == tokenizer.vocab_size
 
-        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)


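Aside (not part of the commit): the reverse_vocab loop in this diff is the BPE/GPT-2-style vocab walk used by the generic _set_vocab_gpt2 path in convert-hf-to-gguf.py, whereas the id_to_piece/get_score/is_unknown calls on the other side mirror the sentencepiece.SentencePieceProcessor API rather than the usual Hugging Face tokenizer surface, which fits the REVIEW note that that path is untested. Below is a minimal standalone sketch of the BPE-style walk only, using the public "gpt2" checkpoint as a stand-in model (an assumption; DBRX's own tokenizer requires trust_remote_code) and plain strings in place of gguf.TokenType constants:

from transformers import AutoTokenizer

# Stand-in model; the real converter passes the local model directory instead.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
vocab_size = tokenizer.vocab_size

# Map token id -> token string so holes in the id space can be padded explicitly.
reverse_vocab = {id_: tok for tok, id_ in tokenizer.get_vocab().items()}
added_vocab = tokenizer.get_added_vocab()

tokens: list[str] = []
toktypes: list[str] = []
for i in range(vocab_size):
    if i not in reverse_vocab:
        # Unused id: emit a placeholder so token ids stay contiguous.
        tokens.append(f"[PAD{i}]")
        toktypes.append("USER_DEFINED")
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])
        # added_tokens_decoder maps id -> AddedToken; .special marks control tokens.
        is_special = tokenizer.added_tokens_decoder[i].special
        toktypes.append("CONTROL" if is_special else "USER_DEFINED")
    else:
        tokens.append(reverse_vocab[i])
        toktypes.append("NORMAL")

assert len(tokens) == vocab_size
print(len(tokens), tokens[:3], toktypes[:3])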

