diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 370fda087..9a1fd7272 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -566,6 +566,9 @@ def test_re_assign_tokens(self):
         tokenizer = Tokenizer.from_pretrained("t5-base")
         tokenizer.assign_tokens({"<extra_id_0>": "my_new_token"})
         assert tokenizer.decode([32099]) == "my_new_token"
+        assert tokenizer.encode("my_new_token").tokens == ["my_new_token", "</s>"]
+        assert tokenizer.encode("my_new_token").ids == [32099, 1]
+        assert tokenizer.encode("<pad>").ids == [0, 1]
         assert tokenizer.encode("<extra_id_0>").tokens == ["▁", "<", "extra", "_", "i", "d", "_", "0", ">", "</s>"]
         assert "my_new_token" in tokenizer.get_vocab(True).keys()
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 984201075..e22249048 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -321,6 +321,10 @@ impl AddedVocabulary {
                     .unwrap()
                     .entry(id)
                     .and_modify(|t| *t = new_token.clone()); // Replace entire entry with new_token
+                self.added_tokens_map
+                    .lock()
+                    .unwrap()
+                    .remove(old_token.content.as_str());
                 self.refresh_added_tokens(model, normalizer);
             } else {
                 error!(
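
For orientation, here is a minimal usage sketch of the behavior the new assertions pin down (a sketch only, assuming the `tokenizers` Python bindings with the `assign_tokens` API from this PR, and network access to fetch `t5-base`): re-assigning `<extra_id_0>` to `my_new_token` keeps the original id, while the old content stops matching as an added token.

```python
from tokenizers import Tokenizer

# Sketch of the behavior exercised by test_re_assign_tokens above.
tokenizer = Tokenizer.from_pretrained("t5-base")

# Re-assign the content of an existing added token to a new string.
tokenizer.assign_tokens({"<extra_id_0>": "my_new_token"})

# The original id (32099) now decodes to the new content...
print(tokenizer.decode([32099]))             # my_new_token

# ...and the new content encodes back to that id, plus the </s> EOS (id 1).
print(tokenizer.encode("my_new_token").ids)  # [32099, 1]

# The old content is no longer matched as an added token, so it falls
# through to the model and is split into ordinary subword pieces.
print(tokenizer.encode("<extra_id_0>").tokens)
```

The Rust change is what makes the last point hold: dropping the old content from `added_tokens_map` means the stale entry can no longer be matched during added-token extraction, while the id-keyed entry is updated in place via the `.entry(id).and_modify(...)` call shown in the surrounding context.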