From ed34ffd3342dd1c6b1226948297529dc2d6d2a8c Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Fri, 4 Oct 2024 15:00:35 +0200
Subject: [PATCH] add a small test

---
 bindings/python/py_src/tokenizers/__init__.pyi   | 2 +-
 bindings/python/tests/bindings/test_tokenizer.py | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi
index a480923ef..bc2cb0cab 100644
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -725,7 +725,7 @@ class Tokenizer:
         """
         pass
 
-    def assing_tokens(self, old_tokens, new_tokens):
+    def assign_tokens(self, old_tokens, new_tokens):
         """
         Add the given tokens to the vocabulary
 
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 2118709a0..370fda087 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -562,6 +562,13 @@ def test_setting_to_none(self):
         tokenizer.pre_tokenizer = None
         assert tokenizer.pre_tokenizer == None
 
+    def test_re_assign_tokens(self):
+        tokenizer = Tokenizer.from_pretrained("t5-base")
+        tokenizer.assign_tokens({"<extra_id_0>": "my_new_token"})
+        assert tokenizer.decode([32099]) == "my_new_token"
+        assert tokenizer.encode("<extra_id_0>").tokens == ["▁", "<", "extra", "_", "i", "d", "_", "0", ">", "</s>"]
+        assert "my_new_token" in tokenizer.get_vocab(True).keys()
+
 
 class TestTokenizerRepr:
     def test_repr(self):
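
For context, a minimal usage sketch of the renamed assign_tokens API as exercised by the new test. The dict form (mapping an existing token to its replacement) is taken from the test above; the old_tokens/new_tokens parameter names come from the .pyi stub. This is a sketch of the behavior the test asserts, not a definitive description of the API surface.

    from tokenizers import Tokenizer

    # t5-base is the checkpoint used in the test; <extra_id_0> is one of
    # T5's sentinel tokens, whose id in this vocabulary is 32099.
    tokenizer = Tokenizer.from_pretrained("t5-base")

    # Re-assign an existing token to a new surface form:
    # the dict maps old token -> new token, as in the test above.
    tokenizer.assign_tokens({"<extra_id_0>": "my_new_token"})

    # The old id now decodes to the new token...
    print(tokenizer.decode([32099]))  # "my_new_token"

    # ...and the old literal string is no longer matched as a single
    # special token, so the underlying model splits it into pieces.
    print(tokenizer.encode("<extra_id_0>").tokens)

    # The new token shows up in the vocabulary (added tokens included).
    print("my_new_token" in tokenizer.get_vocab(True))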