From 3ee82bdeb6076ec030967c1b6297ba5beb118eea Mon Sep 17 00:00:00 2001 From: Kelly Marchisio <3612904+kellymarchisio@users.noreply.github.com> Date: Fri, 28 Jul 2023 15:55:55 +0100 Subject: [PATCH] Handle when precompiled charsmap is empty --- .../implementations/sentencepiece_unigram.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py index 3d26a85e5..0ac9b534d 100644 --- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py +++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py @@ -173,12 +173,17 @@ def from_spm(filename: str): tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback)) - tokenizer.normalizer = normalizers.Sequence( - [ - normalizers.Precompiled(precompiled_charsmap), - normalizers.Replace(Regex(" {2,}"), " "), - ] - ) + if precompiled_charsmap: + tokenizer.normalizer = normalizers.Sequence( + [ + normalizers.Precompiled(precompiled_charsmap), + normalizers.Replace(Regex(" {2,}"), " "), + ] + ) + else: + tokenizer.normalizer = normalizers.Sequence( + [normalizers.Replace(Regex(" {2,}"), " ")] + ) tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)