From 122a933ce46f2cfa3186eb7d077178f38bca7809 Mon Sep 17 00:00:00 2001
From: Justin Zhao
Date: Tue, 17 Oct 2023 20:51:13 +0000
Subject: [PATCH] Simplify how we set pad token and pad token ID.

---
 ludwig/utils/tokenizers.py | 52 +++++++++++---------------------------
 1 file changed, 15 insertions(+), 37 deletions(-)

diff --git a/ludwig/utils/tokenizers.py b/ludwig/utils/tokenizers.py
index 5c585d4ecaf..e7bce136735 100644
--- a/ludwig/utils/tokenizers.py
+++ b/ludwig/utils/tokenizers.py
@@ -840,43 +840,21 @@ def get_unk_token(self) -> str:
         return self.tokenizer.unk_token
 
     def _set_pad_token(self) -> None:
-        """Sets the pad token and pad token ID for the tokenizer."""
-
-        from transformers import (
-            CodeLlamaTokenizer,
-            CodeLlamaTokenizerFast,
-            GPT2Tokenizer,
-            GPT2TokenizerFast,
-            LlamaTokenizer,
-            LlamaTokenizerFast,
-        )
-
-        # Tokenizers might have the pad token id attribute since they tend to use the same base class, but
-        # it can be set to None so we check for this explicitly.
-        if hasattr(self.tokenizer, "pad_token_id") and self.tokenizer.pad_token_id is not None:
-            return
-
-        # HACK(geoffrey): gpt2 has no pad token. Recommendation is to use eos token instead.
-        # https://github.com/huggingface/transformers/issues/2630#issuecomment-1290809338
-        # https://github.com/huggingface/transformers/issues/2648#issuecomment-616177044
-        if any(
-            isinstance(self.tokenizer, t)
-            for t in [
-                GPT2Tokenizer,
-                GPT2TokenizerFast,
-                LlamaTokenizer,
-                LlamaTokenizerFast,
-                CodeLlamaTokenizer,
-                CodeLlamaTokenizerFast,
-            ]
-        ):
-            if hasattr(self.tokenizer, "eos_token") and self.tokenizer.eos_token is not None:
-                logger.warning("No padding token id found. Using eos_token as pad_token.")
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-                self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-
-        # Incase any HF tokenizer does not have pad token ID, just default to using 0
-        # as the pad_token_id.
+        """Sets the pad token and pad token ID for the tokenizer.
+        If there is no pad token, then set one by default.
+        If there is no pad token index, then set it to 0.
+        Notes:
+        - (geoffrey): gpt2 has no pad token. Recommendation is to use eos token instead.
+            - https://github.com/huggingface/transformers/issues/2630#issuecomment-1290809338
+            - https://github.com/huggingface/transformers/issues/2648#issuecomment-616177044
+        - (Justin): Using the EOS token in place of the pad token causes an issue with HF model.generate() when
+            there are multiple examples in the batch.
+            - https://github.com/facebookresearch/llama/issues/380#issuecomment-1716832417
+            - Recommendation is to set a separate '[PAD]' or '<pad>' token.
+        """
+        if self.tokenizer.pad_token is None:
+            logger.warning("No padding token found. Using '[PAD]' as the pad token.")
+            self.tokenizer.pad_token = "[PAD]"
         if self.tokenizer.pad_token_id is None:
             logger.warning("No padding token id found. Using 0 as pad token id.")
             self.tokenizer.pad_token_id = 0
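
For reference, a minimal standalone sketch of the new fallback behavior, mirroring the patched _set_pad_token logic outside of Ludwig. It assumes a Hugging Face AutoTokenizer; the gpt2 checkpoint is only an illustrative pick of a tokenizer that ships without a pad token, so both fallback branches fire.

import logging

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)

# gpt2 defines no pad token, so both fallbacks below trigger.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Prefer a dedicated '[PAD]' token over reusing eos_token, which breaks
# batched model.generate():
# https://github.com/facebookresearch/llama/issues/380#issuecomment-1716832417
if tokenizer.pad_token is None:
    logger.warning("No padding token found. Using '[PAD]' as the pad token.")
    tokenizer.pad_token = "[PAD]"

# Fall back to id 0 only if the pad token still resolves to no id.
if tokenizer.pad_token_id is None:
    logger.warning("No padding token id found. Using 0 as pad token id.")
    tokenizer.pad_token_id = 0

print(tokenizer.pad_token, tokenizer.pad_token_id)

Note that both branches are no-ops for tokenizers that already define a pad token and pad token id (e.g. most Llama fine-tunes with a configured '<pad>'), so only the gpt2-style tokenizers that previously hit the isinstance special case are affected.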