From 122a933ce46f2cfa3186eb7d077178f38bca7809 Mon Sep 17 00:00:00 2001
From: Justin Zhao
Date: Tue, 17 Oct 2023 20:51:13 +0000
Subject: [PATCH] Simplify how we set pad token and pad token ID.

---
 ludwig/utils/tokenizers.py | 52 +++++++++++---------------------------
 1 file changed, 15 insertions(+), 37 deletions(-)

diff --git a/ludwig/utils/tokenizers.py b/ludwig/utils/tokenizers.py
index 5c585d4ecaf..e7bce136735 100644
--- a/ludwig/utils/tokenizers.py
+++ b/ludwig/utils/tokenizers.py
@@ -840,43 +840,21 @@ def get_unk_token(self) -> str:
         return self.tokenizer.unk_token
 
     def _set_pad_token(self) -> None:
-        """Sets the pad token and pad token ID for the tokenizer."""
-
-        from transformers import (
-            CodeLlamaTokenizer,
-            CodeLlamaTokenizerFast,
-            GPT2Tokenizer,
-            GPT2TokenizerFast,
-            LlamaTokenizer,
-            LlamaTokenizerFast,
-        )
-
-        # Tokenizers might have the pad token id attribute since they tend to use the same base class, but
-        # it can be set to None so we check for this explicitly.
-        if hasattr(self.tokenizer, "pad_token_id") and self.tokenizer.pad_token_id is not None:
-            return
-
-        # HACK(geoffrey): gpt2 has no pad token. Recommendation is to use eos token instead.
-        # https://github.com/huggingface/transformers/issues/2630#issuecomment-1290809338
-        # https://github.com/huggingface/transformers/issues/2648#issuecomment-616177044
-        if any(
-            isinstance(self.tokenizer, t)
-            for t in [
-                GPT2Tokenizer,
-                GPT2TokenizerFast,
-                LlamaTokenizer,
-                LlamaTokenizerFast,
-                CodeLlamaTokenizer,
-                CodeLlamaTokenizerFast,
-            ]
-        ):
-            if hasattr(self.tokenizer, "eos_token") and self.tokenizer.eos_token is not None:
-                logger.warning("No padding token id found. Using eos_token as pad_token.")
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-                self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-
-        # Incase any HF tokenizer does not have pad token ID, just default to using 0
-        # as the pad_token_id.
+        """Sets the pad token and pad token ID for the tokenizer.
+        If there is no pad token, then set one by default.
+        If there is no pad token index, then set it to 0.
+        Notes:
+        - (geoffrey): gpt2 has no pad token. Recommendation is to use eos token instead.
+            - https://github.com/huggingface/transformers/issues/2630#issuecomment-1290809338
+            - https://github.com/huggingface/transformers/issues/2648#issuecomment-616177044
+        - (Justin): Using the EOS token in place of the pad token causes an issue with HF model.generate() when
+            there are multiple examples in the batch.
+            - https://github.com/facebookresearch/llama/issues/380#issuecomment-1716832417
+            - Recommendation is to set a separate '[PAD]' or '<pad>' token.
+        """
+        if self.tokenizer.pad_token is None:
+            logger.warning("No padding token found. Using '[PAD]' as the pad token.")
+            self.tokenizer.pad_token = "[PAD]"
         if self.tokenizer.pad_token_id is None:
             logger.warning("No padding token id found. Using 0 as pad token id.")
             self.tokenizer.pad_token_id = 0
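
For reference, a minimal standalone sketch of the new fallback behavior, mirroring the patched _set_pad_token logic outside of Ludwig. It assumes a Hugging Face AutoTokenizer; the gpt2 checkpoint is only an illustrative pick of a tokenizer that ships without a pad token, so both fallback branches fire.

import logging

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)

# gpt2 defines no pad token, so both fallbacks below trigger.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Prefer a dedicated '[PAD]' token over reusing eos_token, which breaks
# batched model.generate():
# https://github.com/facebookresearch/llama/issues/380#issuecomment-1716832417
if tokenizer.pad_token is None:
    logger.warning("No padding token found. Using '[PAD]' as the pad token.")
    tokenizer.pad_token = "[PAD]"

# Fall back to id 0 only if the pad token still resolves to no id.
if tokenizer.pad_token_id is None:
    logger.warning("No padding token id found. Using 0 as pad token id.")
    tokenizer.pad_token_id = 0

print(tokenizer.pad_token, tokenizer.pad_token_id)

Note that both branches are no-ops for tokenizers that already define a pad token and pad token id (e.g. most Llama fine-tunes with a configured '<pad>'), so only the gpt2-style tokenizers that previously hit the isinstance special case are affected.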