Simplify how we set pad token and pad token ID.
justinxzhao committed Oct 17, 2023
1 parent c711d0a commit 122a933
Showing 1 changed file with 15 additions and 37 deletions.
ludwig/utils/tokenizers.py
@@ -840,43 +840,21 @@ def get_unk_token(self) -> str:
         return self.tokenizer.unk_token
 
     def _set_pad_token(self) -> None:
-        """Sets the pad token and pad token ID for the tokenizer."""
-
-        from transformers import (
-            CodeLlamaTokenizer,
-            CodeLlamaTokenizerFast,
-            GPT2Tokenizer,
-            GPT2TokenizerFast,
-            LlamaTokenizer,
-            LlamaTokenizerFast,
-        )
-
-        # Tokenizers might have the pad token id attribute since they tend to use the same base class, but
-        # it can be set to None so we check for this explicitly.
-        if hasattr(self.tokenizer, "pad_token_id") and self.tokenizer.pad_token_id is not None:
-            return
-
-        # HACK(geoffrey): gpt2 has no pad token. Recommendation is to use eos token instead.
-        # https://github.com/huggingface/transformers/issues/2630#issuecomment-1290809338
-        # https://github.com/huggingface/transformers/issues/2648#issuecomment-616177044
-        if any(
-            isinstance(self.tokenizer, t)
-            for t in [
-                GPT2Tokenizer,
-                GPT2TokenizerFast,
-                LlamaTokenizer,
-                LlamaTokenizerFast,
-                CodeLlamaTokenizer,
-                CodeLlamaTokenizerFast,
-            ]
-        ):
-            if hasattr(self.tokenizer, "eos_token") and self.tokenizer.eos_token is not None:
-                logger.warning("No padding token id found. Using eos_token as pad_token.")
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-                self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-
-        # In case any HF tokenizer does not have a pad token ID, just default to using 0
-        # as the pad_token_id.
+        """Sets the pad token and pad token ID for the tokenizer.
+
+        If there is no pad token, then set one by default.
+        If there is no pad token index, then set it to 0.
+
+        Notes:
+        - (geoffrey): gpt2 has no pad token. Recommendation is to use eos token instead.
+          https://github.com/huggingface/transformers/issues/2630#issuecomment-1290809338
+          https://github.com/huggingface/transformers/issues/2648#issuecomment-616177044
+        - (Justin): Using the EOS token in place of the pad token causes an issue with HF
+          model.generate() when there are multiple examples in the batch.
+          https://github.com/facebookresearch/llama/issues/380#issuecomment-1716832417
+          Recommendation is to set a separate '[PAD]' or '<pad>' token.
+        """
+        if self.tokenizer.pad_token is None:
+            logger.warning("No padding token found. Using '[PAD]' as the pad token.")
+            self.tokenizer.pad_token = "[PAD]"
+        if self.tokenizer.pad_token_id is None:
+            logger.warning("No padding token id found. Using 0 as pad token id.")
+            self.tokenizer.pad_token_id = 0
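
The new docstring records the longer-term recommendation: register a dedicated pad token rather than reusing eos_token or falling back to id 0. A minimal sketch of that recommendation (not part of this commit), assuming the Hugging Face transformers library, with gpt2 standing in for any checkpoint that ships without a pad token:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Register a dedicated '[PAD]' token instead of reusing eos_token, so that
# batched model.generate() does not conflate padding with end-of-sequence.
num_added = tokenizer.add_special_tokens({"pad_token": "[PAD]"})
if num_added > 0:
    # The vocabulary grew by one token, so the embedding matrix must grow too.
    model.resize_token_embeddings(len(tokenizer))

print(tokenizer.pad_token, tokenizer.pad_token_id)  # '[PAD]' with a fresh, valid id

This trades a slightly larger embedding matrix for an unambiguous pad id, which is why the docstring frames it as a recommendation rather than the default fallback behavior implemented here.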
