From 73bf82f84d0999726192471970c4c91439ef7d7a Mon Sep 17 00:00:00 2001 From: Justin Zhao Date: Fri, 20 Oct 2023 16:20:27 -0400 Subject: [PATCH 1/4] Apply HF fallbacks to all from_pretrained() object initializations. --- ludwig/config_validation/checks.py | 3 +- ludwig/encoders/image/base.py | 3 +- ludwig/encoders/text_encoders.py | 40 ++++++++-------- ludwig/models/llm.py | 23 ++++++---- ludwig/schema/llms/base_model.py | 3 +- ludwig/schema/model_types/utils.py | 3 +- ludwig/utils/hf_utils.py | 51 +++++++-------------- ludwig/utils/llm_utils.py | 47 +------------------ ludwig/utils/tokenizers.py | 9 ++-- tests/ludwig/encoders/test_text_encoders.py | 6 +-- tests/ludwig/utils/test_hf_utils.py | 45 +++++++++++++++--- tests/ludwig/utils/test_llm_utils.py | 24 ++-------- tests/ludwig/utils/test_tokenizers.py | 16 ++++++- 13 files changed, 124 insertions(+), 149 deletions(-) diff --git a/ludwig/config_validation/checks.py b/ludwig/config_validation/checks.py index 871d4245b1b..df9b25256cf 100644 --- a/ludwig/config_validation/checks.py +++ b/ludwig/config_validation/checks.py @@ -25,6 +25,7 @@ VECTOR, ) from ludwig.error import ConfigValidationError +from ludwig.utils.hf_utils import load_pretrained_hf_class_with_hub_fallback from ludwig.utils.metric_utils import get_feature_to_metric_names_map_from_feature_collection from ludwig.utils.misc_utils import merge_dict @@ -594,7 +595,7 @@ def check_llm_finetuning_adaption_prompt_parameters(config: "ModelConfig"): def _get_llm_model_config(model_name: str) -> AutoConfig: """Returns the LLM model config.""" - return AutoConfig.from_pretrained(model_name) + return load_pretrained_hf_class_with_hub_fallback(AutoConfig, model_name)[0] # TODO(geoffrey, arnav): uncomment this when we have reconciled the config with the backend kwarg in api.py diff --git a/ludwig/encoders/image/base.py b/ludwig/encoders/image/base.py index a26a741b085..25e734497c7 100644 --- a/ludwig/encoders/image/base.py +++ b/ludwig/encoders/image/base.py @@ -33,6 +33,7 @@ Stacked2DCNNConfig, ViTConfig, ) +from ludwig.utils.hf_utils import load_pretrained_hf_class_with_hub_fallback from ludwig.utils.torch_utils import FreezeModule logger = logging.getLogger(__name__) @@ -382,7 +383,7 @@ def __init__( self._input_shape = (in_channels, img_height, img_width) if use_pretrained and not saved_weights_in_checkpoint: - transformer = ViTModel.from_pretrained(pretrained_model) + transformer = load_pretrained_hf_class_with_hub_fallback(ViTModel, pretrained_model) else: config = ViTConfig( image_size=img_height, diff --git a/ludwig/encoders/text_encoders.py b/ludwig/encoders/text_encoders.py index 69571dbeacd..39de7b1eb5b 100644 --- a/ludwig/encoders/text_encoders.py +++ b/ludwig/encoders/text_encoders.py @@ -52,7 +52,7 @@ XLNetConfig, ) from ludwig.schema.llms.peft import BaseAdapterConfig -from ludwig.utils.hf_utils import load_pretrained_hf_model_with_hub_fallback +from ludwig.utils.hf_utils import load_pretrained_hf_class_with_hub_fallback from ludwig.utils.torch_utils import FreezeModule if TYPE_CHECKING: @@ -179,7 +179,7 @@ def __init__( hf_config_params = {k: v for k, v in kwargs.items() if k in schema_cls.get_hf_config_param_names()} if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( model_cls, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -302,7 +302,7 @@ def __init__( if use_pretrained and not 
saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( AlbertModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -428,7 +428,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( MT5EncoderModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -524,7 +524,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( XLMRobertaModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -646,7 +646,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( BertModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -793,7 +793,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( XLMModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -900,7 +900,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( OpenAIGPTModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -1007,7 +1007,7 @@ def __init__( if use_pretrained: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( GPT2Model, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -1110,7 +1110,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( RobertaModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -1243,7 +1243,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( TransfoXLModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -1371,7 +1371,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( XLNetModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -1475,7 +1475,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( DistilBertModel, pretrained_model_name_or_path, 
**pretrained_kwargs ) else: @@ -1585,7 +1585,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( CTRLModel, pretrained_model_name_or_path, **pretrained_kwargs ) self.vocab_size = transformer.config.vocab_size @@ -1698,7 +1698,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( CamembertModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -1812,7 +1812,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( T5Model, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -1949,7 +1949,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( FlaubertModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -2066,7 +2066,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( ElectraModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -2159,7 +2159,7 @@ def __init__( if use_pretrained and not saved_weights_in_checkpoint: pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( LongformerModel, pretrained_model_name_or_path, **pretrained_kwargs ) else: @@ -2243,7 +2243,7 @@ def __init__( from transformers import AutoModel pretrained_kwargs = pretrained_kwargs or {} - transformer, _ = load_pretrained_hf_model_with_hub_fallback( + transformer, _ = load_pretrained_hf_class_with_hub_fallback( AutoModel, pretrained_model_name_or_path, **pretrained_kwargs ) self._maybe_resize_token_embeddings(transformer, vocab_size) diff --git a/ludwig/models/llm.py b/ludwig/models/llm.py index e81c6d4bebf..c51ee667b08 100644 --- a/ludwig/models/llm.py +++ b/ludwig/models/llm.py @@ -6,7 +6,7 @@ import numpy as np import torch -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, PreTrainedModel +from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig, PreTrainedModel from ludwig.constants import IGNORE_INDEX_TOKEN_ID, LOGITS, MODEL_LLM, PREDICTIONS, TEXT from ludwig.features.base_feature import ModuleWrapper, OutputFeature @@ -19,6 +19,7 @@ from ludwig.utils.augmentation_utils import AugmentationPipelines from ludwig.utils.data_utils import clear_data_cache from ludwig.utils.error_handling_utils import default_retry +from ludwig.utils.hf_utils import load_pretrained_hf_class_with_hub_fallback from ludwig.utils.llm_utils import ( add_left_padding, generate_merged_ids, @@ -26,10 +27,10 @@ pad_target_tensor_for_fine_tuning, realign_target_and_prediction_tensors_for_inference, remove_left_padding, - set_pad_token, ) from ludwig.utils.logging_utils import log_once from 
ludwig.utils.output_feature_utils import set_output_feature_tensor +from ludwig.utils.tokenizers import HFTokenizer from ludwig.utils.torch_utils import reg_loss logger = logging.getLogger(__name__) @@ -101,7 +102,9 @@ def load_pretrained_from_config( logger.info("Loading large language model...") pretrained_model_name_or_path = weights_save_path or config_obj.base_model - model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **load_kwargs) + model: PreTrainedModel = load_pretrained_hf_class_with_hub_fallback( + AutoModelForCausalLM, pretrained_model_name_or_path, **load_kwargs + ) return model @@ -123,7 +126,7 @@ def __init__( self._random_seed = random_seed self.model_name = self.config_obj.base_model - self.model_config = AutoConfig.from_pretrained(self.config_obj.base_model) + self.model_config = load_pretrained_hf_class_with_hub_fallback(AutoConfig, self.config_obj.base_model) self.model = load_pretrained_from_config(self.config_obj, model_config=self.model_config) self.curr_device = next(self.model.parameters()).device @@ -144,8 +147,8 @@ def __init__( self.global_max_sequence_length = self.context_len # Initialize tokenizer - self.tokenizer = AutoTokenizer.from_pretrained(self.config_obj.base_model) - set_pad_token(self.tokenizer) + ludwig_tokenizer = HFTokenizer(self.config_obj.base_model) + self.tokenizer = ludwig_tokenizer.tokenizer self._set_generation_config(self.config_obj.generation.to_dict()) @@ -300,7 +303,8 @@ def to_device(self, device): if self.config_obj.adapter: from peft import PeftModel - self.model = AutoModelForCausalLM.from_pretrained( + self.model = load_pretrained_hf_class_with_hub_fallback( + AutoModelForCausalLM, self.model_name, **model_kwargs, ) @@ -310,7 +314,8 @@ def to_device(self, device): torch_dtype=torch.float16, ) else: - self.model = AutoModelForCausalLM.from_pretrained( + self.model = load_pretrained_hf_class_with_hub_fallback( + AutoModelForCausalLM, tmpdir, **model_kwargs, ) @@ -693,7 +698,7 @@ def load(self, save_path): # Unwrap and reload PeftModel self.model = self.model.base_model - self.model = PeftModel.from_pretrained(self.model, weights_save_path) + self.model = load_pretrained_hf_class_with_hub_fallback(PeftModel, self.model, weights_save_path) elif self.config_obj.trainer.type != "none": self.model = load_pretrained_from_config( self.config_obj, model_config=self.model_config, weights_save_path=weights_save_path diff --git a/ludwig/schema/llms/base_model.py b/ludwig/schema/llms/base_model.py index 1eca08e13ef..a395ae8e667 100644 --- a/ludwig/schema/llms/base_model.py +++ b/ludwig/schema/llms/base_model.py @@ -9,6 +9,7 @@ from ludwig.error import ConfigValidationError from ludwig.schema.metadata import LLM_METADATA from ludwig.schema.metadata.parameter_metadata import convert_metadata_to_json +from ludwig.utils.hf_utils import load_pretrained_hf_class_with_hub_fallback # Maps a preset LLM name to the full slash-delimited HF path. 
If the user chooses a preset LLM, the preset LLM name is # replaced with the full slash-delimited HF path using this map, after JSON validation but before config object @@ -55,7 +56,7 @@ def validate(model_name: str): if os.path.isdir(model_name): return model_name try: - AutoConfig.from_pretrained(model_name) + load_pretrained_hf_class_with_hub_fallback(AutoConfig, model_name) return model_name except OSError: raise ConfigValidationError( diff --git a/ludwig/schema/model_types/utils.py b/ludwig/schema/model_types/utils.py index b8550d06838..8180cbe551e 100644 --- a/ludwig/schema/model_types/utils.py +++ b/ludwig/schema/model_types/utils.py @@ -34,6 +34,7 @@ from ludwig.schema.trainer import ECDTrainerConfig from ludwig.types import HyperoptConfigDict, ModelConfigDict from ludwig.utils.data_utils import get_sanitized_feature_name +from ludwig.utils.hf_utils import load_pretrained_hf_class_with_hub_fallback from ludwig.utils.llm_utils import get_context_len if TYPE_CHECKING: @@ -370,7 +371,7 @@ def _get_maximum_possible_sequence_length(config: "ModelConfig", default_max_seq # we should fall back to the window size of the pretrained model. By this point, because of schema validation # checks, we know that the base_model exists so we can safely grab the base model's config. # TODO (Arnav): Figure out how to factor in rope scaling factor into this calculation. - model_config = AutoConfig.from_pretrained(config.base_model) + model_config = load_pretrained_hf_class_with_hub_fallback(AutoConfig, config.base_model) max_possible_sequence_length = get_context_len(model_config) # Artifically leave a buffer of half the total model window size to trade off # runtime while likely covering a majority of the max sequence length. diff --git a/ludwig/utils/hf_utils.py b/ludwig/utils/hf_utils.py index 3280d7390a6..63cee47800f 100644 --- a/ludwig/utils/hf_utils.py +++ b/ludwig/utils/hf_utils.py @@ -4,8 +4,7 @@ from os import PathLike from typing import Optional, Tuple, Type, Union -from transformers import AutoTokenizer, PreTrainedModel -from transformers.tokenization_utils import PreTrainedTokenizer +from transformers import PreTrainedModel from ludwig.api_annotations import DeveloperAPI from ludwig.utils.error_handling_utils import default_retry @@ -15,53 +14,38 @@ @default_retry() -def load_pretrained_hf_model_from_hub( - model_class: Type, +def _load_pretrained_hf_class_from_hub( + hf_class: Type, pretrained_model_name_or_path: Optional[Union[str, PathLike]], **pretrained_kwargs, ) -> PreTrainedModel: - """Download a HuggingFace model. + """Download a HuggingFace artifact (model, tokenizer, config). Downloads a model from the HuggingFace zoo with retry on failure. Args: - model_class: Class of the model to download. + hf_class: Class of the model to download. pretrained_model_name_or_path: Name of the model to download. pretrained_kwargs: Additional arguments to pass to the model constructor. Returns: The pretrained model object. """ - return model_class.from_pretrained(pretrained_model_name_or_path, **pretrained_kwargs) + return hf_class.from_pretrained(pretrained_model_name_or_path, **pretrained_kwargs) -@default_retry() -def load_pretrained_hf_tokenizer( - pretrained_model_name_or_path: Optional[Union[str, PathLike]], **pretrained_kwargs -) -> PreTrainedTokenizer: - """Download a HuggingFace tokenizer. - - Args: - pretrained_model_name_or_path: Name of the tokenizer to download. - pretrained_kwargs: Additional arguments to pass to the tokenizer constructor. - Returns: - The pretrained tokenizer object. 
- """ - return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **pretrained_kwargs) - - -def _load_pretrained_hf_model_from_dir( - model_class: Type, +def _load_pretrained_hf_class_from_dir( + hf_class: Type, pretrained_model_name_or_path: Optional[Union[str, PathLike]], **pretrained_kwargs, ) -> PreTrainedModel: """Downloads a model to a local temporary directory, and Loads a pretrained HF model from a local directory.""" with tempfile.TemporaryDirectory() as tmpdir: download(pretrained_model_name_or_path, tmpdir) - return model_class.from_pretrained(tmpdir, **pretrained_kwargs) + return hf_class.from_pretrained(tmpdir, **pretrained_kwargs) @DeveloperAPI -def load_pretrained_hf_model_with_hub_fallback( - model_class: Type, +def load_pretrained_hf_class_with_hub_fallback( + hf_class: Type, pretrained_model_name_or_path: Optional[Union[str, PathLike]], **pretrained_kwargs, ) -> Tuple[PreTrainedModel, bool]: @@ -88,22 +72,19 @@ def load_pretrained_hf_model_with_hub_fallback( """ pretrained_models_dir = os.environ.get("LUDWIG_PRETRAINED_MODELS_DIR") if pretrained_models_dir: + logger.info("LUDWIG_PRETRAINED_MODELS_DIR was set. Attempting to load HF artifact from S3.") pretrained_model_path = os.path.join(pretrained_models_dir, pretrained_model_name_or_path) if path_exists(pretrained_model_path): try: - logger.info( - f"Found existing pretrained model artifact {pretrained_model_name_or_path} in directory " - f"{pretrained_models_dir}. Downloading." - ) return ( - _load_pretrained_hf_model_from_dir(model_class, pretrained_model_path, **pretrained_kwargs), + _load_pretrained_hf_class_from_dir(hf_class, pretrained_model_path, **pretrained_kwargs), False, ) except Exception as e: logger.warning( - f"Failed to download pretrained model from {pretrained_models_dir} with error {e}. " - "Falling back to HuggingFace model hub." + f"Failed to download pretrained artifact for hf class {hf_class} from {pretrained_models_dir} with " + f"error {e}. Falling back to HuggingFace model hub." ) # Fallback to HF hub. - return load_pretrained_hf_model_from_hub(model_class, pretrained_model_name_or_path, **pretrained_kwargs), True + return _load_pretrained_hf_class_from_hub(hf_class, pretrained_model_name_or_path, **pretrained_kwargs), True diff --git a/ludwig/utils/llm_utils.py b/ludwig/utils/llm_utils.py index 1dd9d1df85a..f2d437e3cd0 100644 --- a/ludwig/utils/llm_utils.py +++ b/ludwig/utils/llm_utils.py @@ -4,17 +4,7 @@ import torch import torch.nn.functional as F from bitsandbytes.nn.modules import Embedding -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - CodeLlamaTokenizer, - CodeLlamaTokenizerFast, - GPT2Tokenizer, - GPT2TokenizerFast, - LlamaTokenizer, - LlamaTokenizerFast, - PreTrainedTokenizer, -) +from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedTokenizer from ludwig.constants import IGNORE_INDEX_TOKEN_ID, LOGITS, PREDICTIONS, PROBABILITIES from ludwig.schema.trainer import LLMTrainerConfig @@ -26,41 +16,6 @@ FALLBACK_CONTEXT_LEN = 2048 -def set_pad_token(tokenizer: PreTrainedTokenizer): - """Sets the pad token for the tokenizer if it is not already set. - - Args: - tokenizer (PreTrainedTokenizer): The tokenizer. 
- - Example: - >>> from transformers import GPT2Tokenizer, GPT2TokenizerFast, LlamaTokenizer, LlamaTokenizerFast # noqa - >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - >>> set_pad_token(tokenizer) - """ - # Tokenizers might have the pad token id attribute since they tend to use the same base class, but - # it can be set to None so we check for this explicitly. - if hasattr(tokenizer, "pad_token_id") and tokenizer.pad_token_id is not None: - return - - # HACK(Arnav): gpt, gpt2 and llama tokenizers had no pad tokens. - # These recommend using eos tokens instead - # https://github.com/huggingface/transformers/issues/2648#issuecomment-616177044 - # https://github.com/huggingface/transformers/issues/2630#issuecomment-1290809338 - if any( - isinstance(tokenizer, t) - for t in [ - GPT2Tokenizer, - GPT2TokenizerFast, - LlamaTokenizer, - LlamaTokenizerFast, - CodeLlamaTokenizer, - CodeLlamaTokenizerFast, - ] - ): - tokenizer.pad_token = tokenizer.eos_token - tokenizer.pad_token_id = tokenizer.eos_token_id - - def get_context_len(model_config: AutoConfig): """Determines the maximum length of the context (input + output tokens) based on the provided model configuration. diff --git a/ludwig/utils/tokenizers.py b/ludwig/utils/tokenizers.py index f39afd596e2..9c0f1237bc8 100644 --- a/ludwig/utils/tokenizers.py +++ b/ludwig/utils/tokenizers.py @@ -19,10 +19,11 @@ import torch import torchtext +from transformers import AutoTokenizer from ludwig.constants import PADDING_SYMBOL, UNKNOWN_SYMBOL from ludwig.utils.data_utils import load_json -from ludwig.utils.hf_utils import load_pretrained_hf_tokenizer +from ludwig.utils.hf_utils import load_pretrained_hf_class_with_hub_fallback from ludwig.utils.nlp_utils import load_nlp_pipeline, process_text logger = logging.getLogger(__name__) @@ -824,7 +825,9 @@ class HFTokenizer(BaseTokenizer): def __init__(self, pretrained_model_name_or_path, **kwargs): super().__init__() self.pretrained_model_name_or_path = pretrained_model_name_or_path - self.tokenizer = load_pretrained_hf_tokenizer(self.pretrained_model_name_or_path, **kwargs) + self.tokenizer, _ = load_pretrained_hf_class_with_hub_fallback( + AutoTokenizer, self.pretrained_model_name_or_path, **kwargs + ) self._set_pad_token() def __call__(self, text): @@ -1219,7 +1222,7 @@ def get_hf_tokenizer(pretrained_model_name_or_path, **kwargs): hf_name = pretrained_model_name_or_path # use_fast=False to leverage python class inheritance # cannot tokenize HF tokenizers directly because HF lacks strict typing and List[str] cannot be traced - hf_tokenizer = load_pretrained_hf_tokenizer(hf_name, use_fast=False) + hf_tokenizer, _ = load_pretrained_hf_class_with_hub_fallback(AutoTokenizer, hf_name, use_fast=False) torchtext_tokenizer = None if "bert" in TORCHSCRIPT_COMPATIBLE_TOKENIZERS and any( diff --git a/tests/ludwig/encoders/test_text_encoders.py b/tests/ludwig/encoders/test_text_encoders.py index 1e26799ebdc..cb96778b1df 100644 --- a/tests/ludwig/encoders/test_text_encoders.py +++ b/tests/ludwig/encoders/test_text_encoders.py @@ -103,7 +103,7 @@ def test_hf_ludwig_model_e2e(tmpdir, csv_filename, encoder_name): model = LudwigModel(config=config, backend=LocalTestBackend()) with mock.patch( - "ludwig.encoders.text_encoders.load_pretrained_hf_model_with_hub_fallback", + "ludwig.encoders.text_encoders.load_pretrained_hf_class_with_hub_fallback", side_effect=_load_pretrained_hf_model_no_weights, ): # Validates that the defaults associated with the encoder are compatible with Ludwig training. 
@@ -168,7 +168,7 @@ def test_hf_ludwig_model_reduce_options(tmpdir, csv_filename, encoder_name, redu # Validates that the defaults associated with the encoder are compatible with Ludwig training. with mock.patch( - "ludwig.encoders.text_encoders.load_pretrained_hf_model_with_hub_fallback", + "ludwig.encoders.text_encoders.load_pretrained_hf_class_with_hub_fallback", side_effect=_load_pretrained_hf_model_no_weights, ): model.train( @@ -223,7 +223,7 @@ def test_hf_ludwig_model_auto_transformers(tmpdir, csv_filename, pretrained_mode # Validates that the defaults associated with the encoder are compatible with Ludwig training. with mock.patch( - "ludwig.encoders.text_encoders.load_pretrained_hf_model_with_hub_fallback", + "ludwig.encoders.text_encoders.load_pretrained_hf_class_with_hub_fallback", side_effect=_load_pretrained_hf_model_no_weights, ): model.train(dataset=rel_path, output_directory=tmpdir) diff --git a/tests/ludwig/utils/test_hf_utils.py b/tests/ludwig/utils/test_hf_utils.py index 44158947303..2260827fe1f 100644 --- a/tests/ludwig/utils/test_hf_utils.py +++ b/tests/ludwig/utils/test_hf_utils.py @@ -2,10 +2,10 @@ from typing import Type import pytest -from transformers import AlbertModel, BertModel, BertTokenizer +from transformers import AlbertModel, AutoConfig, AutoModelForCausalLM, AutoTokenizer, BertModel, BertTokenizer from ludwig.encoders.text_encoders import ALBERTEncoder, BERTEncoder -from ludwig.utils.hf_utils import load_pretrained_hf_model_from_hub, load_pretrained_hf_model_with_hub_fallback +from ludwig.utils.hf_utils import _load_pretrained_hf_class_from_hub, load_pretrained_hf_class_with_hub_fallback @pytest.mark.parametrize( @@ -19,7 +19,7 @@ def test_load_pretrained_hf_model_from_hub(model: Type, name: str, tmpdir: os.Pa """Ensure that the HF models used in ludwig download correctly.""" cache_dir = os.path.join(tmpdir, name.replace(os.path.sep, "_") if name else str(model.__name__)) os.makedirs(cache_dir, exist_ok=True) - loaded_model = load_pretrained_hf_model_from_hub(model, name, cache_dir=cache_dir, force_download=True) + loaded_model = _load_pretrained_hf_class_from_hub(model, name, cache_dir=cache_dir, force_download=True) assert isinstance(loaded_model, model) assert os.listdir(cache_dir) @@ -27,20 +27,51 @@ def test_load_pretrained_hf_model_from_hub(model: Type, name: str, tmpdir: os.Pa def test_load_pretrained_hf_model_with_hub_fallback(tmpdir): """Ensure that the HF models used in ludwig download correctly with S3 or hub fallback.""" # Don't set env var. - _, used_fallback = load_pretrained_hf_model_with_hub_fallback(AlbertModel, ALBERTEncoder.DEFAULT_MODEL_NAME) + _, used_fallback = load_pretrained_hf_class_with_hub_fallback(AlbertModel, ALBERTEncoder.DEFAULT_MODEL_NAME) assert used_fallback # Download the model, load it from tmpdir, and set env var. - load_pretrained_hf_model_from_hub(AlbertModel, "albert-base-v2").save_pretrained( + _load_pretrained_hf_class_from_hub(AlbertModel, "albert-base-v2").save_pretrained( os.path.join(tmpdir, "albert-base-v2") ) os.environ["LUDWIG_PRETRAINED_MODELS_DIR"] = f"file://{tmpdir}" # Needs to be an absolute path. - _, used_fallback = load_pretrained_hf_model_with_hub_fallback(AlbertModel, ALBERTEncoder.DEFAULT_MODEL_NAME) + _, used_fallback = load_pretrained_hf_class_with_hub_fallback(AlbertModel, ALBERTEncoder.DEFAULT_MODEL_NAME) assert not used_fallback # Fallback is used for a model that doesn't exist in models directory. 
- _, used_fallback = load_pretrained_hf_model_with_hub_fallback(BertModel, BERTEncoder.DEFAULT_MODEL_NAME) + _, used_fallback = load_pretrained_hf_class_with_hub_fallback(BertModel, BERTEncoder.DEFAULT_MODEL_NAME) assert used_fallback # Clean up. del os.environ["LUDWIG_PRETRAINED_MODELS_DIR"] + + +def test_load_pretrained_hf_tokenizer_with_hub_fallback(tmpdir): + """Ensure that the HF models used in ludwig download correctly with S3 or hub fallback.""" + _, used_fallback = load_pretrained_hf_class_with_hub_fallback( + AutoTokenizer, "hf-internal-testing/tiny-random-GPTJForCausalLM" + ) + assert used_fallback + + _, used_fallback = load_pretrained_hf_class_with_hub_fallback( + AutoConfig, "hf-internal-testing/tiny-random-GPTJForCausalLM" + ) + assert used_fallback + + +def test_load_pretrained_hf_causal_lm_with_hub_fallback(tmpdir): + """Ensure that the HF models used in ludwig download correctly with S3 or hub fallback.""" + _, used_fallback = load_pretrained_hf_class_with_hub_fallback( + AutoModelForCausalLM, "hf-internal-testing/tiny-random-GPTJForCausalLM" + ) + + # Download the model, load it from tmpdir, and set env var. + _load_pretrained_hf_class_from_hub( + AutoModelForCausalLM, "hf-internal-testing/tiny-random-GPTJForCausalLM" + ).save_pretrained(os.path.join(tmpdir, "hf-internal-testing/tiny-random-GPTJForCausalLM")) + + os.environ["LUDWIG_PRETRAINED_MODELS_DIR"] = f"file://{tmpdir}" # Needs to be an absolute path. + _, used_fallback = load_pretrained_hf_class_with_hub_fallback( + AutoModelForCausalLM, "hf-internal-testing/tiny-random-GPTJForCausalLM" + ) + assert not used_fallback diff --git a/tests/ludwig/utils/test_llm_utils.py b/tests/ludwig/utils/test_llm_utils.py index d79264bf26a..17c15375db3 100644 --- a/tests/ludwig/utils/test_llm_utils.py +++ b/tests/ludwig/utils/test_llm_utils.py @@ -1,6 +1,6 @@ import pytest import torch -from transformers import AutoConfig, AutoTokenizer +from transformers import AutoConfig from ludwig.constants import LOGITS, PREDICTIONS, PROBABILITIES from ludwig.utils.llm_utils import ( @@ -14,8 +14,8 @@ pad_target_tensor_for_fine_tuning, realign_target_and_prediction_tensors_for_inference, remove_left_padding, - set_pad_token, ) +from ludwig.utils.tokenizers import HFTokenizer pytestmark = [pytest.mark.llm] @@ -26,9 +26,7 @@ @pytest.fixture def tokenizer(): - tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL_NAME) - set_pad_token(tokenizer) - return tokenizer + return HFTokenizer(TEST_MODEL_NAME) @pytest.fixture @@ -43,22 +41,6 @@ def target_ids(): return torch.tensor([[9, 10, 11], [12, 13, 14]]) -def test_set_pad_token_doesnt_exist(): - tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False) - assert tokenizer.pad_token_id is None - - set_pad_token(tokenizer) - assert tokenizer.pad_token_id == 50256 - - -def test_set_pad_token_already_exists(): - tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL_NAME, use_fast=False) - assert tokenizer.pad_token_id == 1 - - set_pad_token(tokenizer) - assert tokenizer.pad_token_id == 1 - - class TestSetContextLen: def test_max_sequence_length(self): # Test when 'max_sequence_length' is present in the model configuration diff --git a/tests/ludwig/utils/test_tokenizers.py b/tests/ludwig/utils/test_tokenizers.py index 82f6d86bdff..53d18d59db1 100644 --- a/tests/ludwig/utils/test_tokenizers.py +++ b/tests/ludwig/utils/test_tokenizers.py @@ -4,7 +4,7 @@ import torch import torchtext -from ludwig.utils.tokenizers import EnglishLemmatizeFilterTokenizer, NgramTokenizer, StringSplitTokenizer +from 
ludwig.utils.tokenizers import EnglishLemmatizeFilterTokenizer, HFTokenizer, NgramTokenizer, StringSplitTokenizer TORCHTEXT_0_14_0_HF_NAMES = [ "bert-base-uncased", @@ -16,6 +16,10 @@ "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12", # Community model ] +# Pad token ID is 1 for OPT even though it uses the GPT2 tokenizer +# BOS token ID is 2 +TEST_MODEL_NAME = "hf-internal-testing/tiny-random-OPTForCausalLM" + @pytest.mark.parametrize( "pretrained_model_name_or_path", @@ -85,3 +89,13 @@ def test_english_lemmatize_filter_tokenizer(): tokenizer = EnglishLemmatizeFilterTokenizer() tokens = tokenizer(inputs) assert len(tokens) > 0 + + +def test_set_pad_token_doesnt_exist(): + tokenizer = HFTokenizer("gpt2", use_fast=False) + assert tokenizer.tokenizer.pad_token_id == 50256 + + +def test_set_pad_token_already_exists(): + tokenizer = HFTokenizer(TEST_MODEL_NAME, use_fast=False) + assert tokenizer.tokenizer.pad_token_id == 1 From e42d3e31fb5a29e81723ea9a4ddc764675b97f29 Mon Sep 17 00:00:00 2001 From: Justin Zhao Date: Fri, 27 Oct 2023 15:51:40 -0700 Subject: [PATCH 2/4] Fix tests. --- ludwig/encoders/image/base.py | 2 +- tests/ludwig/utils/test_llm_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ludwig/encoders/image/base.py b/ludwig/encoders/image/base.py index 25e734497c7..eeeff609840 100644 --- a/ludwig/encoders/image/base.py +++ b/ludwig/encoders/image/base.py @@ -383,7 +383,7 @@ def __init__( self._input_shape = (in_channels, img_height, img_width) if use_pretrained and not saved_weights_in_checkpoint: - transformer = load_pretrained_hf_class_with_hub_fallback(ViTModel, pretrained_model) + transformer, _ = load_pretrained_hf_class_with_hub_fallback(ViTModel, pretrained_model) else: config = ViTConfig( image_size=img_height, diff --git a/tests/ludwig/utils/test_llm_utils.py b/tests/ludwig/utils/test_llm_utils.py index 17c15375db3..a02198ee3b0 100644 --- a/tests/ludwig/utils/test_llm_utils.py +++ b/tests/ludwig/utils/test_llm_utils.py @@ -26,7 +26,7 @@ @pytest.fixture def tokenizer(): - return HFTokenizer(TEST_MODEL_NAME) + return HFTokenizer(TEST_MODEL_NAME).tokenizer @pytest.fixture From b7ac42c0c1ac3aeb7fabcc7d9af489014646576a Mon Sep 17 00:00:00 2001 From: Justin Zhao Date: Fri, 27 Oct 2023 16:09:14 -0700 Subject: [PATCH 3/4] Fix tests. 
--- ludwig/models/llm.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ludwig/models/llm.py b/ludwig/models/llm.py index c51ee667b08..2970e63e6ab 100644 --- a/ludwig/models/llm.py +++ b/ludwig/models/llm.py @@ -102,7 +102,7 @@ def load_pretrained_from_config( logger.info("Loading large language model...") pretrained_model_name_or_path = weights_save_path or config_obj.base_model - model: PreTrainedModel = load_pretrained_hf_class_with_hub_fallback( + model, _ = load_pretrained_hf_class_with_hub_fallback( AutoModelForCausalLM, pretrained_model_name_or_path, **load_kwargs ) return model @@ -126,7 +126,7 @@ def __init__( self._random_seed = random_seed self.model_name = self.config_obj.base_model - self.model_config = load_pretrained_hf_class_with_hub_fallback(AutoConfig, self.config_obj.base_model) + self.model_config, _ = load_pretrained_hf_class_with_hub_fallback(AutoConfig, self.config_obj.base_model) self.model = load_pretrained_from_config(self.config_obj, model_config=self.model_config) self.curr_device = next(self.model.parameters()).device @@ -303,7 +303,7 @@ def to_device(self, device): if self.config_obj.adapter: from peft import PeftModel - self.model = load_pretrained_hf_class_with_hub_fallback( + self.model, _ = load_pretrained_hf_class_with_hub_fallback( AutoModelForCausalLM, self.model_name, **model_kwargs, @@ -314,7 +314,7 @@ def to_device(self, device): torch_dtype=torch.float16, ) else: - self.model = load_pretrained_hf_class_with_hub_fallback( + self.model, _ = load_pretrained_hf_class_with_hub_fallback( AutoModelForCausalLM, tmpdir, **model_kwargs, @@ -698,7 +698,7 @@ def load(self, save_path): # Unwrap and reload PeftModel self.model = self.model.base_model - self.model = load_pretrained_hf_class_with_hub_fallback(PeftModel, self.model, weights_save_path) + self.model = PeftModel.from_pretrained(self.model, weights_save_path) elif self.config_obj.trainer.type != "none": self.model = load_pretrained_from_config( self.config_obj, model_config=self.model_config, weights_save_path=weights_save_path From d4d145f9c7551fd183d9a7076d81dcb090cdaccf Mon Sep 17 00:00:00 2001 From: Justin Zhao Date: Fri, 27 Oct 2023 16:48:22 -0700 Subject: [PATCH 4/4] Fix test --- ludwig/schema/model_types/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/schema/model_types/utils.py b/ludwig/schema/model_types/utils.py index 8180cbe551e..cd14413f3d3 100644 --- a/ludwig/schema/model_types/utils.py +++ b/ludwig/schema/model_types/utils.py @@ -371,7 +371,7 @@ def _get_maximum_possible_sequence_length(config: "ModelConfig", default_max_seq # we should fall back to the window size of the pretrained model. By this point, because of schema validation # checks, we know that the base_model exists so we can safely grab the base model's config. # TODO (Arnav): Figure out how to factor in rope scaling factor into this calculation. - model_config = load_pretrained_hf_class_with_hub_fallback(AutoConfig, config.base_model) + model_config, _ = load_pretrained_hf_class_with_hub_fallback(AutoConfig, config.base_model) max_possible_sequence_length = get_context_len(model_config) # Artifically leave a buffer of half the total model window size to trade off # runtime while likely covering a majority of the max sequence length.
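
Reviewer note: a minimal usage sketch of the generalized helper introduced by this series, based only on the signature and tests shown above. The model name matches the tiny test model used in tests/ludwig/utils/test_hf_utils.py; the mirror directory path is a placeholder, not a value from the PR.

# Sketch only: illustrates load_pretrained_hf_class_with_hub_fallback as added in this
# patch series. The LUDWIG_PRETRAINED_MODELS_DIR value below is a placeholder.
import os

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from ludwig.utils.hf_utils import load_pretrained_hf_class_with_hub_fallback

# Optional: point Ludwig at a mirror of pretrained artifacts (an S3 bucket or a local
# file:// path, as in the tests). When unset, artifacts come straight from the HF Hub.
os.environ["LUDWIG_PRETRAINED_MODELS_DIR"] = "file:///tmp/pretrained-models"  # placeholder

base_model = "hf-internal-testing/tiny-random-GPTJForCausalLM"

# Every call returns an (artifact, used_fallback) tuple; used_fallback is True when the
# artifact was fetched from the HuggingFace Hub instead of LUDWIG_PRETRAINED_MODELS_DIR.
config, used_fallback = load_pretrained_hf_class_with_hub_fallback(AutoConfig, base_model)
tokenizer, _ = load_pretrained_hf_class_with_hub_fallback(AutoTokenizer, base_model, use_fast=False)
model, _ = load_pretrained_hf_class_with_hub_fallback(AutoModelForCausalLM, base_model)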