diff --git a/ludwig/distributed/deepspeed.py b/ludwig/distributed/deepspeed.py
index 12986f2afdb..9b3a04b5135 100644
--- a/ludwig/distributed/deepspeed.py
+++ b/ludwig/distributed/deepspeed.py
@@ -221,7 +221,14 @@ def load(self, save_path: str, device: Optional[torch.device] = None) -> bool:
         https://deepspeed.readthedocs.io/en/latest/model-checkpointing.html#loading-training-checkpoints
         """
-        _, client_state = self.model.load_checkpoint(save_path, load_lr_scheduler_states=False)
+        # NOTE(geoffrey): `load_module_strict=False` because this code path is frequently used to load models trained
+        # using adapter-based fine-tuning, where the checkpoints only contain the adapter weights, and not the full
+        # model weights. This may lead to silent, unexpected behavior when resuming full model fine-tuning,
+        # where all the model weights *must* be loaded in.
+        # TODO(geoffrey): Add a boolean arg to this function to control load_module_strict behavior.
+        _, client_state = self.model.load_checkpoint(
+            save_path, load_lr_scheduler_states=False, load_module_strict=False
+        )
         self.global_step = self._get_global_step(client_state, save_path)
         if self.scheduler is not None and "scheduler_state" in client_state:
             self.scheduler.load_state_dict(client_state["scheduler_state"])
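
The TODO above suggests exposing the strict-loading behavior to callers. A minimal sketch of what that could look like, assuming the method otherwise keeps its current signature; the parameter name `load_module_strict` and its default below are illustrative, not the merged Ludwig API:

# Hypothetical sketch of the TODO: expose `load_module_strict` on the strategy's
# `load` method. Parameter name and default are assumptions, not Ludwig's actual API.
from typing import Optional

import torch


def load(
    self,
    save_path: str,
    device: Optional[torch.device] = None,
    load_module_strict: bool = False,  # assumed default keeps the adapter-checkpoint path working
) -> bool:
    """Load a DeepSpeed checkpoint from `save_path`.

    With `load_module_strict=False`, adapter-only checkpoints (e.g. LoRA fine-tuning) load
    even though most base-model weights are absent. Callers resuming full-model fine-tuning
    would pass `load_module_strict=True` so missing weights raise instead of being skipped.
    """
    _, client_state = self.model.load_checkpoint(
        save_path, load_lr_scheduler_states=False, load_module_strict=load_module_strict
    )
    self.global_step = self._get_global_step(client_state, save_path)
    if self.scheduler is not None and "scheduler_state" in client_state:
        self.scheduler.load_state_dict(client_state["scheduler_state"])
    # ... remainder of the method unchanged ...

Defaulting the new argument to False would preserve the behavior introduced in this patch for adapter fine-tuning, while full-model resumption could opt back into strict loading at the call site.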