Skip to content

Commit

Permalink
fix: Disable saving model checkpoints if gradient accumulation is disabled
Browse files Browse the repository at this point in the history
  • Loading branch information
saattrupdan committed Oct 24, 2024
1 parent 4ef2be8 commit 126b148
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 1 deletion.
2 changes: 1 addition & 1 deletion config/asr_finetuning.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ wandb_group: default
wandb_name: ${model_id}
resume_from_checkpoint: false
ignore_data_skip: false
save_total_limit: 1
save_total_limit: 0 # Will automatically be set to >=1 if `early_stopping` is enabled

# Optimisation parameters
learning_rate: 1e-4
Expand Down
3 changes: 3 additions & 0 deletions src/coral/wav2vec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ def load_training_arguments(self) -> TrainingArguments:
if self.is_main_process:
logger.info("Mixed precision training with FP16 enabled.")

if self.config.early_stopping:
self.config.save_total_limit = max(self.config.save_total_limit, 1)

args = TrainingArguments(
output_dir=self.config.model_dir,
hub_model_id=f"{self.config.hub_organisation}/{self.config.model_id}",
Expand Down
4 changes: 4 additions & 0 deletions src/coral/whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ def load_training_arguments(self) -> TrainingArguments:
if self.is_main_process:
logger.info("Mixed precision training with FP16 enabled.")

if self.config.early_stopping:
self.config.save_total_limit = max(self.config.save_total_limit, 1)

args = Seq2SeqTrainingArguments(
output_dir=self.config.model_dir,
hub_model_id=f"{self.config.hub_organisation}/{self.config.model_id}",
Expand All @@ -172,6 +175,7 @@ def load_training_arguments(self) -> TrainingArguments:
eval_strategy="steps",
eval_steps=self.config.eval_steps,
save_steps=self.config.save_steps,
save_strategy="no" if self.config.save_total_limit == 0 else "steps",
logging_steps=self.config.logging_steps,
length_column_name="input_length",
gradient_checkpointing=True,
Expand Down

0 comments on commit 126b148

Please sign in to comment.