
Commit

Merge pull request foundation-model-stack#51 from foundation-model-stack/validation-loss-file

feat: track validation loss in logs file
anhuong authored Feb 20, 2024
2 parents 24e7385 + 1e817ca commit b9380e4
Showing 1 changed file with 26 additions and 19 deletions.
45 changes: 26 additions & 19 deletions tuning/sft_trainer.py
@@ -47,34 +47,41 @@ def __init__(self, logger):
         self.logger = logger
 
     def on_log(self, args, state, control, logs=None, **kwargs):
-        """Checks if this log contains keys of interest, e.g., los, and if so, creates
+        """Checks if this log contains keys of interest, e.g., loss, and if so, creates
         train_loss.jsonl in the model output dir (if it doesn't already exist),
         appends the subdict of the log & dumps the file.
         """
         # All processes get the logs from this node; only update from process 0.
         if not state.is_world_process_zero:
             return
 
+        # separate evaluation loss with train loss
         log_file_path = os.path.join(args.output_dir, "train_loss.jsonl")
+        eval_log_file_path = os.path.join(args.output_dir, "eval_loss.jsonl")
         if logs is not None and "loss" in logs and "epoch" in logs:
-            try:
-                # Take the subdict of the last log line; if any log_keys aren't part of this log
-                # object, asssume this line is something else, e.g., train completion, and skip.
-                log_obj = {
-                    "name": "loss",
-                    "data": {
-                        "epoch": round(logs["epoch"], 2),
-                        "step": state.global_step,
-                        "value": logs["loss"],
-                        "timestamp": datetime.isoformat(datetime.now()),
-                    },
-                }
-            except KeyError:
-                return
-
-            # append the current log to the jsonl file
-            with open(log_file_path, "a") as log_file:
-                log_file.write(f"{json.dumps(log_obj, sort_keys=True)}\n")
+            self._track_loss("loss", log_file_path, logs, state)
+        elif logs is not None and "eval_loss" in logs and "epoch" in logs:
+            self._track_loss("eval_loss", eval_log_file_path, logs, state)
+
+    def _track_loss(self, loss_key, log_file, logs, state):
+        try:
+            # Take the subdict of the last log line; if any log_keys aren't part of this log
+            # object, assume this line is something else, e.g., train completion, and skip.
+            log_obj = {
+                "name": loss_key,
+                "data": {
+                    "epoch": round(logs["epoch"], 2),
+                    "step": state.global_step,
+                    "value": logs[loss_key],
+                    "timestamp": datetime.isoformat(datetime.now()),
+                },
+            }
+        except KeyError:
+            return
+
+        # append the current log to the jsonl file
+        with open(log_file, "a") as f:
+            f.write(f"{json.dumps(log_obj, sort_keys=True)}\n")
 
 
 def train(
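
For reference, a minimal sketch (not part of the commit) of the shape of one record that the new _track_loss helper appends to train_loss.jsonl or eval_loss.jsonl. The numeric values are made-up placeholders, not real run data.

# Sketch of one serialized record, mirroring the log_obj built in _track_loss.
import json
from datetime import datetime

record = {
    "name": "eval_loss",  # "loss" for entries written to train_loss.jsonl
    "data": {
        "epoch": 1.0,                                     # round(logs["epoch"], 2)
        "step": 50,                                       # state.global_step
        "value": 1.234,                                   # logs["eval_loss"]
        "timestamp": datetime.isoformat(datetime.now()),  # same call as in the callback
    },
}
# Each log event appends exactly one such line to the target .jsonl file.
print(json.dumps(record, sort_keys=True))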
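And a small, hedged example of consuming the two files this callback produces; the "output" directory is an assumption standing in for whatever args.output_dir was during training.

# Sketch of reading back the per-step losses written by the callback.
import json
import os

output_dir = "output"  # placeholder for args.output_dir
for fname in ("train_loss.jsonl", "eval_loss.jsonl"):
    path = os.path.join(output_dir, fname)
    if not os.path.exists(path):
        continue
    with open(path) as f:
        for line in f:
            rec = json.loads(line)
            print(fname, rec["data"]["step"], rec["data"]["value"])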
