
Commit

Merge pull request foundation-model-stack#51 from foundation-model-stack/validation-loss-file

feat: track validation loss in logs file
anhuong authored Feb 20, 2024
2 parents 24e7385 + 1e817ca commit b9380e4
Showing 1 changed file with 26 additions and 19 deletions.
45 changes: 26 additions & 19 deletions tuning/sft_trainer.py
@@ -47,34 +47,41 @@ def __init__(self, logger):
         self.logger = logger
 
     def on_log(self, args, state, control, logs=None, **kwargs):
-        """Checks if this log contains keys of interest, e.g., los, and if so, creates
+        """Checks if this log contains keys of interest, e.g., loss, and if so, creates
         train_loss.jsonl in the model output dir (if it doesn't already exist),
         appends the subdict of the log & dumps the file.
         """
         # All processes get the logs from this node; only update from process 0.
         if not state.is_world_process_zero:
             return
 
+        # separate evaluation loss with train loss
         log_file_path = os.path.join(args.output_dir, "train_loss.jsonl")
+        eval_log_file_path = os.path.join(args.output_dir, "eval_loss.jsonl")
         if logs is not None and "loss" in logs and "epoch" in logs:
-            try:
-                # Take the subdict of the last log line; if any log_keys aren't part of this log
-                # object, asssume this line is something else, e.g., train completion, and skip.
-                log_obj = {
-                    "name": "loss",
-                    "data": {
-                        "epoch": round(logs["epoch"], 2),
-                        "step": state.global_step,
-                        "value": logs["loss"],
-                        "timestamp": datetime.isoformat(datetime.now()),
-                    },
-                }
-            except KeyError:
-                return
-
-            # append the current log to the jsonl file
-            with open(log_file_path, "a") as log_file:
-                log_file.write(f"{json.dumps(log_obj, sort_keys=True)}\n")
+            self._track_loss("loss", log_file_path, logs, state)
+        elif logs is not None and "eval_loss" in logs and "epoch" in logs:
+            self._track_loss("eval_loss", eval_log_file_path, logs, state)
+
+    def _track_loss(self, loss_key, log_file, logs, state):
+        try:
+            # Take the subdict of the last log line; if any log_keys aren't part of this log
+            # object, assume this line is something else, e.g., train completion, and skip.
+            log_obj = {
+                "name": loss_key,
+                "data": {
+                    "epoch": round(logs["epoch"], 2),
+                    "step": state.global_step,
+                    "value": logs[loss_key],
+                    "timestamp": datetime.isoformat(datetime.now()),
+                },
+            }
+        except KeyError:
+            return
+
+        # append the current log to the jsonl file
+        with open(log_file, "a") as f:
+            f.write(f"{json.dumps(log_obj, sort_keys=True)}\n")
 
 
 def train(
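
For reference, a minimal sketch (not part of the commit) of the shape of one record that the new _track_loss helper appends to train_loss.jsonl or eval_loss.jsonl. The numeric values are made-up placeholders, not real run data.

# Sketch of one serialized record, mirroring the log_obj built in _track_loss.
import json
from datetime import datetime

record = {
    "name": "eval_loss",  # "loss" for entries written to train_loss.jsonl
    "data": {
        "epoch": 1.0,                                     # round(logs["epoch"], 2)
        "step": 50,                                       # state.global_step
        "value": 1.234,                                   # logs["eval_loss"]
        "timestamp": datetime.isoformat(datetime.now()),  # same call as in the callback
    },
}
# Each log event appends exactly one such line to the target .jsonl file.
print(json.dumps(record, sort_keys=True))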
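And a small, hedged example of consuming the two files this callback produces; the "output" directory is an assumption standing in for whatever args.output_dir was during training.

# Sketch of reading back the per-step losses written by the callback.
import json
import os

output_dir = "output"  # placeholder for args.output_dir
for fname in ("train_loss.jsonl", "eval_loss.jsonl"):
    path = os.path.join(output_dir, fname)
    if not os.path.exists(path):
        continue
    with open(path) as f:
        for line in f:
            rec = json.loads(line)
            print(fname, rec["data"]["step"], rec["data"]["value"])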
