we should not autocast
rizar committed Mar 3, 2025
1 parent 7333ddd commit edf0a26
Showing 1 changed file with 9 additions and 10 deletions.
tapeagents/finetune/finetune.py: 19 changes (9 additions & 10 deletions)
@@ -203,16 +203,15 @@ def toggle_sync(sync: bool):
             torch.cuda.empty_cache()
 
         do_optimizer_step = training_metrics.passes % args.gradient_accumulation_passes == 0
-        with torch.autocast("cuda"):
-            with toggle_sync(do_optimizer_step):
-                loss, this_step_rl_metrics = forward(model, batch)
-                for k, v in this_step_rl_metrics.items():
-                    rl_metrics[k].append(v)
-                training_metrics.train_loss = loss.item()
-                training_metrics.lr = optimizer.param_groups[0]["lr"]
-                training_metrics.max_batch_len = max(batch["input_ids"].shape[1], training_metrics.max_batch_len)
-                training_metrics.min_batch_len = min(batch["input_ids"].shape[1], training_metrics.min_batch_len)
-                accelerator.backward(loss / args.gradient_accumulation_passes)
+        with toggle_sync(do_optimizer_step):
+            loss, this_step_rl_metrics = forward(model, batch)
+            for k, v in this_step_rl_metrics.items():
+                rl_metrics[k].append(v)
+            training_metrics.train_loss = loss.item()
+            training_metrics.lr = optimizer.param_groups[0]["lr"]
+            training_metrics.max_batch_len = max(batch["input_ids"].shape[1], training_metrics.max_batch_len)
+            training_metrics.min_batch_len = min(batch["input_ids"].shape[1], training_metrics.min_batch_len)
+            accelerator.backward(loss / args.gradient_accumulation_passes)
 
         if not do_optimizer_step:
             continue
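
This training step runs under Hugging Face Accelerate (`accelerator.backward`, `toggle_sync`). When the `Accelerator` is created with `mixed_precision` set, Accelerate already applies the matching autocast to the prepared model's forward pass, so an extra `torch.autocast("cuda")` wrapper around the step is redundant, and its fp16 default dtype can even disagree with a bf16 setup. Below is a minimal sketch of that pattern, assuming a bf16 configuration and a hypothetical toy model and dataset rather than anything from this repository:

```python
# Minimal sketch (assumptions: bf16 mixed precision; toy model/data, not from this repo).
# Mixed precision is configured on the Accelerator itself, and accelerator.prepare()
# arranges for the prepared model's forward pass to run under the matching autocast
# context, so the training step needs no explicit torch.autocast("cuda").
import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader, TensorDataset

accelerator = Accelerator(mixed_precision="bf16")  # assumption: could also be "fp16" or "no"

model = torch.nn.Linear(16, 1)                                 # hypothetical toy model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
data = TensorDataset(torch.randn(64, 16), torch.randn(64, 1))  # hypothetical toy data
loader = DataLoader(data, batch_size=8)

model, optimizer, loader = accelerator.prepare(model, optimizer, loader)

for inputs, targets in loader:
    # No torch.autocast("cuda") here: Accelerate autocasts the prepared model's forward.
    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    accelerator.backward(loss)  # also handles gradient scaling when fp16 is used
    optimizer.step()
    optimizer.zero_grad()
```

If a region outside the prepared forward pass did need autocasting, `accelerator.autocast()` provides a context manager that respects the configured precision.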
