Log grad norm aggregated over all ranks, not just rank zero (pytorch#…
ebsmothers authored and Ankur-singh committed Jan 18, 2025
1 parent ecf488e commit f90c6a9
Showing 6 changed files with 6 additions and 6 deletions.
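The same one-line change is applied in each of the six recipes below: the norm returned by torch.nn.utils.clip_grad_norm_ is materialized with .full_tensor() before it is logged. Under FSDP2 the model parameters are DTensors, so the returned norm is itself a DTensor, and logging it directly reflects only rank zero's local view rather than the norm aggregated over all ranks. A minimal sketch of the pattern, assuming an FSDP2-sharded model whose parameters are DTensors; the clip_and_log_grad_norm helper below is illustrative and not part of the torchtune recipes:

    import torch
    from torch.distributed.tensor import DTensor  # public module in recent PyTorch; older releases expose it as torch.distributed._tensor


    def clip_and_log_grad_norm(model: torch.nn.Module, max_norm: float) -> float:
        # clip_grad_norm_ returns the total norm across all parameters. When the
        # parameters are DTensors (e.g. sharded with FSDP2), that norm is also a DTensor.
        grad_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(),
            max_norm=max_norm,
        )
        if isinstance(grad_norm, DTensor):
            # full_tensor() gathers the DTensor so every rank holds the global norm,
            # instead of logging only the local value seen on rank zero.
            grad_norm = grad_norm.full_tensor()
        return grad_norm.item()

The returned float can then be passed to the metric logger, and every rank reports the same aggregated value.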
2 changes: 1 addition & 1 deletion recipes/dev/early_exit_finetune_distributed.py
@@ -951,7 +951,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)

2 changes: 1 addition & 1 deletion recipes/full_finetune_distributed.py
@@ -786,7 +786,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)

2 changes: 1 addition & 1 deletion recipes/lora_finetune_distributed.py
@@ -828,7 +828,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)
 self._lr_scheduler.step()
2 changes: 1 addition & 1 deletion recipes/lora_finetune_distributed_multi_dataset.py
@@ -857,7 +857,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)
 self._lr_scheduler.step()
2 changes: 1 addition & 1 deletion recipes/qat_distributed.py
@@ -857,7 +857,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)

2 changes: 1 addition & 1 deletion recipes/qat_lora_finetune_distributed.py
@@ -872,7 +872,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)
 self._lr_scheduler.step()
