pytorch · nathan-az · Feb 25, 2025 · Feb 25, 2025 · Feb 25, 2025 · Feb 25, 2025
diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
@@ -21,6 +21,7 @@
     init_process_group,
 )
 from torch.distributed._tensor import DTensor
+from torch.distributed.tensor.experimental import implicit_replication
 from torch.distributed.tensor.parallel import parallelize_module
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader, DistributedSampler
@@ -818,6 +819,8 @@ def train(self) -> None:
                     torch.distributed.all_reduce(running_loss)
 
                     # We multiply by world_size to undo FSDP2 gradient normalization.
+                    # TODO: confirm whether below should divide by `tensor_parallel_dim` to
+                    # adjust grad norm for overcounting tokens from the all-reduce
                     current_loss = current_loss * (self.world_size / num_tokens)
 
                 current_loss.backward()
@@ -830,16 +833,27 @@ def train(self) -> None:
                         # This will ensure that the logged loss matches what we're optimizing
                         torch.distributed.all_reduce(running_loss)
                         # Manually scale the gradients from unnormalized loss by total # of tokens
-                        # We multiply by world_size to undo FSDP2 gradient normalization.
-                        training.scale_grads(self._model, self.world_size / num_tokens)
-                        if self._clip_grad_norm is not None:
-                            grad_norm = torch.nn.utils.clip_grad_norm_(
-                                self._model.parameters(),
-                                max_norm=float(self._clip_grad_norm),
-                            )
-                            # If sharded, collect the DTensor here
-                            if isinstance(grad_norm, DTensor):
-                                grad_norm = grad_norm.full_tensor()
+                        # We multiply by dp_size to undo FSDP2 gradient normalization.
+                        training.scale_grads(self._model, self.dp_size / num_tokens)
+                        with implicit_replication():
+                            if self._clip_grad_norm is not None:
+                                grad_norm = torch.nn.utils.clip_grad_norm_(
+                                    self._model.parameters(),
+                                    max_norm=float(self._clip_grad_norm),
+                                )
+                            else:
+                                grad_norm = torch.nn.utils.get_total_norm(
+                                    [
+                                        p.grad 
+                                        for p in self._model.parameters()
+                                        if p.grad is not None
+                                    ],
+                                    norm_type=2,
+                                    error_if_nonfinite=False,
+                                    foreach=True,
+                                )
+                        if isinstance(grad_norm, DTensor):
+                            grad_norm = grad_norm.full_tensor()
                         self._optimizer.step()
                         self._optimizer.zero_grad(set_to_none=True)
 
@@ -871,15 +885,20 @@ def train(self) -> None:
                                     else self._optim_ckpt_wrapper
                                 ),
                             ),
+                            # secondary tp dim division adjusts for tp ranks share the tokens during training
+                            # so the above all-reduce over-counts
                             "tokens_per_second_per_gpu": num_tokens
-                            / (time_per_step * self.world_size),
+                            / (
+                                time_per_step
+                                * self.world_size
+                                * self.tensor_parallel_dim
+                            ),
+                            "grad_norm": grad_norm,
                         }
                         if self._log_peak_memory_stats:
                             log_dict.update(
                                 training.get_memory_stats(device=self._device)
                             )
-                        if self._clip_grad_norm is not None:
-                            log_dict.update({"grad_norm": grad_norm})
                         self._metric_logger.log_dict(
                             log_dict,
                             step=self.global_step,