Add comments, account for optimizer_in_bwd case.
mirceamironenco committed Dec 18, 2024
1 parent f07241b commit 34906b2
Showing 6 changed files with 12 additions and 2 deletions.
5 changes: 4 additions & 1 deletion recipes/full_finetune_distributed.py
@@ -765,7 +765,9 @@ def train(self) -> None:
 if self._optimizer_in_bwd:
     torch.distributed.all_reduce(num_tokens)
     torch.distributed.all_reduce(running_loss)
-    current_loss = current_loss / num_tokens
+
+    # We multiply by world_size to undo FSDP2 gradient normalization.
+    current_loss = current_loss * (world_size / num_tokens)

 current_loss.backward()
@@ -777,6 +779,7 @@ def train(self) -> None:
 # This will ensure that the logged loss matches what we're optimizing
 torch.distributed.all_reduce(running_loss)
 # Manually scale the gradients from unnormalized loss by total # of tokens
+# We multiply by world_size to undo FSDP2 gradient normalization.
 training.scale_grads(self._model, world_size / num_tokens)
 if self._clip_grad_norm is not None:
     grad_norm = torch.nn.utils.clip_grad_norm_(
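A note on what the scaling in training.scale_grads(self._model, world_size / num_tokens) accomplishes: the recipe accumulates an unnormalized sum-of-token losses per rank, while FSDP2 mean-reduces (averages) gradients across ranks. Multiplying by world_size undoes that averaging, and dividing by the globally reduced num_tokens normalizes by the total token count. The single-parameter sketch below is illustrative only, not part of this commit; world_size, tokens_per_rank, and w are made-up values. It checks that the rescaled gradient matches the gradient of a loss normalized by the global token count.

import torch

world_size = 4                        # hypothetical number of ranks
tokens_per_rank = [3, 5, 2, 6]        # hypothetical per-rank token counts
num_tokens = sum(tokens_per_rank)     # global count, as after all_reduce(num_tokens)

w = torch.tensor(2.0, requires_grad=True)

# Each rank's analogue of the unnormalized running loss: a sum over its tokens.
per_rank_losses = [w * float(n) for n in tokens_per_rank]

# FSDP2 mean-reduces gradients across ranks; for this toy scalar that is the
# same as backpropagating the average of the per-rank losses.
fsdp_mean_loss = sum(per_rank_losses) / world_size
fsdp_mean_loss.backward()
rescaled_grad = w.grad * (world_size / num_tokens)  # the scale_grads correction

# Reference: gradient of a loss normalized by the global token count.
w.grad = None
ref_loss = sum(w * float(n) for n in tokens_per_rank) / num_tokens
ref_loss.backward()

assert torch.allclose(rescaled_grad, w.grad)  # both equal 1.0 here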
1 change: 1 addition & 0 deletions recipes/knowledge_distillation_distributed.py
@@ -875,6 +875,7 @@ def train(self) -> None:
 torch.distributed.all_reduce(running_class_loss)
 torch.distributed.all_reduce(running_kd_loss)
 # Manually scale the gradients from unnormalized loss by total # of tokens
+# We multiply by world_size to undo FSDP2 gradient normalization.
 training.scale_grads(self._model, world_size / num_tokens)
 class_loss_to_log = running_class_loss.item() / num_tokens
 kd_loss_to_log = running_kd_loss.item() / num_tokens
1 change: 1 addition & 0 deletions recipes/lora_finetune_distributed.py
@@ -825,6 +825,7 @@ def train(self) -> None:
 # This will ensure that the logged loss matches what we're optimizing
 torch.distributed.all_reduce(running_loss)
 # Manually scale the gradients from unnormalized loss by total # of tokens
+# We multiply by world_size to undo FSDP2 gradient normalization.
 training.scale_grads(self._model, world_size / num_tokens)
 if self._clip_grad_norm is not None:
     grad_norm = torch.nn.utils.clip_grad_norm_(
1 change: 1 addition & 0 deletions recipes/lora_finetune_distributed_multi_dataset.py
@@ -853,6 +853,7 @@ def train(self) -> None:
 # This will ensure that the logged loss matches what we're optimizing
 torch.distributed.all_reduce(running_loss)
 # Manually scale the gradients from unnormalized loss by total # of tokens
+# We multiply by world_size to undo FSDP2 gradient normalization.
 training.scale_grads(self._model, world_size / num_tokens)
 if self._clip_grad_norm is not None:
     grad_norm = torch.nn.utils.clip_grad_norm_(
5 changes: 4 additions & 1 deletion recipes/qat_distributed.py
@@ -829,7 +829,9 @@ def train(self) -> None:
 if self._optimizer_in_bwd:
     torch.distributed.all_reduce(num_tokens)
     torch.distributed.all_reduce(running_loss)
-    current_loss = current_loss / num_tokens
+
+    # We multiply by world_size to undo FSDP2 gradient normalization.
+    current_loss = current_loss * (world_size / num_tokens)

 current_loss.backward()

@@ -841,6 +843,7 @@ def train(self) -> None:
 # This will ensure that the logged loss matches what we're optimizing
 torch.distributed.all_reduce(running_loss)
 # Manually scale the gradients from unnormalized loss by total # of tokens
+# We multiply by world_size to undo FSDP2 gradient normalization.
 training.scale_grads(self._model, world_size / num_tokens)
 if self._clip_grad_norm is not None:
     grad_norm = torch.nn.utils.clip_grad_norm_(
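The optimizer_in_bwd branches above scale the loss rather than the gradients because the per-parameter optimizers step during backward(), so there is no window to call training.scale_grads() afterwards. A minimal single-tensor sketch (toy values, not recipe code) of why scaling the loss by world_size / num_tokens before backward() yields the same gradients as scaling them afterwards:

import torch

world_size, num_tokens = 4, 16             # hypothetical values after all_reduce
w = torch.tensor(2.0, requires_grad=True)
unnormalized_loss = w * 16.0               # stand-in for a summed, unnormalized loss

# Path A: backward on the raw loss, then rescale the gradient afterwards
# (what the non-fused path does via training.scale_grads).
unnormalized_loss.backward(retain_graph=True)
grad_scaled_after = w.grad * (world_size / num_tokens)

# Path B: rescale the loss first, then backward (the optimizer_in_bwd path).
w.grad = None
(unnormalized_loss * (world_size / num_tokens)).backward()
grad_scaled_before = w.grad

assert torch.allclose(grad_scaled_after, grad_scaled_before)  # both equal 4.0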
1 change: 1 addition & 0 deletions recipes/qat_lora_finetune_distributed.py
@@ -863,6 +863,7 @@ def train(self) -> None:
 # This will ensure that the logged loss matches what we're optimizing
 torch.distributed.all_reduce(running_loss)
 # Manually scale the gradients from unnormalized loss by total # of tokens
+# We multiply by world_size to undo FSDP2 gradient normalization.
 training.scale_grads(self._model, world_size / num_tokens)
 if self._clip_grad_norm is not None:
     grad_norm = torch.nn.utils.clip_grad_norm_(
