enable FSDP2 + fp8 all-gather and fix TP fp8 all-gather #413

Merged · 24 commits · Jul 16, 2024

Changes from 17 commits
33 changes: 33 additions & 0 deletions test_runner.py
@@ -273,6 +273,39 @@ def build_test_list():
"fsdp2_mem_tracker",
ngpu=4,
),
OverrideDefinitions(
weifengpy (Contributor, Author) commented on Jul 13, 2024:

Added the following to CI:

  • 1D FSDP original-dtype all-gather
  • 1D FSDP fp8 all-gather
  • 1D FSDP fp8 all-gather with precomputed dynamic scales

Follow-ups are needed to enable TP fp8 all-gather in CI (the current CI tokenizer has a vocab size of 2556, which is not divisible by 16; see #461):

  • 1D TP fp8 all-gather
  • 2D FSDP + TP fp8 all-gather

[
[
"--training.fp8_linear",
]
],
"FSDP2 with original dtype",
"fp8_fsdp2_orig_all_gather",
ngpu=4,
),
OverrideDefinitions(
[
[
"--training.fp8_linear",
"--training.enable_fsdp_fp8_all_gather",
]
],
"FSDP2 with fp8 all-gather",
"fp8_fsdp2_fp8_all_gather",
ngpu=4,
),
OverrideDefinitions(
[
[
"--training.fp8_linear",
"--training.enable_fsdp_fp8_all_gather",
"--training.precompute_float8_dynamic_scale_for_fsdp",
]
],
"FSDP2 with fp8 all-gather and precomputed dynamic scales",
Contributor commented:

nit: comment for 2D

weifengpy (Contributor, Author) replied:

In the end I had to remove 2D from this PR. The current CI tokenizer has a vocab size of 2556, but fp8 GEMM needs the vocab size to be divisible by 16 (#461).

I can follow up with you on how to get a tokenizer with a vocab size of 2560 to unblock 1D TP + fp8 and 2D + fp8 in CI.

"fp8_fsdp2_fp8_all_gather_precompute_dynamic_scales",
ngpu=4,
),
]
return integration_tests_flavors

12 changes: 12 additions & 0 deletions torchtitan/config_manager.py
@@ -347,6 +347,18 @@ def __init__(self):
here: https://github.com/pytorch-labs/float8_experimental
""",
)
self.parser.add_argument(
Contributor commented:

As discussed offline, let's refactor the fp8 configs, e.g. have a dedicated field for enabling fp8 or not.

weifengpy (Contributor, Author) replied:

Renamed fp8_linear to enable_fp8_linear.

Contributor commented:

One thing to note is that right now this is a boolean that swaps in the default float8 recipe: dynamic scaling x tensor-wise ScalingGranularity x all tensors involved in the matmul (input, weight, grad).

I think we should brainstorm an elegant solution for users to express their desired config here.

weifengpy (Contributor, Author) replied:

Good question. Eventually we might have to expose args/kwargs from swap_linear_with_float8_linear for flexibility.
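A hypothetical sketch of what exposing those kwargs through the job config might look like. Only scaling_type_w=TensorScalingType.DYNAMIC is exercised in this PR; the fp8_swap_kwargs field and the helper name are illustrative assumptions, not existing torchtitan options.

from float8_experimental.float8_linear import TensorScalingType
from float8_experimental.float8_linear_utils import swap_linear_with_float8_linear


def swap_linears_from_config(model, job_config):
    # Default recipe used in this PR: dynamic scaling for the weight.
    swap_kwargs = {"scaling_type_w": TensorScalingType.DYNAMIC}
    # Forward any user-provided overrides for flexibility (hypothetical field).
    swap_kwargs.update(getattr(job_config.training, "fp8_swap_kwargs", {}))
    return swap_linear_with_float8_linear(model, **swap_kwargs)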

"--training.enable_fsdp_fp8_all_gather",
action="store_true",
default=False,
help="Whether enable fp8 all-gather in FSDP",
)
self.parser.add_argument(
"--training.precompute_float8_dynamic_scale_for_fsdp",
action="store_true",
default=False,
help="Whether precompute fp8 scales dynamically for FSDP",
)
self.parser.add_argument(
"--training.gc_freq",
type=int,
Expand Down
28 changes: 25 additions & 3 deletions torchtitan/float8_linear.py
@@ -12,13 +12,30 @@

# Note: Performance
# Float8 experimental is intended to be run under `torch.compile` for competitive performance
import contextlib

import float8_experimental.config as config

import torch
import torch.nn as nn
from float8_experimental.float8_linear import TensorScalingType

from torchtitan.config_manager import JobConfig
from torchtitan.logging_utils import logger


@contextlib.contextmanager
def set_enable_fsdp_fp8_all_gather(enable_fsdp_fp8_all_gather: bool):
prev = config.enable_fsdp_fp8_all_gather
torch.distributed.barrier()
config.enable_fsdp_fp8_all_gather = enable_fsdp_fp8_all_gather
try:
yield
finally:
torch.distributed.barrier()
config.enable_fsdp_fp8_all_gather = prev


def build_fp8_linear(model: nn.Module, job_config: JobConfig):
"""
This function converts the linear layers to `Float8Linear`. Note that today,
@@ -27,8 +44,8 @@ def build_fp8_linear(model: nn.Module, job_config: JobConfig):
This will mutate the model inplace.
"""
use_fp8_linear = job_config.training.fp8_linear
enable_fsdp_fp8_all_gather = job_config.training.enable_fsdp_fp8_all_gather
Contributor commented:

Discussed offline: please check if it makes sense to enable it only when dp_degree > 1.

weifengpy (Contributor, Author) replied:

Added a check on parallel_dims.dp_enabled.

try:
from float8_experimental.float8_linear import Float8Linear
from float8_experimental.float8_linear_utils import (
swap_linear_with_float8_linear,
)
@@ -38,5 +55,10 @@ def build_fp8_linear(model: nn.Module, job_config: JobConfig):
) from exc
if use_fp8_linear:
# Mutates the model inplace replacing instances of torch.nn.Linear with Float8Linear
swap_linear_with_float8_linear(model, Float8Linear)
logger.info("Swapped to Float8Linear layers")
with set_enable_fsdp_fp8_all_gather(enable_fsdp_fp8_all_gather):
Contributor commented:

noop Q: do we need this in a context manager to make testing + resetting easier?

weifengpy (Contributor, Author) commented on Jul 16, 2024:

Hmm, set_enable_fsdp_fp8_all_gather is a context manager right now. Do you mean "why" it should be a context manager?

EDIT: I also see you mentioned "make testing + resetting easier", which answers the why, so I am not sure it's a question for me.

swap_linear_with_float8_linear(
model, scaling_type_w=TensorScalingType.DYNAMIC
)
logger.info(
f"Swapped to Float8Linear layers with {enable_fsdp_fp8_all_gather=}"
)
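Following up on the "testing + resetting" question in the thread above, a minimal sketch of how the context manager keeps the global flag scoped. It assumes a process group is already initialized, since set_enable_fsdp_fp8_all_gather calls torch.distributed.barrier(); the function name is illustrative.

import float8_experimental.config as config

from torchtitan.float8_linear import set_enable_fsdp_fp8_all_gather


def check_flag_is_scoped():
    original = config.enable_fsdp_fp8_all_gather
    with set_enable_fsdp_fp8_all_gather(True):
        assert config.enable_fsdp_fp8_all_gather is True
        # model construction / swap_linear_with_float8_linear would run here
    # On exit the previous value is restored, so later tests are unaffected.
    assert config.enable_fsdp_fp8_all_gather is original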
17 changes: 17 additions & 0 deletions train.py
@@ -19,6 +19,7 @@

import torch
import torch.nn.functional as F
from float8_experimental.fsdp_utils import precompute_float8_dynamic_scale_for_fsdp
wanchaol (Contributor) commented on Jul 16, 2024:

@weifengpy I think we should hide this import in the path where the enable_fp8_allgather path happens.

The problem is that for every feature requiring an additional install of another dependency, we should hide the import in the path that uses it instead of importing it globally; otherwise, users who didn't install float8_experimental would fail to train after they rebase.

Please submit a follow-up PR to fix this.

weifengpy (Contributor, Author) replied:

Got you. I am moving it from top level into the if-else now: #464.

Thanks for the timely reminder.

from torch.distributed import destroy_process_group
from torch.distributed.checkpoint.stateful import Stateful
from torch.distributed.elastic.multiprocessing.errors import record
@@ -218,6 +219,11 @@ def loss_fn(pred, labels):
# apply fp8 linear module swap
if job_config.training.fp8_linear:
build_fp8_linear(whole_model, job_config)
else:
Contributor commented:

Can remove this in favor of simplicity if it is a no-op flag when fp8_linear=False.

weifengpy (Contributor, Author) replied:

Removed the ValueError on enable_fp8_linear=False.

if job_config.training.enable_fsdp_fp8_all_gather:
raise ValueError(
"enable_fsdp_fp8_all_gather can only be used with fp8_linear"
)

# log model size
model_param_count = get_num_params(whole_model)
@@ -398,6 +404,17 @@ def loss_fn(pred, labels):
optimizers.step()
lr_schedulers.step()

weifengpy (Contributor, Author) commented:

Add a comment to explain precompute_float8_dynamic_scale_for_fsdp.

weifengpy (Contributor, Author) replied:

Done.

if job_config.training.precompute_float8_dynamic_scale_for_fsdp:
Contributor commented:

Discussed offline: can refactor to make it simpler.

weifengpy (Contributor, Author) commented on Jul 16, 2024:

Removed the ValueError when enable_fp8_linear/enable_fsdp_fp8_all_gather=False.

if (not job_config.training.fp8_linear) or (
not job_config.training.enable_fsdp_fp8_all_gather
):
raise ValueError(
"precompute_float8_dynamic_scale_for_fsdp is only "
"supported when fp8_linear and "
"enable_fsdp_fp8_all_gather are both enabled"
)
precompute_float8_dynamic_scale_for_fsdp(model)
Contributor commented:

Maybe a noob question: could you briefly explain what this is doing?
I wonder, since we are already using context functions for fp8, whether we could have a context and run it in a .step() function here, just like the optimizer, LR scheduler, and profiler. This would make the code consistent.

weifengpy (Contributor, Author) replied:

> could you briefly explain what this is doing

precompute_float8_dynamic_scale_for_fsdp is a for-loop over model.parameters(). It issues a single all-reduce for all parameters, i.e. abs(max(param)) for param in model.parameters(), and saves the amax/scale as param._precomputed_scale. This speeds up the training loop since we do not need to compute the amax/scale for each parameter inside the loop.

> we are already using context functions for fp8

Do you mean set_enable_fsdp_fp8_all_gather? That is for model initialization, where we swap nn.Linear with the user-defined float8 linear. precompute_float8_dynamic_scale_for_fsdp is for the training loop.
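To make that description concrete, here is a minimal conceptual sketch under stated assumptions: the real helper is float8_experimental.fsdp_utils.precompute_float8_dynamic_scale_for_fsdp and additionally handles FSDP2's sharded (DTensor) weights; the function name, clamp epsilon, and dtype choice below are illustrative.

import torch
import torch.distributed as dist

E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # max representable fp8 e4m3 value


def precompute_scales_sketch(model: torch.nn.Module) -> None:
    params = list(model.parameters())
    if not params:
        return
    # One local abs-max per parameter, stacked so a single collective covers
    # every parameter instead of issuing one all-reduce per weight.
    amax = torch.stack([p.detach().abs().amax().float() for p in params])
    if dist.is_initialized():
        dist.all_reduce(amax, op=dist.ReduceOp.MAX)
    scales = E4M3_MAX / torch.clamp(amax, min=1e-12)
    for p, scale in zip(params, scales):
        # Cache the scale on the parameter, mirroring the
        # param._precomputed_scale attribute described in the comment above.
        p._precomputed_scale = scale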

weifengpy (Contributor, Author) commented:

Per the suggestion, raise an error if use_fp8_linear=False or enable_fsdp_fp8_all_gather=False.

Contributor commented:

noob q: do we eventually want to just put this in fsdp2?

Contributor commented:

It has to be done after the optimizer step (since parameter values change). Are you suggesting running this in the root module's pre-forward?

Contributor commented:

Yeah, anywhere between the (n-1)-th optimizer step and the first all-gather in the n-th step where fsdp2 has control (if there's any).

Contributor commented:

That makes sense. One concern is that FSDP is agnostic to the fp8 all-gather: FSDP does not know that the fsdp_pre_all_gather and fsdp_post_all_gather of the Float8Linear weights are implemented to do fp8 all-gather, so at best the user would still need to register a module forward pre-hook or something to run this method.

yifuwang (Contributor) commented on Jul 15, 2024:

Ah I see. Somehow I thought fsdp2 was fp8-aware.
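A hedged sketch of the forward pre-hook alternative discussed above, assuming the helper accepts the root (FSDP-wrapped) module as it does in this PR; the registration function name is illustrative. The hook refreshes the cached scales between the previous optimizer step and the next forward's first all-gather.

from float8_experimental.fsdp_utils import precompute_float8_dynamic_scale_for_fsdp


def register_precompute_pre_hook(root_module):
    def _pre_forward(module, args):
        # Parameters changed in the previous optimizer step, so refresh the
        # cached amax/scale before FSDP all-gathers the fp8 weights.
        precompute_float8_dynamic_scale_for_fsdp(module)

    # register_forward_pre_hook returns a handle; call handle.remove() to undo.
    return root_module.register_forward_pre_hook(_pre_forward)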


losses_since_last_log.append(loss)

# log metrics