From 8c55263eaf24d759fce08c1ff1cea3539430239b Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang96@gmail.com>
Date: Fri, 30 Aug 2024 10:29:35 -0700
Subject: [PATCH 1/3] Add 3d+compile to test runner (#563)

Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at
bottom):
* #564
* __->__ #563

Add a test to the test runner for 3D parallelism (PP+DP+TP) + torch.compile composability.
---
 test_runner.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/test_runner.py b/test_runner.py
index 245bf0ee..7d6d4063 100755
--- a/test_runner.py
+++ b/test_runner.py
@@ -244,6 +244,21 @@ def build_test_list():
             requires_seed_checkpoint=True,
             ngpu=8,
         ),
+        OverrideDefinitions(
+            [
+                [
+                    "--experimental.pipeline_parallel_degree 2",
+                    "--experimental.pipeline_parallel_split_points layers.4",
+                    "--training.data_parallel_degree 2",
+                    "--training.tensor_parallel_degree 2",
+                    "--training.compile",
+                ],
+            ],
+            "PP+DP+TP 3D test with torch.compile",
+            "3d_compile",
+            requires_seed_checkpoint=True,
+            ngpu=8,
+        ),
         OverrideDefinitions(
             [
                 [

From a99d2e5df7d6df0d99d8d5bd4db99ecb9f1014e6 Mon Sep 17 00:00:00 2001
From: Tianyu Liu <lty@fb.com>
Date: Fri, 30 Aug 2024 13:15:41 -0700
Subject: [PATCH 2/3] remove float8 install as H100 is not available in CI yet

ghstack-source-id: fbbf3fdd2d52334b76e45bac8b22a69de71baa5f
Pull Request resolved: https://github.com/pytorch/torchtitan/pull/565
---
 .github/workflows/integration_test_4gpu.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/integration_test_4gpu.yaml b/.github/workflows/integration_test_4gpu.yaml
index a82bd96c..993ae797 100644
--- a/.github/workflows/integration_test_4gpu.yaml
+++ b/.github/workflows/integration_test_4gpu.yaml
@@ -38,6 +38,5 @@ jobs:
         pip config --user set global.progress_bar off
 
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
-        USE_CPP=0 python -m pip install git+https://github.com/pytorch/ao.git
         mkdir artifacts-to-be-uploaded
         python ./test_runner.py artifacts-to-be-uploaded --ngpu 4

From ac90c36e39c6274f9beaf76922627665b6553905 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 3 Sep 2024 17:28:15 -0700
Subject: [PATCH 3/3] Removed unused dw tensor in Triton RMSNorm

ghstack-source-id: c2337c6f976b41288498b7f3aa9b6f3d54d49ad9
Pull Request resolved: https://github.com/pytorch/torchtitan/pull/567
---
 torchtitan/models/norms.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torchtitan/models/norms.py b/torchtitan/models/norms.py
index 26645330..31527452 100644
--- a/torchtitan/models/norms.py
+++ b/torchtitan/models/norms.py
@@ -284,7 +284,6 @@ def backward(ctx, dy):
 
         M, N = dy.shape
         dx = torch.empty_like(x)
-        dw = torch.empty_like(weight)
 
         sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
         _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)