From 8c55263eaf24d759fce08c1ff1cea3539430239b Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Fri, 30 Aug 2024 10:29:35 -0700 Subject: [PATCH 1/3] Add 3d+compile to test runner (#563) Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): * #564 * __->__ #563 Add test to test runner for 3d + compile composability --- test_runner.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test_runner.py b/test_runner.py index 245bf0ee..7d6d4063 100755 --- a/test_runner.py +++ b/test_runner.py @@ -244,6 +244,21 @@ def build_test_list(): requires_seed_checkpoint=True, ngpu=8, ), + OverrideDefinitions( + [ + [ + "--experimental.pipeline_parallel_degree 2", + "--experimental.pipeline_parallel_split_points layers.4", + "--training.data_parallel_degree 2", + "--training.tensor_parallel_degree 2", + "--training.compile", + ], + ], + "PP+DP+TP 3D test with torch.compile", + "3d_compile", + requires_seed_checkpoint=True, + ngpu=8, + ), OverrideDefinitions( [ [ From a99d2e5df7d6df0d99d8d5bd4db99ecb9f1014e6 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 30 Aug 2024 13:15:41 -0700 Subject: [PATCH 2/3] remove float8 install as H100 is not available in CI yet ghstack-source-id: fbbf3fdd2d52334b76e45bac8b22a69de71baa5f Pull Request resolved: https://github.com/pytorch/torchtitan/pull/565 --- .github/workflows/integration_test_4gpu.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/integration_test_4gpu.yaml b/.github/workflows/integration_test_4gpu.yaml index a82bd96c..993ae797 100644 --- a/.github/workflows/integration_test_4gpu.yaml +++ b/.github/workflows/integration_test_4gpu.yaml @@ -38,6 +38,5 @@ jobs: pip config --user set global.progress_bar off python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121 - USE_CPP=0 python -m pip install git+https://github.com/pytorch/ao.git mkdir artifacts-to-be-uploaded python ./test_runner.py artifacts-to-be-uploaded --ngpu 4 From ac90c36e39c6274f9beaf76922627665b6553905 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Tue, 3 Sep 2024 17:28:15 -0700 Subject: [PATCH 3/3] Removed unused dw tensor in Triton RMSNorm ghstack-source-id: c2337c6f976b41288498b7f3aa9b6f3d54d49ad9 Pull Request resolved: https://github.com/pytorch/torchtitan/pull/567 --- torchtitan/models/norms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchtitan/models/norms.py b/torchtitan/models/norms.py index 26645330..31527452 100644 --- a/torchtitan/models/norms.py +++ b/torchtitan/models/norms.py @@ -284,7 +284,6 @@ def backward(ctx, dy): M, N = dy.shape dx = torch.empty_like(x) - dw = torch.empty_like(weight) sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)