Skip to content

Commit

Permalink
Update on "[WIP] zero bubble"
Browse files Browse the repository at this point in the history
To run the zero-bubble (zb) test:
`python test_runner.py ./test_out --test pp_zb`

Internal MAST run:
```
torchx run mast.py:train --additional_folders /home/howardhuang/local/torchtitan --twtask_bootstrap_script run_torchtitan.sh --h "grandteton" --nodes 8 train_configs/debug_model_3d_mast.toml
```
  


[ghstack-poisoned]
  • Loading branch information
H-Huang committed Sep 26, 2024
2 parents d79c77f + d59b1af commit 475b07d
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions train_configs/debug_model_3d_mast.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,18 @@ dataset = "c4"
[experimental]
# Pipeline-parallel presets. Keep exactly ONE degree/split_points pair active:
# TOML rejects a key that is defined more than once in the same table.
# pipeline_parallel_degree = 1

# for 70B (active preset)
# NOTE(review): 7 split points presumably yield 8 stages, i.e. 2 stages per
# pipeline rank at degree 4 -- confirm against the schedule's requirements.
pipeline_parallel_degree = 4
pipeline_parallel_split_points = "layers.10,layers.20,layers.30,layers.40,layers.50,layers.60,layers.70"
# pipeline_parallel_schedule="interleaved_1f1b"
# "zb" selects the zero-bubble schedule under test (WIP).
pipeline_parallel_schedule = "zb"

# for 405B (all alternatives disabled; to use one, uncomment a single
# degree/split_points pair below and comment out the 70B pair above)
# pipeline_parallel_degree = 4
# pipeline_parallel_split_points="layers.16,layers.32,layers.48,layers.64,layers.80,layers.96,layers.112"
# pipeline_parallel_degree = 8
# pipeline_parallel_split_points="layers.8,layers.16,layers.24,layers.32,layers.40,layers.48,layers.56,layers.64,layers.72,layers.80,layers.88,layers.96,layers.104,layers.112,layers.120"
# pipeline_parallel_degree = 16
# pipeline_parallel_split_points="layers.4,layers.8,layers.12,layers.16,layers.20,layers.24,layers.28,layers.32,layers.36,layers.40,layers.44,layers.48,layers.52,layers.56,layers.60,layers.64,layers.68,layers.72,layers.76,layers.80,layers.84,layers.88,layers.92,layers.96,layers.100,layers.104,layers.108,layers.112,layers.116,layers.120,layers.124"
# pipeline_parallel_degree = 32
# pipeline_parallel_split_points="layers.2,layers.4,layers.6,layers.8,layers.10,layers.12,layers.14,layers.16,layers.18,layers.20,layers.22,layers.24,layers.26,layers.28,layers.30,layers.32,layers.34,layers.36,layers.38,layers.40,layers.42,layers.44,layers.46,layers.48,layers.50,layers.52,layers.54,layers.56,layers.58,layers.60,layers.62,layers.64,layers.66,layers.68,layers.70,layers.72,layers.74,layers.76,layers.78,layers.80,layers.82,layers.84,layers.86,layers.88,layers.90,layers.92,layers.94,layers.96,layers.98,layers.100,layers.102,layers.104,layers.106,layers.108,layers.110,layers.112,layers.114,layers.116,layers.118,layers.120,layers.122,layers.124,layers.126"
# pipeline_parallel_schedule="interleaved_1f1b"
Expand Down

0 comments on commit 475b07d

Please sign in to comment.