From 8afa5452441b193cf66f7e2a12a44950764dc8c2 Mon Sep 17 00:00:00 2001
From: Andrew Gu
Date: Mon, 19 Aug 2024 11:14:20 -0700
Subject: [PATCH] [Not for land] Added GPT-2-like config

[ghstack-poisoned]
---
 torchtitan/models/llama/__init__.py |  1 +
 train_configs/gpt2.toml             | 54 +++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)
 create mode 100644 train_configs/gpt2.toml

diff --git a/torchtitan/models/llama/__init__.py b/torchtitan/models/llama/__init__.py
index 887a96cd..fdeff177 100644
--- a/torchtitan/models/llama/__init__.py
+++ b/torchtitan/models/llama/__init__.py
@@ -30,6 +30,7 @@ llama3_configs = {
     "debugmodel": ModelArgs(dim=256, n_layers=8, n_heads=16, rope_theta=500000),
+    "gpt2": ModelArgs(dim=768, n_layers=12, n_heads=12, rope_theta=500000),
     "8B": ModelArgs(
         dim=4096,
         n_layers=32,
diff --git a/train_configs/gpt2.toml b/train_configs/gpt2.toml
new file mode 100644
index 00000000..41222619
--- /dev/null
+++ b/train_configs/gpt2.toml
@@ -0,0 +1,54 @@
+# torchtitan Config.toml
+# NOTE: this toml config is a preset for 64 A100 GPUs.
+
+[job]
+dump_folder = "./outputs"
+description = "GPT-2 training"
+
+[profiling]
+enable_profiling = false
+save_traces_folder = "profile_trace"
+profile_freq = 10
+
+[metrics]
+log_freq = 10
+enable_tensorboard = false
+save_tb_folder = "tb"
+
+[model]
+name = "llama3"
+flavor = "gpt2"
+norm_type = "rmsnorm"  # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
+tokenizer_path = "./torchtitan/datasets/tokenizer/original/tokenizer.model"
+
+[optimizer]
+name = "AdamW"
+lr = 3e-4
+fused = true
+
+[training]
+batch_size = 16
+seq_len = 8192
+warmup_steps = 200  # lr scheduler warm up
+max_norm = 1.0  # grad norm clipping
+steps = 100
+data_parallel_degree = -1
+tensor_parallel_degree = 1
+compile = true
+dataset = "c4"
+
+[experimental]
+pipeline_parallel_degree = 1
+
+[checkpoint]
+enable_checkpoint = false
+folder = "checkpoint"
+interval_type = "steps"
+interval = 500
+model_weights_only = false
+export_dtype = "float32"
+async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]
+
+[activation_checkpoint]
+mode = 'none'  # ['none', 'selective', 'full']
+selective_ac_option = 'op'  # 'int' = ac every positive int layer or 'op', ac based on ops policy
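
Note (reviewer sketch, not part of the patch): the "gpt2" flavor reuses the existing
Llama-3 code path, so it resolves through the same llama3_configs registry that the
TOML's [model].flavor field selects at startup. The snippet below is a minimal check of
what this patch registers, assuming only the ModelArgs fields visible in the diff; the
launch command in the comment follows the repo's usual CONFIG_FILE=... ./run_llama_train.sh
pattern and is an assumption, not something this patch adds.

    # Sketch only: confirm what the "gpt2" flavor added in this patch maps to.
    # Assumed launch command (repo's usual pattern, not added here):
    #   CONFIG_FILE="./train_configs/gpt2.toml" ./run_llama_train.sh
    from torchtitan.models.llama import llama3_configs

    model_args = llama3_configs["gpt2"]
    # GPT-2-small-like sizes (dim=768, 12 layers, 12 heads), but built from the
    # Llama-3 blocks this repo trains (RMSNorm per the TOML, RoPE with theta=500000).
    assert (model_args.dim, model_args.n_layers, model_args.n_heads) == (768, 12, 12)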