diff --git a/create_seed_checkpoint.sh b/create_seed_checkpoint.sh index 1abc77ec..3dfbde71 100755 --- a/create_seed_checkpoint.sh +++ b/create_seed_checkpoint.sh @@ -18,7 +18,6 @@ set -ex -export USE_LIBUV=1 TRAINER_DIR=${1:-/home/$USER/local/torchtitan} NGPU=1 LOG_RANK=0 diff --git a/multinode_trainer.slurm b/multinode_trainer.slurm index 09b94ef1..4bc495d3 100644 --- a/multinode_trainer.slurm +++ b/multinode_trainer.slurm @@ -53,7 +53,6 @@ export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond" export NCCL_BUFFSIZE=2097152 #export TORCH_DIST_INIT_BARRIER=1 export FI_EFA_SET_CUDA_SYNC_MEMOPS=0 -#export USE_LIBUV=1 CONFIG_FILE=${CONFIG_FILE:-"./train_configs/llama2_13b.toml"} dcgmi profile --pause