#!/bin/bash
#SBATCH --job-name=llama7b-2-multinode
#SBATCH --nodes=2
#SBATCH --mem=0
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:a100:4
#SBATCH --output=llama-2-7b.%j.out
#SBATCH --error=llama-2-7b.%j.err
#SBATCH --partition=a100
#SBATCH --qos=your_assigned_qos # CHANGE
#SBATCH --open-mode=append
#SBATCH --wait-all-nodes=1
#SBATCH --time=3-00
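# Rendezvous setup: the batch script runs on the first allocated node, so that
# node serves as the c10d rendezvous host. Binding a socket to port 0 lets the
# OS pick a free ephemeral port (there is a small race between closing the
# probe socket and torchrun binding the port, but this is the usual trick).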
export MASTER_ADDR="$(hostname --fqdn)"
export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')"
export RDZV_ID=$RANDOM
echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT"
export NCCL_IB_DISABLE=1 # Our cluster has no InfiniBand, so disable NCCL's IB transport.
export NCCL_DEBUG=WARN
export NCCL_DEBUG_SUBSYS=WARN
# export TORCH_DISTRIBUTED_DEBUG=DETAIL # Uncomment these flags for debugging communication
# export TORCH_CPP_LOG_LEVEL=INFO
export LOGLEVEL=INFO
export PYTHONFAULTHANDLER=1
# export CUDA_LAUNCH_BLOCKING=1 # Uncomment to serialize CUDA kernel launches when debugging
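# Launch one task per node; each task runs torchrun, which spawns one worker
# process per GPU. All workers rendezvous at $MASTER_ADDR:$MASTER_PORT using
# the c10d backend. Note: --gres below reuses the partition name as the GPU
# type, which works here only because both are "a100".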
srun -p $SLURM_JOB_PARTITION \
-c $SLURM_CPUS_ON_NODE \
-N $SLURM_JOB_NUM_NODES \
--mem=0 \
--gres=gpu:$SLURM_JOB_PARTITION:$SLURM_GPUS_ON_NODE \
bash -c 'torchrun \
--nproc-per-node=$SLURM_GPUS_ON_NODE \
--nnodes=$SLURM_JOB_NUM_NODES \
--rdzv-endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv-id $RDZV_ID \
--rdzv-backend c10d \
llama_example.py --yaml_path ../configs/config.yaml'
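
# Submit from a login node with:
#   sbatch launch_multinode.sh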