-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathmultinode.sh
29 lines (24 loc) · 858 Bytes
/
multinode.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/bin/bash
# Slurm batch script for multi-node McQuic pre-training (srun + torchrun).
#
# The #SBATCH lines below are read by sbatch at submission time:
#   -J                job name
#   -p                partition to submit to
#   -N / --ntasks     2 nodes and 2 tasks in total
#   --gres            8 GPUs of type a800
#   --cpus-per-task   48 CPUs for each task
#   --output/--error  stdout / stderr files; %j is replaced by the job id
# NOTE(review): the log files go under slurm/, which this script does not
# create -- presumably the directory must exist before submission; confirm.
#SBATCH -J mcquic_pretraining
#SBATCH -p A800
#SBATCH -N 2
#SBATCH --ntasks=2
#SBATCH --gres=gpu:a800:8
#SBATCH --cpus-per-task=48
#SBATCH --output=slurm/slurm-%j.out
#SBATCH --error=slurm/slurm-%j.err
# --ntasks must be equal to -N: srun starts one torchrun launcher per task,
# and torchrun is started with --nnodes 2, i.e. one launcher per node.
# Runtime environment for the training job.
# HF_ENDPOINT points Hugging Face tooling at the hf-mirror.com mirror.
export HF_ENDPOINT="https://hf-mirror.com"
# Put the McQuic checkout on Python's module search path.
export PYTHONPATH="/ssdfs/datahome/tj24011/workspace/McQuic"

# Best-effort, as before: the module system reports its own errors.
module load cuda/12.1

# Activate the "mcquic" conda environment. Abort on failure instead of
# silently continuing and launching the training with the wrong interpreter.
conda_profile="/ssdfs/datahome/tj24011/software/miniconda3/etc/profile.d/conda.sh"
if [[ ! -r "${conda_profile}" ]]; then
  printf 'error: conda profile script not readable: %s\n' "${conda_profile}" >&2
  exit 1
fi
# shellcheck source=/dev/null
source "${conda_profile}" || {
  printf 'error: failed to source %s\n' "${conda_profile}" >&2
  exit 1
}
conda activate mcquic || {
  printf 'error: failed to activate conda environment "mcquic"\n' >&2
  exit 1
}
# Launch one torchrun agent per Slurm task through srun. Each agent starts
# 8 workers (--nproc_per_node) and the 2 agents (--nnodes) meet at a c10d
# rendezvous on port 19936 of the host running this batch script.
#
# --max_restarts 3: graceful restart budget, for handling data issues.
# $RANDOM and $HOSTNAME are expanded once by this batch shell, before srun
# runs, so every agent receives the same rendezvous id and endpoint.
# NOTE(review): 8 workers x OMP_NUM_THREADS=16 is more threads than the 48
# CPUs requested per task -- confirm this oversubscription is intended.
mcquic_root="/ssdfs/datahome/tj24011/workspace/McQuic"

# Since Slurm 22.05 srun no longer inherits --cpus-per-task from sbatch, so
# pass the value on explicitly whenever the batch environment provides it.
srun_args=()
if [[ -n "${SLURM_CPUS_PER_TASK:-}" ]]; then
  srun_args+=("--cpus-per-task=${SLURM_CPUS_PER_TASK}")
fi

TOKENIZERS_PARALLELISM=false NCCL_P2P_LEVEL=NVL OMP_NUM_THREADS=16 \
  srun "${srun_args[@]}" torchrun \
  --nnodes 2 \
  --max_restarts 3 \
  --nproc_per_node 8 \
  --rdzv_id "${RANDOM}" \
  --rdzv_backend c10d \
  --rdzv_endpoint "${HOSTNAME}:19936" \
  "${mcquic_root}/mcquic/train/__main__.py" \
  "${mcquic_root}/configs/a800_16.yaml"