Commit: perlmutter runs
Shuai Jiang committed Jun 11, 2024
1 parent 8489b69 commit 0eae356
Showing 7 changed files with 228 additions and 9 deletions.
6 changes: 3 additions & 3 deletions examples/bert-training/src/get_dataset.py
@@ -140,8 +140,8 @@ def obtain_dataset(percent_data:float = 0.01, seq_len: int = 128):
        split = f'train[:{int(percent_data)}]'
    else:
        split = f'train[:{int(percent_data * 100)}%]'
-    bookcorpus_train = load_dataset('bookcorpus', split=split)
-    wiki_train = load_dataset("wikipedia", "20220301.simple", split=split)
+    bookcorpus_train = load_dataset('bookcorpus', split=split, trust_remote_code=True)
+    wiki_train = load_dataset("wikipedia", "20220301.simple", split=split, trust_remote_code=True)

    # bookcorpus_train = load_dataset('bookcorpus', split=f'train[0:25000]')
    # wiki_train = load_dataset("wikipedia", "20220301.simple", split=f'train[0:25000]')
@@ -172,4 +172,4 @@ def filter_short(examples):
    )
    # print(tokenized_datasets)

-    return MyBERTDataset(tokenized_datasets, tokenizer, seq_len), tokenizer.vocab_size
\ No newline at end of file
+    return MyBERTDataset(tokenized_datasets, tokenizer, seq_len), tokenizer.vocab_size
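Note on the change above: both bookcorpus and the 20220301.simple Wikipedia configuration are script-based datasets, and newer releases of the Hugging Face datasets library require an explicit opt-in before executing a dataset's loading script; trust_remote_code=True is that opt-in. A minimal, self-contained sketch of the updated call pattern (the function name load_corpora is illustrative, not part of this file):

from datasets import load_dataset

def load_corpora(split: str = 'train[:1%]'):
    # trust_remote_code=True opts in to running the datasets' bundled
    # loading scripts; recent `datasets` releases require this explicitly.
    bookcorpus = load_dataset('bookcorpus', split=split, trust_remote_code=True)
    wiki = load_dataset('wikipedia', '20220301.simple', split=split, trust_remote_code=True)
    return bookcorpus, wiki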
2 changes: 1 addition & 1 deletion examples/bert-training/src/main.py
@@ -263,7 +263,7 @@ def test(rank, model, test_loader, compose, device):
# Parallel printing helper function
def root_print(rank, s):
    if rank == 0:
-        print(s)
+        print(s, flush=True)

class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''
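The flush added to root_print matters under srun: Python block-buffers stdout when it is redirected to a file, so without an explicit flush, rank-0 log lines can arrive late or be lost if the job is killed. A minimal sketch of the helper, plus a launcher-side alternative (the -u flag and PYTHONUNBUFFERED are standard Python options, not something this commit adds):

def root_print(rank, s):
    # Print only on rank 0; flush so srun-captured logs stay ordered.
    if rank == 0:
        print(s, flush=True)

# Launcher-side alternative: disable buffering for every rank instead,
# e.g. `srun python -u main.py` or `export PYTHONUNBUFFERED=1`.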
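The body of ScheduledOptim is collapsed in this view. For reference, the class name and docstring match the widely used wrapper from the attention-is-all-you-need-pytorch codebase, which applies the Transformer paper's inverse-square-root warmup schedule; the sketch below follows that reference implementation and is not necessarily this repository's exact code:

class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, lr_mul, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.lr_mul = lr_mul
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step_and_update_lr(self):
        # Update the learning rate, then take an optimizer step.
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        # lr scale = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        d, n, w = self.d_model, self.n_steps, self.n_warmup_steps
        return (d ** -0.5) * min(n ** -0.5, n * (w ** -1.5))

    def _update_learning_rate(self):
        self.n_steps += 1
        lr = self.lr_mul * self._get_lr_scale()
        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr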
47 changes: 47 additions & 0 deletions examples/bert-training/src/pl_n16.sh
@@ -0,0 +1,47 @@
#!/bin/bash
#SBATCH -A m1327
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH --time=00:30:00
#SBATCH -N 4 # Nodes
#SBATCH --ntasks-per-node=4
#SBATCH -c 32 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
#SBATCH --gpus-per-task=1 # Number of GPUs per MPI task; each rank has one
#SBATCH --gpu-bind=none
#SBATCH --output=ml_n16.out
#SBATCH --error=ml_n16.err


export HF_DATASETS_CACHE=$SCRATCH/huggingface_cache
export HF_HOME=$HF_DATASETS_CACHE

export SLURM_CPU_BIND="cores"
module load PrgEnv-gnu cray-mpich cudatoolkit craype-accel-nvidia80 python
conda activate gpu-aware-mpi
export MPICH_GPU_SUPPORT_ENABLED=1

# -------------------- GENERATE TIMINGS ------------------------------ (not accuracy/long runs)
BATCH_SIZE=128
EPOCHS=2
PDATA=2500

# First generate the model; it will automatically leave behind a serial file
#srun -n 8 python main.py --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

# First generate the model; it will automatically leave behind a serial file
#srun -n 8 python main.py --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

# First generate the model; it will automatically leave behind a serial file
srun -n 16 python main.py --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

srun -n 16 python main.py --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
# ----------------------------------------------------------


# https://docs.nersc.gov/development/languages/python/parallel-python/#mpi4py
# "applications using mpi4py must be launched via srun"

#srun ./select_gpu_device python test-gpu-aware-mpi.py
#srun -n 2 -c 32 python main.py --steps 12 --channels 8 --batch-size 50 --log-interval 100 --epochs 20
# srun python main.py
# srun --ntasks 4 --gpus-per-task 1 -c 32 --gpu-bind=none python main_noDP.py > 4.out
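On the mpi4py note above: NERSC requires srun as the launcher for mpi4py applications, and because these scripts use --gpu-bind=none, every rank sees all of its node's GPUs and must bind one itself. A minimal sanity-check sketch under those assumptions (the file name test_srun.py is hypothetical, and binding by global rank modulo GPU count assumes the block task placement used here):

from mpi4py import MPI
import torch

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# With --gpu-bind=none each rank sees every GPU on its node,
# so pick one explicitly instead of letting all ranks share GPU 0.
n_gpus = torch.cuda.device_count()
if n_gpus > 0:
    torch.cuda.set_device(rank % n_gpus)

print(f'rank {rank}/{size} -> cuda device {torch.cuda.current_device()}', flush=True)

# Launch with: srun -n 16 python test_srun.py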
47 changes: 47 additions & 0 deletions examples/bert-training/src/pl_n2.sh
@@ -0,0 +1,47 @@
#!/bin/bash
#SBATCH -A m1327
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH --time=00:30:00
#SBATCH -N 1 # Nodes
#SBATCH --ntasks-per-node=2
#SBATCH -c 64 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
#SBATCH --gpus-per-task=1 # Number of GPUs per MPI task; each rank has one
#SBATCH --gpu-bind=none
#SBATCH --output=ml_n2.out
#SBATCH --error=ml_n2.err


export HF_DATASETS_CACHE=$SCRATCH/huggingface_cache
export HF_HOME=$HF_DATASETS_CACHE

export SLURM_CPU_BIND="cores"
module load PrgEnv-gnu cray-mpich cudatoolkit craype-accel-nvidia80 python
conda activate gpu-aware-mpi
export MPICH_GPU_SUPPORT_ENABLED=1

# -------------------- GENERATE TIMINGS ------------------------------ (not accuracy/long runs)
BATCH_SIZE=128
EPOCHS=2
PDATA=2500

# First generate the model; it will automatically leave behind a serial file
#srun -n 2 python main.py --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

# First generate the model; it will automatically leave behind a serial file
#srun -n 2 python main.py --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

# First generate the model; it will automatically leave behind a serial file
srun -n 2 python main.py --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

srun -n 2 python main.py --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
# ----------------------------------------------------------


# https://docs.nersc.gov/development/languages/python/parallel-python/#mpi4py
# "applications using mpi4py must be launched via srun"

#srun ./select_gpu_device python test-gpu-aware-mpi.py
#srun -n 2 -c 32 python main.py --steps 12 --channels 8 --batch-size 50 --log-interval 100 --epochs 20
# srun python main.py
# srun --ntasks 4 --gpus-per-task 1 -c 32 --gpu-bind=none python main_noDP.py > 4.out
47 changes: 47 additions & 0 deletions examples/bert-training/src/pl_n4.sh
@@ -0,0 +1,47 @@
#!/bin/bash
#SBATCH -A m1327
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH --time=00:25:00
#SBATCH -N 1 # Nodes
#SBATCH --ntasks-per-node=4
#SBATCH -c 32 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
#SBATCH --gpus-per-task=1 # Number of GPUs per MPI task; each rank has one
#SBATCH --gpu-bind=none # This seems to be required; reason unclear
#SBATCH --output=ml_n4.out
#SBATCH --error=ml_n4.err


export HF_DATASETS_CACHE=$SCRATCH/huggingface_cache
export HF_HOME=$HF_DATASETS_CACHE

export SLURM_CPU_BIND="cores"
module load PrgEnv-gnu cray-mpich cudatoolkit craype-accel-nvidia80 python
conda activate gpu-aware-mpi
export MPICH_GPU_SUPPORT_ENABLED=1

# -------------------- GENERATE TIMINGS ------------------------------ (not accuracy/long runs)
BATCH_SIZE=128
EPOCHS=2
PDATA=2500

# First generate the model; it will automatically leave behind a serial file
#srun -n 4 python main.py --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

# First generate the model; it will automatically leave behind a serial file
#srun -n 4 python main.py --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

# First generate the model; it will automatically leave behind a serial file
srun -n 4 python main.py --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

srun -n 4 python main.py --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
# ----------------------------------------------------------


# https://docs.nersc.gov/development/languages/python/parallel-python/#mpi4py
# "applications using mpi4py must be launched via srun"

#srun ./select_gpu_device python test-gpu-aware-mpi.py
#srun -n 2 -c 32 python main.py --steps 12 --channels 8 --batch-size 50 --log-interval 100 --epochs 20
# srun python main.py
# srun --ntasks 4 --gpus-per-task 1 -c 32 --gpu-bind=none python main_noDP.py > 4.out
47 changes: 47 additions & 0 deletions examples/bert-training/src/pl_n8.sh
@@ -0,0 +1,47 @@
#!/bin/bash
#SBATCH -A m1327
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH --time=00:25:00
#SBATCH -N 2 # Nodes
#SBATCH --ntasks-per-node=4
#SBATCH -c 32 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
#SBATCH --gpus-per-task=1 # Number of GPUs per MPI task; each rank has one
#SBATCH --gpu-bind=none
#SBATCH --output=ml_n8.out
#SBATCH --error=ml_n8.err


export HF_DATASETS_CACHE=$SCRATCH/huggingface_cache
export HF_HOME=$HF_DATASETS_CACHE

export SLURM_CPU_BIND="cores"
module load PrgEnv-gnu cray-mpich cudatoolkit craype-accel-nvidia80 python
conda activate gpu-aware-mpi
export MPICH_GPU_SUPPORT_ENABLED=1

# -------------------- GENERATE TIMINGS ------------------------------ (not accuracy/long runs)
BATCH_SIZE=128
EPOCHS=2
PDATA=2500

# First generate the model; it will automatically leave behind a serial file
#srun -n 8 python main.py --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

# First generate the model; it will automatically leave behind a serial file
#srun -n 8 python main.py --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

# First generate the model; it will automatically leave behind a serial file
srun -n 8 python main.py --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1

srun -n 8 python main.py --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
# ----------------------------------------------------------


# https://docs.nersc.gov/development/languages/python/parallel-python/#mpi4py
# "applications using mpi4py must be launched via srun"

#srun ./select_gpu_device python test-gpu-aware-mpi.py
#srun -n 2 -c 32 python main.py --steps 12 --channels 8 --batch-size 50 --log-interval 100 --epochs 20
# srun python main.py
# srun --ntasks 4 --gpus-per-task 1 -c 32 --gpu-bind=none python main_noDP.py > 4.out
41 changes: 36 additions & 5 deletions examples/bert-training/src/pl_serial.sh
@@ -2,24 +2,55 @@
#SBATCH -A m1327
#SBATCH -C gpu
#SBATCH -q regular
-#SBATCH --time=00:10:00
+#SBATCH --time=00:25:00
#SBATCH -N 1 # Nodes
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
+#SBATCH --ntasks-per-node=1
+#SBATCH -c 128 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
#SBATCH --gpus-per-task=1 # Number of GPUs per MPI task; each rank has one
#SBATCH --gpu-bind=none
#SBATCH --output=ml_serial.out
#SBATCH --error=ml_serial.err


-export SLURM_CPU_BIND="cores"
+export HF_DATASETS_CACHE=$SCRATCH/huggingface_cache
+export HF_HOME=$HF_DATASETS_CACHE

+export SLURM_CPU_BIND="cores"
module load PrgEnv-gnu cray-mpich cudatoolkit craype-accel-nvidia80 python
conda activate gpu-aware-mpi
export MPICH_GPU_SUPPORT_ENABLED=1

# -------------------- GENERATE TIMINGS ------------------------------ (not accuracy/long runs)
BATCH_SIZE=128
EPOCHS=2
PDATA=2500

# First generate the model; it will automatically leave behind a serial file
#srun -n 1 python main.py --serial-file True --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
#python main_serial.py --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
#rm serialnet_bert_32

# First generate the model; it will automatically leave behind a serial file
#srun -n 1 python main.py --serial-file True --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
#python main_serial.py --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
#rm serialnet_bert_64

# First generate the model; it will automatically leave behind a serial file
srun -n 1 python main.py --serial-file True --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
python main_serial.py --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
rm serialnet_bert_128

srun -n 1 python main.py --serial-file True --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
python main_serial.py --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
rm serialnet_bert_192

# ----------------------------------------------------------


# https://docs.nersc.gov/development/languages/python/parallel-python/#mpi4py
# "applications using mpi4py must be launched via srun"

#srun ./select_gpu_device python test-gpu-aware-mpi.py
#srun -n 2 -c 32 python main.py --steps 12 --channels 8 --batch-size 50 --log-interval 100 --epochs 20
-srun python main.py
+# srun python main.py
# srun --ntasks 4 --gpus-per-task 1 -c 32 --gpu-bind=none python main_noDP.py > 4.out
