diff --git a/examples/bert-training/src/get_dataset.py b/examples/bert-training/src/get_dataset.py
index cd024ab..02ed5c0 100644
--- a/examples/bert-training/src/get_dataset.py
+++ b/examples/bert-training/src/get_dataset.py
@@ -140,8 +140,8 @@ def obtain_dataset(percent_data:float = 0.01, seq_len: int = 128):
         split = f'train[:{int(percent_data)}]'
     else:
         split = f'train[:{int(percent_data * 100)}%]'
-    bookcorpus_train = load_dataset('bookcorpus', split=split)
-    wiki_train = load_dataset("wikipedia", "20220301.simple", split=split)
+    bookcorpus_train = load_dataset('bookcorpus', split=split, trust_remote_code=True)
+    wiki_train = load_dataset("wikipedia", "20220301.simple", split=split, trust_remote_code=True)
 
     # bookcorpus_train = load_dataset('bookcorpus', split=f'train[0:25000]')
     # wiki_train = load_dataset("wikipedia", "20220301.simple", split=f'train[0:25000]')
@@ -172,4 +172,4 @@ def filter_short(examples):
     )
 
     # print(tokenized_datasets)
-    return MyBERTDataset(tokenized_datasets, tokenizer, seq_len), tokenizer.vocab_size
\ No newline at end of file
+    return MyBERTDataset(tokenized_datasets, tokenizer, seq_len), tokenizer.vocab_size
diff --git a/examples/bert-training/src/main.py b/examples/bert-training/src/main.py
index bcec471..919fd0b 100644
--- a/examples/bert-training/src/main.py
+++ b/examples/bert-training/src/main.py
@@ -263,7 +263,7 @@ def test(rank, model, test_loader, compose, device):
 # Parallel printing helper function
 def root_print(rank, s):
     if rank == 0:
-        print(s)
+        print(s, flush=True)
 
 class ScheduledOptim():
     '''A simple wrapper class for learning rate scheduling'''
diff --git a/examples/bert-training/src/pl_n16.sh b/examples/bert-training/src/pl_n16.sh
new file mode 100644
index 0000000..5de1089
--- /dev/null
+++ b/examples/bert-training/src/pl_n16.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#SBATCH -A m1327
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH --time=00:30:00
+#SBATCH -N 4 # Nodes
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
+#SBATCH --gpus-per-task=1 # Number of GPUs per MPI task; each rank has one
+#SBATCH --gpu-bind=none
+#SBATCH --output=ml_n16.out
+#SBATCH --error=ml_n16.err
+
+
+export HF_DATASETS_CACHE=$SCRATCH/huggingface_cache
+export HF_HOME=$HF_DATASETS_CACHE
+
+export SLURM_CPU_BIND="cores"
+module load PrgEnv-gnu cray-mpich cudatoolkit craype-accel-nvidia80 python
+conda activate gpu-aware-mpi
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+# -------------------- GENERATE TIMINGS ------------------------------ (not accuracy/long runs)
+BATCH_SIZE=128
+EPOCHS=2
+PDATA=2500
+
+# First generate the model; will auto leave with serial file
+#srun -n 8 python main.py --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+# First generate the model; will auto leave with serial file
+#srun -n 8 python main.py --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+# First generate the model; will auto leave with serial file
+srun -n 16 python main.py --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+srun -n 16 python main.py --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+# ----------------------------------------------------------
+
+
+# https://docs.nersc.gov/development/languages/python/parallel-python/#mpi4py
+# "applications using mpi4py must be launched via srun"
+
+#srun ./select_gpu_device python test-gpu-aware-mpi.py
+#srun -n 2 -c 32 python main.py --steps 12 --channels 8 --batch-size 50 --log-interval 100 --epochs 20
+# srun python main.py
+# srun --ntasks 4 --gpus-per-task 1 -c 32 --gpu-bind=none python main_noDP.py > 4.out
diff --git a/examples/bert-training/src/pl_n2.sh b/examples/bert-training/src/pl_n2.sh
new file mode 100644
index 0000000..b0871a8
--- /dev/null
+++ b/examples/bert-training/src/pl_n2.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#SBATCH -A m1327
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH --time=00:30:00
+#SBATCH -N 1 # Nodes
+#SBATCH --ntasks-per-node=2
+#SBATCH -c 64 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
+#SBATCH --gpus-per-task=1 # Number of GPUs per MPI task; each rank has one
+#SBATCH --gpu-bind=none
+#SBATCH --output=ml_n2.out
+#SBATCH --error=ml_n2.err
+
+
+export HF_DATASETS_CACHE=$SCRATCH/huggingface_cache
+export HF_HOME=$HF_DATASETS_CACHE
+
+export SLURM_CPU_BIND="cores"
+module load PrgEnv-gnu cray-mpich cudatoolkit craype-accel-nvidia80 python
+conda activate gpu-aware-mpi
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+# -------------------- GENERATE TIMINGS ------------------------------ (not accuracy/long runs)
+BATCH_SIZE=128
+EPOCHS=2
+PDATA=2500
+
+# First generate the model; will auto leave with serial file
+#srun -n 2 python main.py --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+# First generate the model; will auto leave with serial file
+#srun -n 2 python main.py --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+# First generate the model; will auto leave with serial file
+srun -n 2 python main.py --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+srun -n 2 python main.py --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+# ----------------------------------------------------------
+
+
+# https://docs.nersc.gov/development/languages/python/parallel-python/#mpi4py
+# "applications using mpi4py must be launched via srun"
+
+#srun ./select_gpu_device python test-gpu-aware-mpi.py
+#srun -n 2 -c 32 python main.py --steps 12 --channels 8 --batch-size 50 --log-interval 100 --epochs 20
+# srun python main.py
+# srun --ntasks 4 --gpus-per-task 1 -c 32 --gpu-bind=none python main_noDP.py > 4.out
diff --git a/examples/bert-training/src/pl_n4.sh b/examples/bert-training/src/pl_n4.sh
new file mode 100644
index 0000000..f1c3f0c
--- /dev/null
+++ b/examples/bert-training/src/pl_n4.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#SBATCH -A m1327
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH --time=00:25:00
+#SBATCH -N 1 # Nodes
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
+#SBATCH --gpus-per-task=1 # Number of GPUs per MPI task; each rank has one
+#SBATCH --gpu-bind=none # This seems important; idk why
+#SBATCH --output=ml_n4.out
+#SBATCH --error=ml_n4.err
+
+
+export HF_DATASETS_CACHE=$SCRATCH/huggingface_cache
+export HF_HOME=$HF_DATASETS_CACHE
+
+export SLURM_CPU_BIND="cores"
+module load PrgEnv-gnu cray-mpich cudatoolkit craype-accel-nvidia80 python
+conda activate gpu-aware-mpi
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+# -------------------- GENERATE TIMINGS ------------------------------ (not accuracy/long runs)
+BATCH_SIZE=128
+EPOCHS=2
+PDATA=2500
+
+# First generate the model; will auto leave with serial file
+#srun -n 4 python main.py --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+# First generate the model; will auto leave with serial file
+#srun -n 4 python main.py --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+# First generate the model; will auto leave with serial file
+srun -n 4 python main.py --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+srun -n 4 python main.py --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+# ----------------------------------------------------------
+
+
+# https://docs.nersc.gov/development/languages/python/parallel-python/#mpi4py
+# "applications using mpi4py must be launched via srun"
+
+#srun ./select_gpu_device python test-gpu-aware-mpi.py
+#srun -n 2 -c 32 python main.py --steps 12 --channels 8 --batch-size 50 --log-interval 100 --epochs 20
+# srun python main.py
+# srun --ntasks 4 --gpus-per-task 1 -c 32 --gpu-bind=none python main_noDP.py > 4.out
diff --git a/examples/bert-training/src/pl_n8.sh b/examples/bert-training/src/pl_n8.sh
new file mode 100644
index 0000000..d93328a
--- /dev/null
+++ b/examples/bert-training/src/pl_n8.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#SBATCH -A m1327
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH --time=00:25:00
+#SBATCH -N 2 # Nodes
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
+#SBATCH --gpus-per-task=1 # Number of GPUs per MPI task; each rank has one
+#SBATCH --gpu-bind=none
+#SBATCH --output=ml_n8.out
+#SBATCH --error=ml_n8.err
+
+
+export HF_DATASETS_CACHE=$SCRATCH/huggingface_cache
+export HF_HOME=$HF_DATASETS_CACHE
+
+export SLURM_CPU_BIND="cores"
+module load PrgEnv-gnu cray-mpich cudatoolkit craype-accel-nvidia80 python
+conda activate gpu-aware-mpi
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+# -------------------- GENERATE TIMINGS ------------------------------ (not accuracy/long runs)
+BATCH_SIZE=128
+EPOCHS=2
+PDATA=2500
+
+# First generate the model; will auto leave with serial file
+#srun -n 8 python main.py --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+# First generate the model; will auto leave with serial file
+#srun -n 8 python main.py --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+# First generate the model; will auto leave with serial file
+srun -n 8 python main.py --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+
+srun -n 8 python main.py --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --lp-max-levels 2 --lp-bwd-max-iters 1 --lp-fwd-max-iters 2 --lp-cfactor 4 --model_dimension 384 --num_heads 6 --lp-print-level 0 --lp-braid-print-level 0 --Tf 1
+# ----------------------------------------------------------
+
+
+# https://docs.nersc.gov/development/languages/python/parallel-python/#mpi4py
+# "applications using mpi4py must be launched via srun"
+
+#srun ./select_gpu_device python test-gpu-aware-mpi.py
+#srun -n 2 -c 32 python main.py --steps 12 --channels 8 --batch-size 50 --log-interval 100 --epochs 20
+# srun python main.py
+# srun --ntasks 4 --gpus-per-task 1 -c 32 --gpu-bind=none python main_noDP.py > 4.out
diff --git a/examples/bert-training/src/pl_serial.sh b/examples/bert-training/src/pl_serial.sh
index f495fa6..febba37 100644
--- a/examples/bert-training/src/pl_serial.sh
+++ b/examples/bert-training/src/pl_serial.sh
@@ -2,24 +2,55 @@
 #SBATCH -A m1327
 #SBATCH -C gpu
 #SBATCH -q regular
-#SBATCH --time=00:10:00
+#SBATCH --time=00:25:00
 #SBATCH -N 1 # Nodes
-#SBATCH --ntasks-per-node=4
-#SBATCh -c 32 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
+#SBATCH --ntasks-per-node=1
+#SBATCH -c 128 # https://docs.nersc.gov/systems/perlmutter/running-jobs/#1-node-4-tasks-4-gpus-1-gpu-visible-to-each-task
 #SBATCH --gpus-per-task=1 # Number of GPUs per MPI task; each rank has one
 #SBATCH --gpu-bind=none
+#SBATCH --output=ml_serial.out
+#SBATCH --error=ml_serial.err
 
 
-export SLURM_CPU_BIND="cores"
+export HF_DATASETS_CACHE=$SCRATCH/huggingface_cache
+export HF_HOME=$HF_DATASETS_CACHE
 
+export SLURM_CPU_BIND="cores"
 module load PrgEnv-gnu cray-mpich cudatoolkit craype-accel-nvidia80 python
 conda activate gpu-aware-mpi
 export MPICH_GPU_SUPPORT_ENABLED=1
 
+# -------------------- GENERATE TIMINGS ------------------------------ (not accuracy/long runs)
+BATCH_SIZE=128
+EPOCHS=2
+PDATA=2500
+
+# First generate the model; will auto leave with serial file
+#srun -n 1 python main.py --serial-file True --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
+#python main_serial.py --percent-data=$PDATA --steps 32 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
+#rm serialnet_bert_32
+
+# First generate the model; will auto leave with serial file
+#srun -n 1 python main.py --serial-file True --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
+#python main_serial.py --percent-data=$PDATA --steps 64 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
+#rm serialnet_bert_64
+
+# First generate the model; will auto leave with serial file
+srun -n 1 python main.py --serial-file True --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
+python main_serial.py --percent-data=$PDATA --steps 128 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
+rm serialnet_bert_128
+
+srun -n 1 python main.py --serial-file True --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
+python main_serial.py --percent-data=$PDATA --steps 192 --epochs=$EPOCHS --batch-size=$BATCH_SIZE --model_dimension 384 --num_heads 6
+rm serialnet_bert_192
+
+# ----------------------------------------------------------
+
+
 # https://docs.nersc.gov/development/languages/python/parallel-python/#mpi4py
 # "applications using mpi4py must be launched via srun"
 
 #srun ./select_gpu_device python test-gpu-aware-mpi.py
 #srun -n 2 -c 32 python main.py --steps 12 --channels 8 --batch-size 50 --log-interval 100 --epochs 20
-srun python main.py
+# srun python main.py
 # srun --ntasks 4 --gpus-per-task 1 -c 32 --gpu-bind=none python main_noDP.py > 4.out
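
Usage sketch (illustrative only, not part of the patch above): one way to submit these batch scripts on Perlmutter, assuming the gpu-aware-mpi conda environment used in the scripts already exists and $SCRATCH/huggingface_cache is writable. Each job writes to the ml_*.out / ml_*.err files named in its #SBATCH --output / --error directives.

    mkdir -p $SCRATCH/huggingface_cache   # cache directory the scripts point HF_DATASETS_CACHE / HF_HOME at
    sbatch pl_serial.sh                   # serial baseline: 1 task, builds then removes the serialnet_bert_* files
    sbatch pl_n2.sh                       # 2 MPI ranks on 1 node
    sbatch pl_n4.sh                       # 4 MPI ranks on 1 node
    sbatch pl_n8.sh                       # 8 MPI ranks across 2 nodes
    sbatch pl_n16.sh                      # 16 MPI ranks across 4 nodes
    squeue -u $USER                       # monitor the queued timing jobs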