From a6ba3b3b1e2b730c40528dd3acf41d3e300cd8ea Mon Sep 17 00:00:00 2001 From: Ankur Srivastava Date: Fri, 16 Feb 2024 16:39:59 -0800 Subject: [PATCH 1/6] Added BioNemo on ParallelCluster Signed-off-by: Ankur Srivastava --- 3.test_cases/14.bionemo/0.Dockerfile | 107 +++++++++++ 3.test_cases/14.bionemo/1.uniref50.slurm | 24 +++ .../14.bionemo/2.esm1nv_pretrain.slurm | 80 +++++++++ 3.test_cases/14.bionemo/README.md | 169 ++++++++++++++++++ 3.test_cases/14.bionemo/prepare_uniref50.py | 3 + 3.test_cases/14.bionemo/requirements.txt | 10 ++ 6 files changed, 393 insertions(+) create mode 100644 3.test_cases/14.bionemo/0.Dockerfile create mode 100644 3.test_cases/14.bionemo/1.uniref50.slurm create mode 100644 3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm create mode 100644 3.test_cases/14.bionemo/README.md create mode 100644 3.test_cases/14.bionemo/prepare_uniref50.py create mode 100644 3.test_cases/14.bionemo/requirements.txt diff --git a/3.test_cases/14.bionemo/0.Dockerfile b/3.test_cases/14.bionemo/0.Dockerfile new file mode 100644 index 00000000..8d99f1ec --- /dev/null +++ b/3.test_cases/14.bionemo/0.Dockerfile @@ -0,0 +1,107 @@ +FROM nvcr.io/nvidia/clara/bionemo-framework:latest + +ARG EFA_INSTALLER_VERSION=latest +ARG AWS_OFI_NCCL_VERSION=v1.7.3-aws +ARG NCCL_TESTS_VERSION=master +ARG NCCL_VERSION=v2.18.5-1 +RUN apt-get update -y +RUN apt-get remove -y --allow-change-held-packages \ + libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev + +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + git \ + gcc \ + vim \ + kmod \ + openssh-client \ + openssh-server \ + build-essential \ + curl \ + autoconf \ + libtool \ + gdb \ + automake \ + python3-distutils \ + cmake \ + apt-utils \ + devscripts \ + debhelper \ + libsubunit-dev \ + check \ + pkg-config + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config +ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH +ENV PATH /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH +RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ + && python3 /tmp/get-pip.py \ + && pip3 install awscli pynvml + +################################################# +# Install NVIDIA GDRCopy +RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \ + && cd /opt/gdrcopy \ + && make lib_install install \ + && cd /opt/gdrcopy/tests \ + && make \ + && mv gdrcopy_copylat gdrcopy_copybw gdrcopy_sanity gdrcopy_apiperf /usr/bin/ + +################################################# +## Install EFA installer +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf $HOME/aws-efa-installer + +################################################### +## Install NCCL +RUN git clone https://github.com/NVIDIA/nccl -b ${NCCL_VERSION} /opt/nccl \ + && cd /opt/nccl \ + && make -j src.build CUDA_HOME=/usr/local/cuda \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90" + 
+################################################### +## Install AWS-OFI-NCCL plugin +RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y +RUN export OPAL_PREFIX="" \ + && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \ + && cd /opt/aws-ofi-nccl \ + && git checkout ${AWS_OFI_NCCL_VERSION} \ + && ./autogen.sh \ + && ./configure --prefix=/opt/aws-ofi-nccl/install \ + --with-libfabric=/opt/amazon/efa/ \ + --with-cuda=/usr/local/cuda \ + --with-nccl=/opt/nccl/build \ + --with-mpi=/opt/amazon/openmpi/ \ + && make && make install + +################################################### +## Install NCCL-tests +RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && git checkout ${NCCL_TESTS_VERSION} \ + && make MPI=1 \ + MPI_HOME=/opt/amazon/openmpi/ \ + CUDA_HOME=/usr/local/cuda \ + NCCL_HOME=/opt/nccl/build \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90" + + + +RUN rm -rf /var/lib/apt/lists/* +ENV LD_PRELOAD /opt/nccl/build/lib/libnccl.so + + +############################################## +## BioNemo dependencies +COPY requirements.txt /workspace/ +RUN pip3 install -r /workspace/requirements.txt + +COPY prepare_uniref50.py /workspace/bionemo + +WORKDIR /workspace/bionemo/ \ No newline at end of file diff --git a/3.test_cases/14.bionemo/1.uniref50.slurm b/3.test_cases/14.bionemo/1.uniref50.slurm new file mode 100644 index 00000000..92cbda39 --- /dev/null +++ b/3.test_cases/14.bionemo/1.uniref50.slurm @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --nodes=1 # number of nodes +#SBATCH --ntasks-per-node=1 # n tasks per machine (one task per gpu) +#SBATCH --gpus-per-node=8 +#SBATCH --exclusive # exclusive node access +#SBATCH --output slurm-uniref-%j.out + + +########################### +###### User Variables ##### +########################### + +# default variables for Enroot +: "${IMAGE:=$(pwd)/bionemo.sqsh}" +: "${DATA_PATH:=/fsx}" +: "${FSX_MOUNT:=$DATA_PATH:$DATA_PATH}" + +declare -a ARGS=( + --container-image $IMAGE + --container-mount-home + --container-mounts $FSX_MOUNT +) + +srun -l "${ARGS[@]}" python3 /workspace/bionemo/prepare_uniref50.py diff --git a/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm b/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm new file mode 100644 index 00000000..ec56996f --- /dev/null +++ b/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm @@ -0,0 +1,80 @@ +#!/bin/bash +#SBATCH --nodes=4 # number of nodes +#SBATCH --ntasks-per-node=8 # n tasks per machine (one task per gpu) +#SBATCH --gpus-per-node=8 +#SBATCH --exclusive # exclusive node access +#SBATCH --output slurm-esm1nv-train-%j.out + + +########################### +###### User Variables ##### +########################### + +# default variables for Enroot +: "${IMAGE:=$(pwd)/bionemo.sqsh}" +: "${DATA_PATH:=/fsx}" +: "${FSX_MOUNT:=$DATA_PATH:$DATA_PATH}" + +declare -a ARGS=( + --container-image $IMAGE + --container-mount-home + --container-mounts $FSX_MOUNT +) + + +# Training parameters +# ========================= +MICRO_BATCH_SIZE=256 # micro batch size per GPU, for best efficiency should be set to occupy ~85% of GPU memory. 
Suggested value for A100 80GB is 256
+ACCUMULATE_GRAD_BATCHES=1 # gradient accumulation
+TENSOR_MODEL_PARALLEL_SIZE=1 # tensor model parallel size
+VAL_CHECK_INTERVAL=500 # how often validation step is performed, including downstream task validation
+MAX_STEPS=1000000 # duration of training as the number of training steps
+# =========================
+
+
+# Logging
+# =========================
+PROJECT_NAME="esm1nv_pretraining" # project name, will be used for logging
+EXP_TAG="-small" # any additional experiment info, can be empty
+EXP_NAME="esm1nv_batch${MICRO_BATCH_SIZE}_gradacc${ACCUMULATE_GRAD_BATCHES}_nodes${SLURM_JOB_NUM_NODES}${EXP_TAG}"
+CREATE_WANDB_LOGGER=False # set to False if you don't want to log results with WandB
+WANDB_LOGGER_OFFLINE=False # set to True if there are issues uploading to WandB during training
+# =========================
+
+# Mounts
+# =========================
+DATA_PATH=/fsx/processed # Directory with data for model training and downstream task validation
+TRAIN_FILES='x_OP_000..049_CL_' # Range for the train dataset
+TEST_FILES='x_OP_000..049_CL_' # Range for the test dataset
+VAL_FILES='x_OP_000..049_CL_' # Range for the val dataset
+RESULTS_PATH=/fsx/esm1nv-train/${PROJECT_NAME}/${EXP_NAME}/results # directory to store logs, checkpoints and results
+
+mkdir -p ${RESULTS_PATH}
+
+
+# Necessary Exports
+# =========================
+export HYDRA_FULL_ERROR=1
+# =========================
+
+srun -l "${ARGS[@]}" python3 /workspace/bionemo/examples/protein/esm1nv/pretrain.py \
+ --config-path=/workspace/bionemo/examples/protein/esm1nv/conf \
+ --config-name=pretrain_small \
+ exp_manager.exp_dir=${RESULTS_PATH} \
+ exp_manager.create_wandb_logger=${CREATE_WANDB_LOGGER} \
+ exp_manager.wandb_logger_kwargs.name=${EXP_NAME} \
+ exp_manager.wandb_logger_kwargs.project=${PROJECT_NAME} \
+ ++exp_manager.wandb_logger_kwargs.offline=${WANDB_LOGGER_OFFLINE} \
+ trainer.num_nodes=${SLURM_JOB_NUM_NODES} \
+ trainer.devices=${SLURM_GPUS_PER_NODE} \
+ trainer.max_steps=${MAX_STEPS} \
+ trainer.accumulate_grad_batches=${ACCUMULATE_GRAD_BATCHES} \
+ trainer.val_check_interval=${VAL_CHECK_INTERVAL} \
+ model.micro_batch_size=${MICRO_BATCH_SIZE} \
+ model.tensor_model_parallel_size=${TENSOR_MODEL_PARALLEL_SIZE} \
+ model.data.dataset_path=${DATA_PATH} \
+ model.data.dataset.train=${TRAIN_FILES} \
+ model.data.dataset.val=${VAL_FILES} \
+ model.data.dataset.test=${TEST_FILES} \
+ model.data.index_mapping_dir=${DATA_PATH} \
+ ++model.dwnstr_task_validation.dataset.dataset_path=/workspace/bionemo/examples/tests/test_data/protein/downstream
diff --git a/3.test_cases/14.bionemo/README.md b/3.test_cases/14.bionemo/README.md
new file mode 100644
index 00000000..bd669894
--- /dev/null
+++ b/3.test_cases/14.bionemo/README.md
@@ -0,0 +1,169 @@
+# Train Evolutionary Scale Models (ESM) with BioNemo
+
+NVIDIA BioNeMo is a domain-specific machine learning framework for training and using foundation models for biology. This includes models for analyzing proteins, small molecules, and other biological molecules. NVIDIA first announced it in [September 2022](https://nvidianews.nvidia.com/news/nvidia-launches-large-language-model-cloud-services-to-advance-ai-and-digital-biology) and released a more comprehensive version on DGX Cloud at [GTC 2023](https://nvidianews.nvidia.com/news/nvidia-unveils-large-language-models-and-generative-ai-services-to-advance-life-sciences-r-d). The GTC 2023 release included two main capabilities:
+1.
A NeMo-based training framework to enable ML teams to create training and inference jobs via Python scripts. submitted via DGX-hosted notebooks +2. A web application that enabled scientists to create inference jobs and visualize output data. + +At GTC 2023, BioNeMo supported 9 models: +MegaMolBART +ESM-1nv +OpenFold +AlphaFold2 +DiffDock +ESMFold +ESM-2nv +MoFlow +ProtGPT-2 +ProtT5nv + +Since then, NVIDIA has also announced support for three additional models +EquiDock +MolMIM +DiffDock + +This project provides a guide to run [Nvidia's BioNemo](https://docs.nvidia.com/bionemo-framework/latest/index.html) on AWS ParallelCluster and pretrain the popular [ESM models](https://github.com/facebookresearch/esm) specifically the [ESM1nv](https://docs.nvidia.com/bionemo-framework/latest/notebooks/model_training_esm1nv.html) model. + + +## 0. Prerequisites + +0. You have access to the bionemo container.You can get access to the container from NGC. You may also follow the instructions provided [here](https://docs.nvidia.com/bionemo-framework/latest/quickstart-fw.html) +1. Have a slurm based parallelcluster created with a FSx for Lustre filesystem mounted. + +## 1. Install NGC CLI and Login + +Follow the steps below to install the NGC CLI and login to NGC Container Registry. This is needed before you can pull the BioNemo container. + +0. Generate API Key: https://ngc.nvidia.com/setup/api-key +1. Install NGC CLI: https://ngc.nvidia.com/setup/installers/cli +2. Login +``` +docker login nvcr.io +Username: $oauthtoken +Password: API_KEY +``` +Please make note that the Username is exactly `"$oauthtoken"`. + +## 2. Install Nvidia Container CLI + +### 2.1 If you have created your cluster with [DLAMI](https://aws.amazon.com/machine-learning/amis/) or your custom AMI, please make sure `libnvidia-container cli` is installed. You can follow the instructions below to install it. + +### 2.2 To install libnvidia-container cli: +https://github.com/NVIDIA/libnvidia-container +https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html + +``` +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ + && \ + sudo apt-get update \ + && sudo apt-get install libnvidia-container1 \ + && sudo apt-get install libnvidia-container-tools +``` +### 2.3 You can set the Nemo Multimodal version and others as environment variables: + +``` +export PYTHON_VERSION=3.10 +# We are using Python version 3.10 in this work. For a different Python version select the right Miniconda file from https://repo.anaconda.com/miniconda/ +export MINICONDA_INSTALLER=Miniconda3-py310_23.5.2-0-Linux-x86_64 +export TARGET_PATH=/apps/bionemo-src # Must be a shared filesystem. This is where Nemo launcher scripts will reside. +export DOCKER_IMAGE_NAME=bionemo +export TAG=latest +export ENROOT_IMAGE=/apps/${DOCKER_IMAGE_NAME} +export DATASET_PATH=/fsx/ +``` + +## 2.4. Pull this github repo + +```bash +cd /apps/ +git clone https://github.com/aws-samples/awsome-distributed-training.git +cd awsome-distributed-training/3.test_cases/15.bionemo +``` + +## 3. 
Pull Image
+SSH into the head node of your cluster and run
+
+```
+cd /apps/
+docker pull nvcr.io/nvidia/clara/bionemo-framework:latest
+```
+
+## 4. Run container on Head Node [Optional]
+Once the above image is pulled, you can run the container on the head node like below. Here we are running the container just to be able to copy launcher scripts on the host machine. If you need to run the container on the compute nodes, you would need to add `--gpus all` flag to the run command. It is recommended to have the docker run flags like below, as recommended by Nvidia PyTorch containers, otherwise you may potentially run into an error like [this](https://github.com/NVIDIA/Megatron-LM/issues/516)
+
+```
+ docker run -it nvcr.io/nvidia/clara/bionemo-framework:latest bash
+```
+
+## 5. Create Conda env
+We need a conda environment that has the necessary dependencies for submitting multiple arrays of slurm jobs via [HYDRA](https://github.com/facebookresearch/hydra) which NeMo uses to configuring both NeMo models and the PyTorch Lightning Trainer.
+```
+wget -O miniconda.sh "https://repo.anaconda.com/miniconda/${MINICONDA_INSTALLER}.sh" \
+ && bash miniconda.sh -b -p /apps/.conda \
+ && /apps/.conda/bin/conda init bash
+
+source /home/ubuntu/.bashrc
+conda create --name bionemo python=${PYTHON_VERSION}
+
+source activate bionemo
+
+pip3 install -r requirements.txt
+
+```
+All package versions in the above `requirements.txt` file is recommended from Nvidia. An older version of the package `opencv-python-headless==4.8.0.74` has to be installed to avoid this [error](https://github.com/rom1504/img2dataset/issues/355) with [img2dataset](https://github.com/rom1504/img2dataset) package.
+
+
+
+## 6. Build customized docker image
+To achieve target performance of BioNeMo with EFA on P5 and P4de instances, we provide a customized
+`3.test_cases/14.bionemo/0.Dockerfile` and we can build an image like below:
+
+```
+docker build -t ${DOCKER_IMAGE_NAME}:${TAG} -f 0.Dockerfile .
+```
+
+## 7. Convert image
+Convert the Docker container image to an [Enroot](https://github.com/NVIDIA/enroot) squash file that will be stored in `/apps`. This step takes a few minutes.
+```
+enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${DOCKER_IMAGE_NAME}
+
+```
+
+## 8. Download and preprocess data
+We will use the popular [UniRef50](https://www.uniprot.org/help/uniref) dataset for pretraining. We will use BioNemo's in-built functionality to download and pre-process data. To this end, we provide `prepare_uniref50.py` file to do so like below:
+
+```python
+from bionemo.data import UniRef50Preprocess
+data = UniRef50Preprocess(root_directory='/fsx')
+data.prepare_dataset(source='uniprot')
+```
+
+You can edit the above to download and process [UniRef90]((https://www.uniprot.org/help/uniref)). To run the above python code on your slurm cluster in the BioNemo cluster execute the following:
+
+```bash
+sbatch 1.uniref50.slurm
+```
+
+This will download raw data in `/fsx/raw/` and save pre-processed `train, validation and test` csv files in `/fsx/processed/`
+
+## 9. Pretrain ESM models
+Now we are ready to submit distributed training jobs to pretrain `ESM1nv` models. We provide the `2.esm1nv_pretrain.slurm` script to run training on 4 `p4de.24xlarge` nodes with `8xA100 80 GB` GPUs. Make sure data paths and model configuration are correct if you are running on custom data.
To kick off distributed training execute: + +```bash +sbatch 2.esm1nv_pretrain.slurm + +``` + +Before kicking off training, first train, validation and test datasets are indexed and dataloaders are created and then you should see an example output like below: + +```bash +Epoch 0: 0%| | 5393/1100000 [56:21<190:37:22, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4942.0, consumed_samples=4. +Epoch 0: 0%| | 5394/1100000 [56:21<190:37:00, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4942.0, consumed_samples=4. +Epoch 0: 0%| | 5394/1100000 [56:21<190:37:00, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4943.0, consumed_samples=4. +Epoch 0: 0%| | 5395/1100000 [56:22<190:36:32, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4943.0, consumed_samples=4.Epoch 0: 0%| | 5395/1100000 [56:22<190:36:32, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.650, global_step=4944.0, consumed_samples=4. +``` + + + diff --git a/3.test_cases/14.bionemo/prepare_uniref50.py b/3.test_cases/14.bionemo/prepare_uniref50.py new file mode 100644 index 00000000..d8a6c9d5 --- /dev/null +++ b/3.test_cases/14.bionemo/prepare_uniref50.py @@ -0,0 +1,3 @@ +from bionemo.data import UniRef50Preprocess +data = UniRef50Preprocess(root_directory='/fsx') +data.prepare_dataset(source='uniprot') \ No newline at end of file diff --git a/3.test_cases/14.bionemo/requirements.txt b/3.test_cases/14.bionemo/requirements.txt new file mode 100644 index 00000000..fdbba4ea --- /dev/null +++ b/3.test_cases/14.bionemo/requirements.txt @@ -0,0 +1,10 @@ +dask +huggingface_hub>=0.13.0 +hydra-core>=1.2.0,<1.3 +img2dataset +omegaconf>=2.2,<2.3 +pynvml==11.4.1 +requests==2.31.0 +tqdm==4.62.3 +zstandard==0.15.2 +opencv-python-headless==4.8.0.74 \ No newline at end of file From 9cdbf43e8718d45f7a44f06f9a6686f73d09ef1c Mon Sep 17 00:00:00 2001 From: Ankur Srivastava <101727556+awsankur@users.noreply.github.com> Date: Fri, 16 Feb 2024 16:41:52 -0800 Subject: [PATCH 2/6] Update README.md --- 3.test_cases/14.bionemo/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/3.test_cases/14.bionemo/README.md b/3.test_cases/14.bionemo/README.md index bd669894..2b53a915 100644 --- a/3.test_cases/14.bionemo/README.md +++ b/3.test_cases/14.bionemo/README.md @@ -162,7 +162,8 @@ Before kicking off training, first train, validation and test datasets are index Epoch 0: 0%| | 5393/1100000 [56:21<190:37:22, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4942.0, consumed_samples=4. Epoch 0: 0%| | 5394/1100000 [56:21<190:37:00, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4942.0, consumed_samples=4. Epoch 0: 0%| | 5394/1100000 [56:21<190:37:00, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4943.0, consumed_samples=4. -Epoch 0: 0%| | 5395/1100000 [56:22<190:36:32, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4943.0, consumed_samples=4.Epoch 0: 0%| | 5395/1100000 [56:22<190:36:32, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.650, global_step=4944.0, consumed_samples=4. +Epoch 0: 0%| | 5395/1100000 [56:22<190:36:32, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4943.0, consumed_samples=4. +Epoch 0: 0%| | 5395/1100000 [56:22<190:36:32, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.650, global_step=4944.0, consumed_samples=4. 
``` From 8d31fe91437429826e5fe0c69ccca1ec5ed7e52e Mon Sep 17 00:00:00 2001 From: Ankur Srivastava <101727556+awsankur@users.noreply.github.com> Date: Fri, 16 Feb 2024 21:23:55 -0800 Subject: [PATCH 3/6] Update README.md --- 3.test_cases/14.bionemo/README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/3.test_cases/14.bionemo/README.md b/3.test_cases/14.bionemo/README.md index 2b53a915..be2844b6 100644 --- a/3.test_cases/14.bionemo/README.md +++ b/3.test_cases/14.bionemo/README.md @@ -79,7 +79,7 @@ export DATASET_PATH=/fsx/ ```bash cd /apps/ git clone https://github.com/aws-samples/awsome-distributed-training.git -cd awsome-distributed-training/3.test_cases/15.bionemo +cd awsome-distributed-training/3.test_cases/14.bionemo ``` ## 3. Pull Image @@ -159,11 +159,10 @@ sbatch 2.esm1nv_pretrain.slurm Before kicking off training, first train, validation and test datasets are indexed and dataloaders are created and then you should see an example output like below: ```bash -Epoch 0: 0%| | 5393/1100000 [56:21<190:37:22, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4942.0, consumed_samples=4. -Epoch 0: 0%| | 5394/1100000 [56:21<190:37:00, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4942.0, consumed_samples=4. -Epoch 0: 0%| | 5394/1100000 [56:21<190:37:00, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4943.0, consumed_samples=4. -Epoch 0: 0%| | 5395/1100000 [56:22<190:36:32, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.640, global_step=4943.0, consumed_samples=4. -Epoch 0: 0%| | 5395/1100000 [56:22<190:36:32, 1.60it/s, loss=2.64, v_num=, reduced_train_loss=2.650, global_step=4944.0, consumed_samples=4. +Epoch 0: 3%|▎ | 34103/1100000 [5:28:58<171:22:21, 1.73it/s, loss=2.52, v_num=, reduced_train_loss=2.510, global_step=3.1e+4, consumed_samples=2.54e+8, val_loss=2.510] +Epoch 0: 3%|▎ | 34106/1100000 [5:29:00<171:22:19, 1.73it/s, loss=2.52, v_num=, reduced_train_loss=2.520, global_step=3.1e+4, consumed_samples=2.54e+8, val_loss=2.510] +Epoch 0: 3%|▎ | 34109/1100000 [5:29:02<171:22:09, 1.73it/s, loss=2.52, v_num=, reduced_train_loss=2.520, global_step=3.1e+4, consumed_samples=2.54e+8, val_loss=2.510] +Epoch 0: 3%|▎ | 34112/1100000 [5:29:03<171:22:00, 1.73it/s, loss=2.52, v_num=, reduced_train_loss=2.520, global_step=3.1e+4, consumed_samples=2.54e+8, val_loss=2.510] ``` From 795a7c3abaf4e4242156d473f82cca2213b39d22 Mon Sep 17 00:00:00 2001 From: Ankur Srivastava <101727556+awsankur@users.noreply.github.com> Date: Fri, 16 Feb 2024 21:26:16 -0800 Subject: [PATCH 4/6] Update 2.esm1nv_pretrain.slurm --- 3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm b/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm index ec56996f..e5060581 100644 --- a/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm +++ b/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm @@ -77,4 +77,4 @@ srun -l "${ARGS[@]}" python3 /workspace/bionemo/examples/protein/esm1nv/pretrai model.data.dataset.val=${VAL_FILES} \ model.data.dataset.test=${TEST_FILES} \ model.data.index_mapping_dir=${DATA_PATH} \ - ++model.dwnstr_task_validation.dataset.dataset_path=/workspace/bionemo/examples/tests/test_data/protein/downstream + ++model.dwnstr_task_validation.enabled=False From e25acb914193ab8be26b09bff76073eb04f5f541 Mon Sep 17 00:00:00 2001 From: Ankur Srivastava Date: Thu, 22 Feb 2024 15:51:57 -0800 Subject: [PATCH 5/6] Updated to address all PR 
comments Signed-off-by: Ankur Srivastava --- 3.test_cases/14.bionemo/0.Dockerfile | 21 ++-- 3.test_cases/14.bionemo/1.uniref50.slurm | 4 +- .../14.bionemo/2.esm1nv_pretrain.slurm | 2 +- 3.test_cases/14.bionemo/README.md | 104 +++++++----------- 3.test_cases/14.bionemo/requirements.txt | 7 +- 5 files changed, 56 insertions(+), 82 deletions(-) diff --git a/3.test_cases/14.bionemo/0.Dockerfile b/3.test_cases/14.bionemo/0.Dockerfile index 8d99f1ec..4f10e54f 100644 --- a/3.test_cases/14.bionemo/0.Dockerfile +++ b/3.test_cases/14.bionemo/0.Dockerfile @@ -1,9 +1,9 @@ FROM nvcr.io/nvidia/clara/bionemo-framework:latest -ARG EFA_INSTALLER_VERSION=latest -ARG AWS_OFI_NCCL_VERSION=v1.7.3-aws +ARG EFA_INSTALLER_VERSION=1.30.0 +ARG AWS_OFI_NCCL_VERSION=v1.7.4-aws ARG NCCL_TESTS_VERSION=master -ARG NCCL_VERSION=v2.18.5-1 +ARG NCCL_VERSION=v2.18.6-1 RUN apt-get update -y RUN apt-get remove -y --allow-change-held-packages \ libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev @@ -47,6 +47,7 @@ RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \ && make lib_install install \ && cd /opt/gdrcopy/tests \ && make \ + && make install \ && mv gdrcopy_copylat gdrcopy_copybw gdrcopy_sanity gdrcopy_apiperf /usr/bin/ ################################################# @@ -58,13 +59,6 @@ RUN cd $HOME \ && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ && rm -rf $HOME/aws-efa-installer -################################################### -## Install NCCL -RUN git clone https://github.com/NVIDIA/nccl -b ${NCCL_VERSION} /opt/nccl \ - && cd /opt/nccl \ - && make -j src.build CUDA_HOME=/usr/local/cuda \ - NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90" - ################################################### ## Install AWS-OFI-NCCL plugin RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y @@ -73,11 +67,12 @@ RUN export OPAL_PREFIX="" \ && cd /opt/aws-ofi-nccl \ && git checkout ${AWS_OFI_NCCL_VERSION} \ && ./autogen.sh \ - && ./configure --prefix=/opt/aws-ofi-nccl/install \ - --with-libfabric=/opt/amazon/efa/ \ + && ./configure --prefix=/opt/aws-ofi-nccl \ + --with-libfabric=/opt/amazon/efa \ --with-cuda=/usr/local/cuda \ --with-nccl=/opt/nccl/build \ - --with-mpi=/opt/amazon/openmpi/ \ + --with-mpi=/opt/amazon/openmpi \ + --enable-platform-aws && make && make install ################################################### diff --git a/3.test_cases/14.bionemo/1.uniref50.slurm b/3.test_cases/14.bionemo/1.uniref50.slurm index 92cbda39..06ce13b6 100644 --- a/3.test_cases/14.bionemo/1.uniref50.slurm +++ b/3.test_cases/14.bionemo/1.uniref50.slurm @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --nodes=1 # number of nodes #SBATCH --ntasks-per-node=1 # n tasks per machine (one task per gpu) -#SBATCH --gpus-per-node=8 +#SBATCH --cpus-per-task=16 #SBATCH --exclusive # exclusive node access #SBATCH --output slurm-uniref-%j.out @@ -11,7 +11,7 @@ ########################### # default variables for Enroot -: "${IMAGE:=$(pwd)/bionemo.sqsh}" +: "${IMAGE:=$(pwd)/${ENROOT_IMAGE}}" : "${DATA_PATH:=/fsx}" : "${FSX_MOUNT:=$DATA_PATH:$DATA_PATH}" diff --git a/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm b/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm index e5060581..3e30bc64 100644 --- 
a/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm +++ b/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm @@ -11,7 +11,7 @@ ########################### # default variables for Enroot -: "${IMAGE:=$(pwd)/bionemo.sqsh}" +: "${IMAGE:=$(pwd)/${ENROOT_IMAGE}}" : "${DATA_PATH:=/fsx}" : "${FSX_MOUNT:=$DATA_PATH:$DATA_PATH}" diff --git a/3.test_cases/14.bionemo/README.md b/3.test_cases/14.bionemo/README.md index be2844b6..17d909c3 100644 --- a/3.test_cases/14.bionemo/README.md +++ b/3.test_cases/14.bionemo/README.md @@ -4,52 +4,31 @@ NVIDIA BioNeMo is a domain-specific machine learning framework for training and 1. A NeMo-based training framework to enable ML teams to create training and inference jobs via Python scripts. submitted via DGX-hosted notebooks 2. A web application that enabled scientists to create inference jobs and visualize output data. -At GTC 2023, BioNeMo supported 9 models: -MegaMolBART -ESM-1nv -OpenFold -AlphaFold2 -DiffDock -ESMFold -ESM-2nv -MoFlow -ProtGPT-2 -ProtT5nv - -Since then, NVIDIA has also announced support for three additional models -EquiDock -MolMIM -DiffDock +|Num| BioNeMo Model Support | +|:-:|:--------------------------------------------------------------------------------------------:| +| 1 | [ESM-1nv](https://docs.nvidia.com/bionemo-framework/latest/models/esm1-nv.html) | +| 2 | [ESM-2nv](https://docs.nvidia.com/bionemo-framework/latest/models/esm2-nv.html) | +| 3 | [MegaMolBART](https://docs.nvidia.com/bionemo-framework/latest/models/megamolbart.html) | +| 4 | [DiffDock](https://docs.nvidia.com/bionemo-framework/latest/models/diffdock.html) | +| 5 | [EquiDock](https://docs.nvidia.com/bionemo-framework/latest/models/equidock.html) | +| 6 | [ProtT5nv](https://docs.nvidia.com/bionemo-framework/latest/models/prott5nv.html) | + This project provides a guide to run [Nvidia's BioNemo](https://docs.nvidia.com/bionemo-framework/latest/index.html) on AWS ParallelCluster and pretrain the popular [ESM models](https://github.com/facebookresearch/esm) specifically the [ESM1nv](https://docs.nvidia.com/bionemo-framework/latest/notebooks/model_training_esm1nv.html) model. ## 0. Prerequisites -0. You have access to the bionemo container.You can get access to the container from NGC. You may also follow the instructions provided [here](https://docs.nvidia.com/bionemo-framework/latest/quickstart-fw.html) -1. Have a slurm based parallelcluster created with a FSx for Lustre filesystem mounted. - -## 1. Install NGC CLI and Login - -Follow the steps below to install the NGC CLI and login to NGC Container Registry. This is needed before you can pull the BioNemo container. +0. You have access to the bionemo container. To get the access to BioNeMo, visit the [information website](https://www.nvidia.com/en-us/clara/bionemo/). -0. Generate API Key: https://ngc.nvidia.com/setup/api-key -1. Install NGC CLI: https://ngc.nvidia.com/setup/installers/cli -2. Login -``` -docker login nvcr.io -Username: $oauthtoken -Password: API_KEY -``` -Please make note that the Username is exactly `"$oauthtoken"`. +1. Have a slurm based AWS ParallelCluster created with a FSx for Lustre filesystem mounted. Below we are presenting instructions for a cluster with compute nodes instantiated with an Ubuntu based AMI. -## 2. Install Nvidia Container CLI +## 1. Install Nvidia Container CLI -### 2.1 If you have created your cluster with [DLAMI](https://aws.amazon.com/machine-learning/amis/) or your custom AMI, please make sure `libnvidia-container cli` is installed. 
You can follow the instructions below to install it. +### 1.1 If you have created your cluster with the AWS ParallelCluster Base AMI or [DLAMI](https://aws.amazon.com/machine-learning/amis/) or your custom AMI, please make sure `libnvidia-container cli` is installed. You can follow the instructions below to install it. -### 2.2 To install libnvidia-container cli: -https://github.com/NVIDIA/libnvidia-container -https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html +### 1.2 To install libnvidia-container cli: +We need [libnvidia-container cli](https://github.com/NVIDIA/libnvidia-container) to train models in an Nvidia container. We follow the instructions [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). This installation needs to be done in each compute node. ``` curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ @@ -61,7 +40,9 @@ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dear && sudo apt-get install libnvidia-container1 \ && sudo apt-get install libnvidia-container-tools ``` -### 2.3 You can set the Nemo Multimodal version and others as environment variables: +### 1.3 You can set the Nemo Multimodal version and others as environment variables: + +SSH into the head node of your cluster and run: ``` export PYTHON_VERSION=3.10 @@ -74,37 +55,31 @@ export ENROOT_IMAGE=/apps/${DOCKER_IMAGE_NAME} export DATASET_PATH=/fsx/ ``` -## 2.4. Pull this github repo +## 1.4. Pull this github repo ```bash cd /apps/ git clone https://github.com/aws-samples/awsome-distributed-training.git -cd awsome-distributed-training/3.test_cases/14.bionemo +cp -r /apps/awsome-distributed-training/3.test_cases/14.bionemo/* ./apps/ ``` -## 3. Pull Image -SSH into the head node of your cluster and run +## 2. Pull Image -``` +```bash cd /apps/ docker pull nvcr.io/nvidia/clara/bionemo-framework:latest ``` -## 4. Run container on Head Node [Optional] -Once the above image is pulled, you can run the container on the head node like below. Here we are running the container just to be able to copy launcher scripts on the host machine. If you need to run the container on the compute nodes, you would need to add `--gpus all` flag to the run command. It is recommended to have the docker run flags like below, as recommended by Nvidia PyTorch containers, otherwise you may potentially run into an error like [this](https://github.com/NVIDIA/Megatron-LM/issues/516) - -``` - docker run -it nvcr.io/nvidia/clara/bionemo-framework:latest bash -``` - -## 5. Create Conda env +## 3. Create Conda env We need a conda environment that has the necessary dependencies for submitting multiple arrays of slurm jobs via [HYDRA](https://github.com/facebookresearch/hydra) which NeMo uses to configuring both NeMo models and the PyTorch Lightning Trainer. ``` +# Miniconda is already installed if you are using the DLAMI but needs installation with Base AMI + wget -O miniconda.sh "https://repo.anaconda.com/miniconda/${MINICONDA_INSTALLER}.sh" \ && bash miniconda.sh -b -p /apps/.conda \ && /apps/.conda/bin/conda init bash -source /home/ubuntu/.bashrc +source ~/.bashrc conda create --name bionemo python=${PYTHON_VERSION} source activate bionemo @@ -116,7 +91,7 @@ All package versions in the above `requirements.txt` file is recommended from Nv -## 6. Build customized docker image +## 4. 
Build customized docker image
 To achieve target performance of BioNeMo with EFA on P5 and P4de instances, we provide a customized
`3.test_cases/14.bionemo/0.Dockerfile` and we can build an image like below:
@@ -124,31 +99,29 @@ To achieve target performance of BioNeMo with EFA on P5 and P4de instanc
 docker build -t ${DOCKER_IMAGE_NAME}:${TAG} -f 0.Dockerfile .
 ```
-## 7. Convert image
+## 5. Convert image
 Convert the Docker container image to an [Enroot](https://github.com/NVIDIA/enroot) squash file that will be stored in `/apps`. This step takes a few minutes.
 ```
 enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${DOCKER_IMAGE_NAME}
 ```
-## 8. Download and preprocess data
-We will use the popular [UniRef50](https://www.uniprot.org/help/uniref) dataset for pretraining. We will use BioNemo's in-built functionality to download and pre-process data. To this end, we provide `prepare_uniref50.py` file to do so like below:
+## 6. Download and preprocess data
+We will use the popular [UniRef50](https://www.uniprot.org/help/uniref) dataset for pretraining. We will use BioNemo's in-built functionality to download and pre-process data. To this end, we provide the `prepare_uniref50.py` file to do so. You can edit it to download and process [UniRef90](https://www.uniprot.org/help/uniref) instead. To run this Python code in the BioNemo container on your Slurm cluster, execute the following:
-```python
-from bionemo.data import UniRef50Preprocess
-data = UniRef50Preprocess(root_directory='/fsx')
-data.prepare_dataset(source='uniprot')
+```bash
+sbatch 1.uniref50.slurm
 ```
-You can edit the above to download and process [UniRef90]((https://www.uniprot.org/help/uniref)). To run the above python code on your slurm cluster in the BioNemo cluster execute the following:
+This will download raw data in `/fsx/raw/` and save pre-processed `train, validation and test` csv files in `/fsx/processed/`. The log files for submitted jobs are written to the local directory. To check the status of the dataset download job, you can tail the log file:
 ```bash
-sbatch 1.uniref50.slurm
+tail -f slurm-uniref-.out
 ```
-This will download raw data in `/fsx/raw/` and save pre-processed `train, validation and test` csv files in `/fsx/processed/`
-## 9. Pretrain ESM models
+
+
+## 7. Pretrain ESM models
 Now we are ready to submit distributed training jobs to pretrain `ESM1nv` models. We provide the `2.esm1nv_pretrain.slurm` script to run training on 4 `p4de.24xlarge` nodes with `8xA100 80 GB` GPUs. Make sure data paths and model configuration are correct if you are running on custom data. To kick off distributed training execute:
 ```bash
@@ -165,5 +138,10 @@ Epoch 0: 3%|▎ | 34112/1100000 [5:29:03<171:22:00, 1.73it/s, loss=2.52, v_num=, reduced_train_loss=2.520, global_step=3.1e+4, consumed_samples=2.54e+8, val_loss=2.510]
 ```
+## 8. Run container on Head Node [Troubleshooting]
+Once the above image is pulled, you can run the container on the head node like below. This step could be used for troubleshooting purposes. Here we are running the container just to be able to copy launcher scripts on the host machine. If you need to run the container on the compute nodes, you would need to add the `--gpus all` flag to the run command.
It is recommended to use the docker run flags suggested for Nvidia PyTorch containers, otherwise you may run into an error like [this](https://github.com/NVIDIA/Megatron-LM/issues/516).
+
+```
+ docker run -it nvcr.io/nvidia/clara/bionemo-framework:latest bash
+```
diff --git a/3.test_cases/14.bionemo/requirements.txt b/3.test_cases/14.bionemo/requirements.txt
index fdbba4ea..c4b8d5fc 100644
--- a/3.test_cases/14.bionemo/requirements.txt
+++ b/3.test_cases/14.bionemo/requirements.txt
@@ -1,5 +1,5 @@
-dask
-huggingface_hub>=0.13.0
+dask==2024.2.0
+huggingface_hub>=0.13.0,<0.14.0
 hydra-core>=1.2.0,<1.3
 img2dataset
 omegaconf>=2.2,<2.3
@@ -7,4 +7,5 @@ pynvml==11.4.1
 requests==2.31.0
 tqdm==4.62.3
 zstandard==0.15.2
-opencv-python-headless==4.8.0.74 \ No newline at end of file
+opencv-python-headless==4.8.0.74
+numba==0.59.0 \ No newline at end of file
From 99e4438e6a7e39ae2342d2a59ca41e7dfc131cf3 Mon Sep 17 00:00:00 2001
From: Ankur Srivastava <101727556+awsankur@users.noreply.github.com>
Date: Fri, 23 Feb 2024 16:31:51 -0800
Subject: [PATCH 6/6] Update 2.esm1nv_pretrain.slurm

Added FI_EFA_USE_HUGE_PAGE=0 in the train sbatch script
---
 3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm b/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm
index 3e30bc64..8470b0ae 100644
--- a/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm
+++ b/3.test_cases/14.bionemo/2.esm1nv_pretrain.slurm
@@ -5,6 +5,8 @@
 #SBATCH --exclusive # exclusive node access
 #SBATCH --output slurm-esm1nv-train-%j.out
 
+export FI_EFA_USE_HUGE_PAGE=0
+
 ###########################
 ###### User Variables #####