Merge pull request #174 from aws-samples/gpt-neox

Pythia GPT-NeoX test case

Showing 18 changed files with 1,627 additions and 0 deletions.
@@ -0,0 +1,175 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

####################################################################################################
# This is a sample Dockerfile, with optional stanzas. Please read through this Dockerfile,
# understand what it does, then create your own Dockerfile.
#
# Sample build instructions:
#
# docker build --progress=plain -t nvidia-pt-od:latest -f 0.nvcr-pytorch-aws.dockerfile .
# rm /fsx/nvidia-pt-od__latest.sqsh ; enroot import -o /fsx/nvidia-pt-od__latest.sqsh dockerd://nvidia-pt-od:latest
#
# Compute nodes (aka build nodes) are transient, so we need to keep the Docker image on a shared
# filesystem, which the head node can then load into its local registry.
#
# # Build node: save image to file
# docker save nvidia-pt-od:latest > /fsx/nvidia-pt-od__latest.tar
#
# # Load image into the local Docker registry -> on the head node, or a new compute/build node
# docker load < /fsx/nvidia-pt-od__latest.tar
####################################################################################################
FROM nvcr.io/nvidia/pytorch:23.12-py3
ENV DEBIAN_FRONTEND=noninteractive

# The three must-be-built packages.
# efa-installer>=1.29.0 is required by nccl>=2.19.0 to avoid a libfabric NCCL error.
ENV EFA_INSTALLER_VERSION=1.30.0
ENV AWS_OFI_NCCL_VERSION=1.8.1-aws
ENV NCCL_VERSION=2.19.3-1
ENV NCCL_TESTS_VERSION=master

RUN apt-get update -y
RUN apt-get remove -y --allow-change-held-packages \
    libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1

# We noticed that since 23.09, we can't just delete the whole /opt/hpcx/, otherwise `import torch`
# complains about missing libuc?.so.
RUN rm -rf /opt/hpcx/ompi \
    && rm -rf /usr/local/mpi \
    && rm -rf /opt/hpcx/nccl_rdma_sharp_plugin \
    && ldconfig
ENV OPAL_PREFIX=
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    git \
    gcc \
    vim \
    kmod \
    openssh-client \
    openssh-server \
    build-essential \
    curl \
    autoconf \
    libtool \
    gdb \
    automake \
    cmake \
    apt-utils \
    libhwloc-dev \
    aptitude && \
    DEBIAN_FRONTEND=noninteractive apt autoremove -y

# EFA
RUN apt-get update && \
    cd /tmp && \
    curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    cd aws-efa-installer && \
    # ONLY add the `--skip-kmod`, `--no-verify`, and `--skip-limit-conf` flags to a container image.
    # Those three flags must NOT be used on the host.
    #
    # Explanations:
    # - To build EFA in the Dockerfile, we added --skip-kmod and --no-verify. Without these flags,
    #   the Dockerfile will fail to build. If installing EFA on the host and not in a container,
    #   please remove these flags.
    # - --skip-limit-conf can be retained in the Dockerfile, but it's redundant as the host already
    #   has these limits set by efa_installer.
    ./efa_installer.sh -y -g -d --skip-kmod --no-verify --skip-limit-conf && \
    ldconfig && \
    rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH

####################################################################################################
# [CUSTOM_NCCL_OPTION_1] Uncomment the stanza below to install another NCCL version using the
# official binaries.
#
# The NCCL EFA plugin (aws-ofi-nccl) depends on MPI, hence we must rebuild Open MPI before building
# aws-ofi-nccl.
####################################################################################################
#ENV NCCL_VERSION=2.19.3-1
#RUN cd /opt && \
#    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \
#    dpkg -i cuda-keyring_1.0-1_all.deb && \
#    apt update && \
#    apt install -y libnccl2=${NCCL_VERSION} libnccl-dev=${NCCL_VERSION} && \
#    echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf

####################################################################################################
# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in one. The
# benefits of installing to the same location as the built-in version are:
#
# 1. There's only ever a single libnccl version offered by this image, preventing applications
#    from mistakenly choosing a wrong version.
# 2. No extra settings for LD_LIBRARY_PATH or LD_PRELOAD are needed.
#
# The NCCL EFA plugin (aws-ofi-nccl) depends on MPI, hence we must rebuild Open MPI before building
# aws-ofi-nccl.
####################################################################################################
#RUN apt-get remove -y libnccl2 libnccl-dev \
#    && cd /tmp \
#    && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
#    && cd nccl \
#    # Build for p4 & p5.
#    && make -j src.build BUILDDIR=/usr \
#       NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80" \
#    && rm -rf /tmp/nccl \
#    && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf
# Note: the custom NCCL installation is disabled because the PyTorch container already ships
# NCCL 2.19.3 (cf. https://github.com/aws-samples/awsome-distributed-training/pull/174#discussion_r1519045216);
# we keep the instructions above for future updates.

# NCCL EFA plugin
RUN mkdir -p /tmp && \
    cd /tmp && \
    curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
    cd /tmp/aws-ofi-nccl && \
    ./autogen.sh && \
    ./configure --prefix=/opt/amazon/efa \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
        --with-mpi=/opt/amazon/openmpi && \
    make -j$(nproc) install && \
    rm -rf /tmp/aws-ofi-nccl

# Do this to minimize the ld path env vars that users need to define when running this image.
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
    echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
    ldconfig

ENV OMPI_MCA_pml=^cm,ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \
    OPAL_PREFIX=/opt/amazon/openmpi \
    # https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
    # https://github.com/pytorch/pytorch/issues/68893
    NCCL_SOCKET_IFNAME=^docker,lo

ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"

# NCCL-tests: always good to include these as a diagnostic tool.
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
    && cd /opt/nccl-tests \
    && git checkout ${NCCL_TESTS_VERSION} \
    && make MPI=1 \
        MPI_HOME=/opt/amazon/openmpi \
        CUDA_HOME=/usr/local/cuda \
        NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80"

# Install GPT-NeoX and its dependencies. Note: the optional pip installs must be full-line
# comments; a trailing `# comment \` inside the shell command would swallow the next `&&`.
RUN git clone https://github.com/EleutherAI/gpt-neox.git \
    && cd gpt-neox \
    && pip install -r requirements/requirements.txt \
    # Optional: logging with WandB
    && pip install -r requirements/requirements-wandb.txt \
    # Optional: logging with TensorBoard
    && pip install -r requirements/requirements-tensorboard.txt \
    # Optional: fused kernels
    && python ./megatron/fused_kernels/setup.py install
# Rebuild a newer flash-attn
RUN MAX_JOBS=192 FLASH_ATTENTION_FORCE_BUILD=TRUE pip install flash-attn==2.5.5 --upgrade
WORKDIR /workspace/gpt-neox
COPY src/c4_prepare_data.py c4_prepare_data.py
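Once the image is built and imported (see the build script below), the bundled NCCL tests give a quick way to validate EFA connectivity before launching training. A minimal sketch, assuming the sqsh file sits at /fsx/apps/gpt-neox.sqsh as in the scripts below and that the cluster runs Slurm with the Pyxis/Enroot plugin; adjust the image path and node counts to your cluster:

    # Sanity-check sketch: all_reduce across 2 nodes x 8 GPUs
    srun -N 2 --ntasks-per-node=8 \
        --container-image /fsx/apps/gpt-neox.sqsh \
        --container-mounts /fsx:/fsx \
        /opt/nccl-tests/build/all_reduce_perf -b 8 -e 2G -f 2 -g 1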
@@ -0,0 +1,24 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#SBATCH -N 1                              # number of nodes to use
#SBATCH --job-name=build-neox-image       # name of your job
#SBATCH --output=logs/%x_%j.out           # logfile for stdout
#SBATCH --error=logs/%x_%j.err            # logfile for stderr; remove it to merge both outputs

set -euxo pipefail

# Default variables for Enroot; if these variables are already defined, use them.
: "${APPS_PATH:=/fsx/apps}"
: "${IMAGE:=$APPS_PATH/gpt-neox.sqsh}"

ENROOT_IMAGE=gpt-neox
docker build -t ${ENROOT_IMAGE} -f 0.gpt-neox.dockerfile .
# Remove the old sqsh file if it exists
if [ -f ${ENROOT_IMAGE}.sqsh ] ; then
    rm ${ENROOT_IMAGE}.sqsh
fi
enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest
mv ${ENROOT_IMAGE}.sqsh ${IMAGE}
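This job must run on a node where Docker is available. A sketch of a typical submission; the script filename is illustrative, so use whatever this file is named in your checkout:

    mkdir -p logs                                    # the #SBATCH headers above write logs/%x_%j.{out,err}
    APPS_PATH=/fsx/apps sbatch 1.build-image.sbatch  # override defaults via the environment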
@@ -0,0 +1,28 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#SBATCH -N 1                              # number of nodes to use
#SBATCH --job-name=neox-dataprep          # name of your job
#SBATCH --output=logs/%x_%j.out           # logfile for stdout
#SBATCH --error=logs/%x_%j.err            # logfile for stderr; remove it to merge both outputs
#SBATCH --exclusive

# Default variables for Enroot; if these variables are already defined, use them.
: "${APPS_PATH:=/fsx/apps}"
: "${IMAGE:=$APPS_PATH/gpt-neox.sqsh}"
: "${FSX_PATH:=/fsx}"
: "${DATASET:=c4_subset}"
: "${DATA_PATH:=$FSX_PATH/$DATASET}"
: "${MODEL_PATH:=$FSX_PATH/gpt-neox}"
: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}"

echo "Retrieve and preprocess ${DATASET}"

declare -a ARGS=(
    --container-image $IMAGE
    --container-mounts $CONTAINER_MOUNT
)
srun -l "${ARGS[@]}" python c4_prepare_data.py -d ${DATA_PATH} -t HFTokenizer \
    --vocab-file ${MODEL_PATH}/tokenizers/20B_tokenizer.json ${DATASET}
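The job expects the GPT-NeoX 20B tokenizer at ${MODEL_PATH}/tokenizers/20B_tokenizer.json. A sketch of staging it before submitting; the download URL is the one referenced in the GPT-NeoX README at the time of writing, so verify it before relying on it:

    mkdir -p /fsx/gpt-neox/tokenizers logs
    curl -L -o /fsx/gpt-neox/tokenizers/20B_tokenizer.json \
        https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json
    DATASET=c4_subset sbatch 2.dataprep.sbatch       # script name is illustrative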
@@ -0,0 +1,64 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#SBATCH --job-name="neox"
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8                 # number of GPUs per node
#SBATCH --gres=gpu:8
#SBATCH --output=logs/%x_%j.out           # logfile for stdout
#SBATCH --error=logs/%x_%j.err            # logfile for stderr; remove it to merge both outputs
#SBATCH --wait-all-nodes=1
#SBATCH --exclusive
set -uxo pipefail

# Default variables for Enroot; if these variables are already defined, use them.
: "${APPS_PATH:=/fsx/apps}"
: "${IMAGE:=$APPS_PATH/gpt-neox.sqsh}"
: "${FSX_PATH:=/fsx}"
: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}"

## EFA settings
export FI_LOG_LEVEL=1
export FI_PROVIDER=efa                    # change to eth if you want to use ENA for comparisons
export FI_EFA_USE_HUGE_PAGE=0
# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
# https://github.com/pytorch/pytorch/issues/68893
export NCCL_SOCKET_IFNAME=en
export NCCL_ASYNC_ERROR_HANDLING=1
#export NCCL_DEBUG=INFO

export MODEL_CONFIG=${PWD}/configs/pythia/1-4B.json
#export MODEL_CONFIG=${PWD}/configs/pythia/2-8B.json
#export MODEL_CONFIG=${PWD}/configs/pythia/12B.json

# Some potentially useful distributed environment variables
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
NODES=( $(scontrol show hostnames "$SLURM_JOB_NODELIST") )   # bash arrays cannot be exported
export HEAD_NODE=${NODES[0]}
export MASTER_ADDR=$(hostname --ip-address)
export MASTER_PORT=$RANDOM
export NNODES=$SLURM_JOB_NUM_NODES
export NPROC=$SLURM_GPUS_PER_NODE
export WORLD_SIZE=$(( NNODES * NPROC ))

declare -a ARGS=(
    --container-image $IMAGE
    --container-mounts $CONTAINER_MOUNT
)

# Torchrun arguments, kept for reference only: the launch below goes through deepy.py, which
# handles distributed start-up itself, so this array is not passed to srun.
declare -a TORCHRUN_ARGS=(
    --master_addr $MASTER_ADDR
    --master_port $MASTER_PORT
    --nproc_per_node=8                    # change this to match the number of GPUs per node
    --nnodes=$SLURM_JOB_NUM_NODES
    --rdzv_id=$SLURM_JOB_ID
    --rdzv_backend=c10d
    --rdzv_endpoint=$(hostname)
)

srun -l "${ARGS[@]}" python deepy.py train.py ${MODEL_CONFIG}
@@ -0,0 +1,12 @@
ENROOT_IMAGE=gpt-neox

all: build clean import

build:
	docker build -t ${ENROOT_IMAGE} -f 0.gpt-neox.dockerfile .

clean:
	-rm ${ENROOT_IMAGE}.sqsh

import:
	enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest
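The Makefile wraps the same build-and-import flow as the sbatch script above; `make` runs build, clean, and import in sequence, and single targets can be invoked on their own:

    make           # full cycle: docker build, remove any stale sqsh, enroot import
    make import    # re-import only, e.g. after rebuilding the image by hand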