Merge pull request #174 from aws-samples/gpt-neox

Pythia GPT-NeoX test case

Showing 18 changed files with 1,627 additions and 0 deletions.
@@ -0,0 +1,175 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

####################################################################################################
# This is a sample Dockerfile, with optional stanzas. Please read through this Dockerfile,
# understand what it does, then create your own Dockerfile.
#
# Sample build instructions:
#
# docker build --progress=plain -t nvidia-pt-od:latest -f 0.nvcr-pytorch-aws.dockerfile .
# rm /fsx/nvidia-pt-od__latest.sqsh ; enroot import -o /fsx/nvidia-pt-od__latest.sqsh dockerd://nvidia-pt-od:latest
#
# Compute nodes (aka build nodes) are transient, so we need to keep the Docker image on a shared
# filesystem, which the head node can then load into its local registry.
#
# # Build node: save image to file
# docker save nvidia-pt-od:latest > /fsx/nvidia-pt-od__latest.tar
#
# # Load image into the local Docker registry -> on the head node, or a new compute/build node
# docker load < /fsx/nvidia-pt-od__latest.tar
####################################################################################################
FROM nvcr.io/nvidia/pytorch:23.12-py3
ENV DEBIAN_FRONTEND=noninteractive

# The three must-be-built packages.
# efa-installer>=1.29.0 is required by nccl>=2.19.0 to avoid a libfabric NCCL error.
ENV EFA_INSTALLER_VERSION=1.30.0
ENV AWS_OFI_NCCL_VERSION=1.8.1-aws
ENV NCCL_VERSION=2.19.3-1
ENV NCCL_TESTS_VERSION=master

RUN apt-get update -y
RUN apt-get remove -y --allow-change-held-packages \
    libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1

# We noticed that since 23.09, we can't just delete the whole /opt/hpcx/, otherwise `import torch`
# complains about missing libuc?.so.
RUN rm -rf /opt/hpcx/ompi \
    && rm -rf /usr/local/mpi \
    && rm -rf /opt/hpcx/nccl_rdma_sharp_plugin \
    && ldconfig
ENV OPAL_PREFIX=
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    git \
    gcc \
    vim \
    kmod \
    openssh-client \
    openssh-server \
    build-essential \
    curl \
    autoconf \
    libtool \
    gdb \
    automake \
    cmake \
    apt-utils \
    libhwloc-dev \
    aptitude && \
    DEBIAN_FRONTEND=noninteractive apt autoremove -y

# EFA
RUN apt-get update && \
    cd /tmp && \
    curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    cd aws-efa-installer && \
    # ONLY add the `--skip-kmod`, `--no-verify`, and `--skip-limit-conf` flags to a container image.
    # Those three flags must NOT be used on the host.
    #
    # Explanations:
    # - To build EFA in the Dockerfile, we added --skip-kmod and --no-verify. Without these flags,
    #   the Dockerfile will fail to build. If installing EFA on the host and not in a container,
    #   please remove these flags.
    # - --skip-limit-conf can be retained in the Dockerfile, but it's redundant as the host already
    #   has these limits set by efa_installer.
    ./efa_installer.sh -y -g -d --skip-kmod --no-verify --skip-limit-conf && \
    ldconfig && \
    rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH

####################################################################################################
# [CUSTOM_NCCL_OPTION_1] Uncomment the stanza below to install another NCCL version using the
# official binaries.
#
# The NCCL EFA plugin (aws-ofi-nccl) depends on MPI, hence we must rebuild Open MPI before building
# aws-ofi-nccl.
####################################################################################################
#ENV NCCL_VERSION=2.19.3-1
#RUN cd /opt && \
#    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \
#    dpkg -i cuda-keyring_1.0-1_all.deb && \
#    apt update && \
#    apt install -y libnccl2=${NCCL_VERSION} libnccl-dev=${NCCL_VERSION} && \
#    echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf

####################################################################################################
# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in one. The
# benefits of installing to the same location as the built-in version are:
#
# 1. There's only ever a single libnccl version offered by this image, preventing applications
#    from mistakenly choosing a wrong version.
# 2. No extra settings for LD_LIBRARY_PATH or LD_PRELOAD are needed.
#
# The NCCL EFA plugin (aws-ofi-nccl) depends on MPI, hence we must rebuild Open MPI before building
# aws-ofi-nccl.
####################################################################################################
#RUN apt-get remove -y libnccl2 libnccl-dev \
#    && cd /tmp \
#    && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
#    && cd nccl \
#    # Build for p4 & p5.
#    && make -j src.build BUILDDIR=/usr \
#       NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80" \
#    && rm -rf /tmp/nccl \
#    && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf
# Note: the custom NCCL installation is disabled because the PyTorch container already ships
# NCCL 2.19.3 (cf. https://github.com/aws-samples/awsome-distributed-training/pull/174#discussion_r1519045216);
# we keep the instructions above for future updates.

# NCCL EFA plugin
RUN mkdir -p /tmp && \
    cd /tmp && \
    curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
    cd /tmp/aws-ofi-nccl && \
    ./autogen.sh && \
    ./configure --prefix=/opt/amazon/efa \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
        --with-mpi=/opt/amazon/openmpi && \
    make -j$(nproc) install && \
    rm -rf /tmp/aws-ofi-nccl

# Do this to minimize the ld path env vars that users need to define when running this image.
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
    echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
    ldconfig

ENV OMPI_MCA_pml=^cm,ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \
    OPAL_PREFIX=/opt/amazon/openmpi \
    # https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
    # https://github.com/pytorch/pytorch/issues/68893
    NCCL_SOCKET_IFNAME=^docker,lo

ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"

# NCCL-tests: always good to include these as a diagnostic tool.
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
    && cd /opt/nccl-tests \
    && git checkout ${NCCL_TESTS_VERSION} \
    && make MPI=1 \
        MPI_HOME=/opt/amazon/openmpi \
        CUDA_HOME=/usr/local/cuda \
        NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80"

# Install GPT-NeoX and its dependencies. Note: the optional pip installs must be full-line
# comments; a trailing `# comment \` inside the shell command would swallow the next `&&`.
RUN git clone https://github.com/EleutherAI/gpt-neox.git \
    && cd gpt-neox \
    && pip install -r requirements/requirements.txt \
    # Optional: logging with WandB
    && pip install -r requirements/requirements-wandb.txt \
    # Optional: logging with TensorBoard
    && pip install -r requirements/requirements-tensorboard.txt \
    # Optional: fused kernels
    && python ./megatron/fused_kernels/setup.py install
# Rebuild a newer flash-attn
RUN MAX_JOBS=192 FLASH_ATTENTION_FORCE_BUILD=TRUE pip install flash-attn==2.5.5 --upgrade
WORKDIR /workspace/gpt-neox
COPY src/c4_prepare_data.py c4_prepare_data.py
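Once the image is built and imported (see the build script below), the bundled NCCL tests give a quick way to validate EFA connectivity before launching training. A minimal sketch, assuming the sqsh file sits at /fsx/apps/gpt-neox.sqsh as in the scripts below and that the cluster runs Slurm with the Pyxis/Enroot plugin; adjust the image path and node counts to your cluster:

    # Sanity-check sketch: all_reduce across 2 nodes x 8 GPUs
    srun -N 2 --ntasks-per-node=8 \
        --container-image /fsx/apps/gpt-neox.sqsh \
        --container-mounts /fsx:/fsx \
        /opt/nccl-tests/build/all_reduce_perf -b 8 -e 2G -f 2 -g 1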
@@ -0,0 +1,24 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#SBATCH -N 1                              # number of nodes to use
#SBATCH --job-name=build-neox-image       # name of your job
#SBATCH --output=logs/%x_%j.out           # logfile for stdout
#SBATCH --error=logs/%x_%j.err            # logfile for stderr; remove it to merge both outputs

set -euxo pipefail

# Default variables for Enroot; if these variables are already defined, use them.
: "${APPS_PATH:=/fsx/apps}"
: "${IMAGE:=$APPS_PATH/gpt-neox.sqsh}"

ENROOT_IMAGE=gpt-neox
docker build -t ${ENROOT_IMAGE} -f 0.gpt-neox.dockerfile .
# Remove the old sqsh file if it exists
if [ -f ${ENROOT_IMAGE}.sqsh ] ; then
    rm ${ENROOT_IMAGE}.sqsh
fi
enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest
mv ${ENROOT_IMAGE}.sqsh ${IMAGE}
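This job must run on a node where Docker is available. A sketch of a typical submission; the script filename is illustrative, so use whatever this file is named in your checkout:

    mkdir -p logs                                    # the #SBATCH headers above write logs/%x_%j.{out,err}
    APPS_PATH=/fsx/apps sbatch 1.build-image.sbatch  # override defaults via the environment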
@@ -0,0 +1,28 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#SBATCH -N 1                              # number of nodes to use
#SBATCH --job-name=neox-dataprep          # name of your job
#SBATCH --output=logs/%x_%j.out           # logfile for stdout
#SBATCH --error=logs/%x_%j.err            # logfile for stderr; remove it to merge both outputs
#SBATCH --exclusive

# Default variables for Enroot; if these variables are already defined, use them.
: "${APPS_PATH:=/fsx/apps}"
: "${IMAGE:=$APPS_PATH/gpt-neox.sqsh}"
: "${FSX_PATH:=/fsx}"
: "${DATASET:=c4_subset}"
: "${DATA_PATH:=$FSX_PATH/$DATASET}"
: "${MODEL_PATH:=$FSX_PATH/gpt-neox}"
: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}"

echo "Retrieve and preprocess ${DATASET}"

declare -a ARGS=(
    --container-image $IMAGE
    --container-mounts $CONTAINER_MOUNT
)
srun -l "${ARGS[@]}" python c4_prepare_data.py -d ${DATA_PATH} -t HFTokenizer \
    --vocab-file ${MODEL_PATH}/tokenizers/20B_tokenizer.json ${DATASET}
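The job expects the GPT-NeoX 20B tokenizer at ${MODEL_PATH}/tokenizers/20B_tokenizer.json. A sketch of staging it before submitting; the download URL is the one referenced in the GPT-NeoX README at the time of writing, so verify it before relying on it:

    mkdir -p /fsx/gpt-neox/tokenizers logs
    curl -L -o /fsx/gpt-neox/tokenizers/20B_tokenizer.json \
        https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json
    DATASET=c4_subset sbatch 2.dataprep.sbatch       # script name is illustrative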
@@ -0,0 +1,64 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#SBATCH --job-name="neox"
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8                 # number of GPUs per node
#SBATCH --gres=gpu:8
#SBATCH --output=logs/%x_%j.out           # logfile for stdout
#SBATCH --error=logs/%x_%j.err            # logfile for stderr; remove it to merge both outputs
#SBATCH --wait-all-nodes=1
#SBATCH --exclusive
set -uxo pipefail

# Default variables for Enroot; if these variables are already defined, use them.
: "${APPS_PATH:=/fsx/apps}"
: "${IMAGE:=$APPS_PATH/gpt-neox.sqsh}"
: "${FSX_PATH:=/fsx}"
: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}"

## EFA settings
export FI_LOG_LEVEL=1
export FI_PROVIDER=efa                    # change to eth if you want to use ENA for comparisons
export FI_EFA_USE_HUGE_PAGE=0
# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
# https://github.com/pytorch/pytorch/issues/68893
export NCCL_SOCKET_IFNAME=en
export NCCL_ASYNC_ERROR_HANDLING=1
#export NCCL_DEBUG=INFO

export MODEL_CONFIG=${PWD}/configs/pythia/1-4B.json
#export MODEL_CONFIG=${PWD}/configs/pythia/2-8B.json
#export MODEL_CONFIG=${PWD}/configs/pythia/12B.json

# Some potentially useful distributed environment variables
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)
NODES=( $(scontrol show hostnames "$SLURM_JOB_NODELIST") )   # bash arrays cannot be exported
export HEAD_NODE=${NODES[0]}
export MASTER_ADDR=$(hostname --ip-address)
export MASTER_PORT=$RANDOM
export NNODES=$SLURM_JOB_NUM_NODES
export NPROC=$SLURM_GPUS_PER_NODE
export WORLD_SIZE=$(( NNODES * NPROC ))

declare -a ARGS=(
    --container-image $IMAGE
    --container-mounts $CONTAINER_MOUNT
)

# Torchrun arguments, kept for reference only: the launch below goes through deepy.py, which
# handles distributed start-up itself, so this array is not passed to srun.
declare -a TORCHRUN_ARGS=(
    --master_addr $MASTER_ADDR
    --master_port $MASTER_PORT
    --nproc_per_node=8                    # change this to match the number of GPUs per node
    --nnodes=$SLURM_JOB_NUM_NODES
    --rdzv_id=$SLURM_JOB_ID
    --rdzv_backend=c10d
    --rdzv_endpoint=$(hostname)
)

srun -l "${ARGS[@]}" python deepy.py train.py ${MODEL_CONFIG}
@@ -0,0 +1,12 @@
ENROOT_IMAGE=gpt-neox

all: build clean import

build:
	docker build -t ${ENROOT_IMAGE} -f 0.gpt-neox.dockerfile .

clean:
	-rm ${ENROOT_IMAGE}.sqsh

import:
	enroot import -o ${ENROOT_IMAGE}.sqsh dockerd://${ENROOT_IMAGE}:latest
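The Makefile wraps the same build-and-import flow as the sbatch script above; `make` runs build, clean, and import in sequence, and single targets can be invoked on their own:

    make           # full cycle: docker build, remove any stale sqsh, enroot import
    make import    # re-import only, e.g. after rebuilding the image by hand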