-
Notifications
You must be signed in to change notification settings - Fork 93
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #250 from aws-samples/nemo-on-eks
Nemo on eks
- Loading branch information
Showing
13 changed files
with
825 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# SPDX-License-Identifier: MIT-0 | ||
|
||
# DOCKER_BUILDKIT=1 docker build --progress plain -t aws-nemo-megatron:latest . | ||
|
||
FROM nvcr.io/nvidia/nemo:24.01.framework | ||
|
||
ENV DEBIAN_FRONTEND=noninteractive | ||
ENV EFA_INSTALLER_VERSION=1.30.0 | ||
ENV AWS_OFI_NCCL_VERSION=1.8.1-aws | ||
ENV NCCL_VERSION=2.20.3-1 | ||
ENV NCCL_TESTS_VERSION=2.13.9 | ||
|
||
RUN apt-get update -y | ||
RUN apt-get remove -y --allow-change-held-packages \ | ||
libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 | ||
|
||
RUN rm -rf /opt/hpcx/ompi \ | ||
&& rm -rf /usr/local/mpi \ | ||
&& rm -fr /opt/hpcx/nccl_rdma_sharp_plugin \ | ||
&& ldconfig | ||
ENV OPAL_PREFIX= | ||
RUN apt-get install -y --allow-unauthenticated \ | ||
git \ | ||
gcc \ | ||
vim \ | ||
kmod \ | ||
openssh-client \ | ||
openssh-server \ | ||
build-essential \ | ||
curl \ | ||
autoconf \ | ||
libtool \ | ||
gdb \ | ||
automake \ | ||
cmake \ | ||
apt-utils \ | ||
libhwloc-dev \ | ||
aptitude && \ | ||
apt autoremove -y | ||
|
||
# Uncomment below stanza to install the latest NCCL | ||
# Require efa-installer>=1.29.0 for nccl-2.19.0 to avoid libfabric gave NCCL error. | ||
|
||
RUN apt-get remove -y libnccl2 libnccl-dev \ | ||
&& cd /tmp \ | ||
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ | ||
&& cd nccl \ | ||
&& make -j src.build BUILDDIR=/usr/local \ | ||
# nvcc to target p5 and p4 instances | ||
NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80" \ | ||
&& rm -rf /tmp/nccl | ||
|
||
# EFA | ||
RUN apt-get update && \ | ||
apt-get install -y libhwloc-dev && \ | ||
cd /tmp && \ | ||
curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ | ||
tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ | ||
cd aws-efa-installer && \ | ||
./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ | ||
ldconfig && \ | ||
rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* | ||
|
||
# NCCL EFA Plugin | ||
RUN mkdir -p /tmp && \ | ||
cd /tmp && \ | ||
curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ | ||
tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ | ||
rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ | ||
mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \ | ||
cd /tmp/aws-ofi-nccl && \ | ||
./autogen.sh && \ | ||
./configure --prefix=/opt/amazon/efa \ | ||
--with-libfabric=/opt/amazon/efa \ | ||
--with-cuda=/usr/local/cuda \ | ||
--enable-platform-aws \ | ||
--with-mpi=/opt/amazon/openmpi && \ | ||
make -j$(nproc) install && \ | ||
rm -rf /tmp/aws-ofi/nccl | ||
|
||
# NCCL | ||
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \ | ||
echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \ | ||
ldconfig | ||
|
||
ENV OMPI_MCA_pml=^cm,ucx \ | ||
OMPI_MCA_btl=tcp,self \ | ||
OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \ | ||
OPAL_PREFIX=/opt/amazon/openmpi \ | ||
NCCL_SOCKET_IFNAME=^docker,lo | ||
|
||
# NCCL-tests | ||
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ | ||
&& cd /opt/nccl-tests \ | ||
&& git checkout ${NCCL_TESTS_VERSION} \ | ||
&& make MPI=1 \ | ||
MPI_HOME=/opt/amazon/openmpi \ | ||
CUDA_HOME=/usr/local/cuda \ | ||
# nvcc to target p5 and p4 instances | ||
NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80" | ||
|
||
|
||
# To avoid this error | ||
#TypeError: Descriptors cannot not be created directly. | ||
#If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0. | ||
#If you cannot immediately regenerate your protos, some other possible workarounds are: | ||
# 1. Downgrade the protobuf package to 3.20.x or lower. | ||
# 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower). | ||
|
||
RUN pip install pip install protobuf==3.20.* |
Oops, something went wrong.