Skip to content

Commit

Permalink
Merge pull request #250 from aws-samples/nemo-on-eks
Browse files Browse the repository at this point in the history
Nemo on eks
  • Loading branch information
mhuguesaws authored Apr 25, 2024
2 parents 78d9066 + 55f2f2c commit bf61f1b
Show file tree
Hide file tree
Showing 13 changed files with 825 additions and 0 deletions.
111 changes: 111 additions & 0 deletions 3.test_cases/2.nemo-launcher/EKS/0.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# DOCKER_BUILDKIT=1 docker build --progress plain -t aws-nemo-megatron:latest .

FROM nvcr.io/nvidia/nemo:24.01.framework

ENV DEBIAN_FRONTEND=noninteractive
ENV EFA_INSTALLER_VERSION=1.30.0
ENV AWS_OFI_NCCL_VERSION=1.8.1-aws
ENV NCCL_VERSION=2.20.3-1
ENV NCCL_TESTS_VERSION=2.13.9

RUN apt-get update -y
RUN apt-get remove -y --allow-change-held-packages \
libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1

RUN rm -rf /opt/hpcx/ompi \
&& rm -rf /usr/local/mpi \
&& rm -fr /opt/hpcx/nccl_rdma_sharp_plugin \
&& ldconfig
ENV OPAL_PREFIX=
RUN apt-get install -y --allow-unauthenticated \
git \
gcc \
vim \
kmod \
openssh-client \
openssh-server \
build-essential \
curl \
autoconf \
libtool \
gdb \
automake \
cmake \
apt-utils \
libhwloc-dev \
aptitude && \
apt autoremove -y

# Uncomment below stanza to install the latest NCCL
# Require efa-installer>=1.29.0 for nccl-2.19.0 to avoid libfabric gave NCCL error.

RUN apt-get remove -y libnccl2 libnccl-dev \
&& cd /tmp \
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
&& cd nccl \
&& make -j src.build BUILDDIR=/usr/local \
# nvcc to target p5 and p4 instances
NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80" \
&& rm -rf /tmp/nccl

# EFA
RUN apt-get update && \
apt-get install -y libhwloc-dev && \
cd /tmp && \
curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
cd aws-efa-installer && \
./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \
ldconfig && \
rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*

# NCCL EFA Plugin
RUN mkdir -p /tmp && \
cd /tmp && \
curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
cd /tmp/aws-ofi-nccl && \
./autogen.sh && \
./configure --prefix=/opt/amazon/efa \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-platform-aws \
--with-mpi=/opt/amazon/openmpi && \
make -j$(nproc) install && \
rm -rf /tmp/aws-ofi/nccl

# NCCL
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
ldconfig

ENV OMPI_MCA_pml=^cm,ucx \
OMPI_MCA_btl=tcp,self \
OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \
OPAL_PREFIX=/opt/amazon/openmpi \
NCCL_SOCKET_IFNAME=^docker,lo

# NCCL-tests
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
&& cd /opt/nccl-tests \
&& git checkout ${NCCL_TESTS_VERSION} \
&& make MPI=1 \
MPI_HOME=/opt/amazon/openmpi \
CUDA_HOME=/usr/local/cuda \
# nvcc to target p5 and p4 instances
NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80"


# To avoid this error
#TypeError: Descriptors cannot not be created directly.
#If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
#If you cannot immediately regenerate your protos, some other possible workarounds are:
# 1. Downgrade the protobuf package to 3.20.x or lower.
# 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

RUN pip install pip install protobuf==3.20.*
Loading

0 comments on commit bf61f1b

Please sign in to comment.