Skip to content

Commit

Permalink
Update bert training dockerfile to include amazon specific packages f…
Browse files Browse the repository at this point in the history
…or MPI, NCCL, and EFA.
  • Loading branch information
mattcjo committed Jul 11, 2024
1 parent b5aedc7 commit 7f9480b
Showing 1 changed file with 57 additions and 7 deletions.
64 changes: 57 additions & 7 deletions e2e2/test/images/bert-training/Dockerfile.bert-training
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,61 @@ COPY requirements.txt /app/
# Upgrade pip, then install the Python dependencies from requirements.txt.
# --no-cache-dir is passed to both commands so pip's download cache is not
# written into the layer, and both commands go through `python -m pip` so they
# are guaranteed to use the same interpreter.
RUN python -m pip install --no-cache-dir --upgrade pip && \
    python -m pip install --no-cache-dir -r requirements.txt

# Install the OpenMPI packages and drop the apt package lists in the same layer
RUN apt-get update \
    && apt-get install -y \
        libopenmpi-dev \
        openmpi-bin \
        openmpi-common \
    && rm -rf /var/lib/apt/lists/*
# Build-time version selectors (override with --build-arg).
# EFA_INSTALLER_VERSION is substituted into the aws-efa-installer tarball name that is downloaded.
ARG EFA_INSTALLER_VERSION=latest
# 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
# The value is checked out from the aws-ofi-nccl repository as v<version>-aws.
ARG AWS_OFI_NCCL_VERSION=1.9.1
# NOTE(review): NCCL_TESTS_VERSION is not referenced in the visible part of this file — confirm it is still needed.
ARG NCCL_TESTS_VERSION=master

# Refresh the package index, remove the preinstalled verbs/NCCL packages and
# the HPC-X, MPI and UCX directories, rebuild the linker cache, then install
# the build, SSH and debugging tools.
RUN apt-get update -y \
    && apt-get remove -y --allow-change-held-packages \
        libmlx5-1 \
        ibverbs-utils \
        libibverbs-dev \
        libibverbs1 \
        libnccl2 \
        libnccl-dev \
    && rm -rf \
        /opt/hpcx \
        /usr/local/mpi \
        /usr/local/ucx \
        /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
        apt-utils \
        autoconf \
        automake \
        build-essential \
        check \
        cmake \
        curl \
        debhelper \
        devscripts \
        gcc \
        gdb \
        git \
        kmod \
        libhwloc-dev \
        libsubunit-dev \
        libtool \
        openssh-client \
        openssh-server \
        pkg-config \
        python3-distutils \
        sudo \
        vim \
        wget

# SSH configuration: create the sshd runtime directory, set
# StrictHostKeyChecking to "no" and send known hosts to /dev/null in the
# client config, and set StrictModes to "no" in the server config.
RUN mkdir -p /var/run/sshd \
    && sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# Set environment variables for OpenMPI and CUDA.
# The Amazon OpenMPI, EFA, aws-ofi-nccl and CUDA directories are placed ahead
# of the inherited search paths. Written in key=value form because the
# space-separated `ENV key value` syntax is deprecated.
ENV LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH=/usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA.
# Downloads the aws-efa-installer tarball selected by EFA_INSTALLER_VERSION,
# unpacks it, runs the installer, and then deletes both the extracted
# directory and the tarball in this same layer so neither stays in the image.
# curl -f makes an HTTP error fail the build instead of saving the error page
# as the tarball; -L follows redirects; -sS hides the progress meter but still
# prints errors.
RUN cd $HOME \
    && curl -fsSL -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && rm -rf $HOME/aws-efa-installer $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz

# Install NCCL.
# Deletes the old repository signing key, installs the cuda-keyring package,
# then installs pinned libnccl2 / libnccl-dev versions.
# - apt-get with -y and DEBIAN_FRONTEND=noninteractive: the build has no
#   terminal, so a confirmation prompt would abort the install.
# - no sudo: it is unnecessary when the build already runs as root.
# - curl -f fails the build on an HTTP error instead of saving the error page.
# - the keyring .deb is removed in this layer so it is not left in the image.
# NOTE(review): no `apt-get update` runs after the keyring is installed, so the
# pinned versions must resolve from package lists fetched by an earlier layer.
# NOTE(review): the keyring comes from the ubuntu1804 repository path — confirm
# that matches the base image.
RUN apt-key del 7fa2af80 && \
    curl -fsSL -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    rm -f cuda-keyring_1.0-1_all.deb && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2

# Build and install the AWS-OFI-NCCL plugin from source.
# Clones the repository into /opt/aws-ofi-nccl, checks out the ref derived
# from AWS_OFI_NCCL_VERSION, and installs under /opt/aws-ofi-nccl/install,
# configured against the EFA libfabric, CUDA and Amazon OpenMPI directories.
RUN export OPAL_PREFIX="" \
    && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
    && cd /opt/aws-ofi-nccl \
    && git checkout v${AWS_OFI_NCCL_VERSION}-aws \
    && ./autogen.sh \
    && ./configure \
        --prefix=/opt/aws-ofi-nccl/install \
        --with-libfabric=/opt/amazon/efa/ \
        --with-cuda=/usr/local/cuda \
        --with-mpi=/opt/amazon/openmpi/ \
    && make \
    && make install

# Defaults for MASTER_ADDR and MASTER_PORT, intended for local testing
ENV MASTER_ADDR=127.0.0.1 \
    MASTER_PORT=12355

# Prepend the /usr/lib64/openmpi bin and lib directories to the search paths
ENV PATH="/usr/lib64/openmpi/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}"
# Set environment variables for NCCL and clean up.
# Written in key=value form because the space-separated `ENV key value`
# syntax is deprecated.
ENV NCCL_PROTO=simple
# Remove the apt package lists. Because this runs in its own layer it only
# hides the files from the final filesystem; layers that created them keep
# their size.
RUN rm -rf /var/lib/apt/lists/*
# Ensure NCCL library is found first
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

0 comments on commit 7f9480b

Please sign in to comment.