Skip to content

Commit

Permalink
Merge pull request #210 from aws-samples/improvements/#209_nccl_version_validation
Browse files Browse the repository at this point in the history

Improvements/#209 nccl version validation
  • Loading branch information
mhuguesaws authored Mar 12, 2024
2 parents 8faf0df + f6b803a commit 1647104
Showing 1 changed file with 10 additions and 12 deletions.
22 changes: 10 additions & 12 deletions 4.validation_and_observability/0.nccl-tests/0.nccl-tests.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
# SPDX-License-Identifier: MIT-0
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

ARG EFA_INSTALLER_VERSION=1.28.0
ARG AWS_OFI_NCCL_VERSION=v1.7.3-aws
ARG NCCL_TESTS_VERSION=master
ARG NCCL_VERSION=2.18.5
ARG EFA_INSTALLER_VERSION=1.30.0
ARG AWS_OFI_NCCL_VERSION=v1.8.1-aws
ARG NCCL_TESTS_VERSION=2.13.9
ARG NCCL_VERSION=2.19.4

RUN apt-get update -y
RUN apt-get remove -y --allow-change-held-packages \
Expand Down Expand Up @@ -70,7 +70,7 @@ RUN cd $HOME \

###################################################
## Install NCCL
RUN git clone https://github.com/NVIDIA/nccl -b v${NCCL_VERSION}-1 /opt/nccl \
RUN git clone -b v${NCCL_VERSION}-1 https://github.com/NVIDIA/nccl.git /opt/nccl \
&& cd /opt/nccl \
&& make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
Expand All @@ -84,24 +84,22 @@ RUN export OPAL_PREFIX="" \
&& git checkout ${AWS_OFI_NCCL_VERSION} \
&& ./autogen.sh \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-libfabric=/opt/amazon/efa/ \
--with-cuda=/usr/local/cuda \
--with-nccl=/opt/nccl/build \
--with-mpi=/opt/amazon/openmpi/ \
--with-mpi=/opt/amazon/openmpi \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-platform-aws \
&& make -j $(nproc) && make install

###################################################
## Install NCCL-tests
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
RUN git clone -b v${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
&& cd /opt/nccl-tests \
&& git checkout ${NCCL_TESTS_VERSION} \
&& make -j $(nproc) \
MPI=1 \
MPI_HOME=/opt/amazon/openmpi/ \
CUDA_HOME=/usr/local/cuda \
NCCL_HOME=/opt/nccl/build \
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"

ENV NCCL_PROTO simple
RUN rm -rf /var/lib/apt/lists/*
ENV LD_PRELOAD /opt/nccl/build/lib/libnccl.so

0 comments on commit 1647104

Please sign in to comment.