Skip to content

Commit

Permalink
Merge pull request #270 from aws-samples/improvements/#269_nccl_optimization
Browse files Browse the repository at this point in the history

Improvements/#269 nccl optimization
  • Loading branch information
mhuguesaws authored Apr 29, 2024
2 parents 70d9937 + 5567b42 commit b1e89bb
Show file tree
Hide file tree
Showing 7 changed files with 253 additions and 278 deletions.
199 changes: 132 additions & 67 deletions micro-benchmarks/nccl-tests/README.md

Large diffs are not rendered by default.

37 changes: 18 additions & 19 deletions micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,49 +22,48 @@ spec:
value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
- name: PATH
value: $PATH:/opt/amazon/efa/bin:/usr/bin
- name: XLA_FLAGS
value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
- name: TF_XLA_FLAGS
value: "--tf_xla_cpu_global_jit"
- name: NCCL_DEBUG
value: INFO
command:
- /opt/amazon/openmpi/bin/mpirun
- --allow-run-as-root
- --tag-output
- -np
- "16"
- -bind-to
- -N
- "8"
- --bind-to
- none
- -map-by
- slot
- -x
- PATH
- -x
- LD_LIBRARY_PATH
- -x
- XLA_FLAGS
- FI_PROVIDER=efa
- -x
- TF_XLA_FLAGS
- FI_EFA_USE_DEVICE_RDMA=1
- -x
- FI_EFA_FORK_SAFE=1
- -x
- NCCL_DEBUG=INFO
- -x
- NCCL_NVLS_ENABLE=1
- NCCL_BUFFSIZE=8388608
- -x
- NCCL_P2P_NET_CHUNKSIZE=524288
- --mca
- pml
- ^cm
- ^cm,ucx
- --mca
- btl
- tcp,self
- --mca
- pml_rsh_agent=ssh
- --oversubscribe
- btl_tcp_if_exclude
- lo,docker0,veth_def_agent
- /opt/nccl-tests/build/all_reduce_perf
- -b
- "1"
- "8"
- -e
- 2G
- "16G"
- -f
- "2"
- -t
- "1"
- -g
- "1"
- -c
Expand Down
92 changes: 58 additions & 34 deletions micro-benchmarks/nccl-tests/nccl-tests.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,55 @@
# SPDX-License-Identifier: MIT-0
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

ARG GDRCOPY_VERSION=v2.4.1
ARG EFA_INSTALLER_VERSION=1.31.0
ARG AWS_OFI_NCCL_VERSION=v1.8.1-aws
ARG NCCL_TESTS_VERSION=2.13.9
ARG NCCL_VERSION=2.20.3
ARG NCCL_VERSION=v2.20.3-1
ARG NCCL_TESTS_VERSION=v2.13.9

RUN apt-get update -y
RUN apt-get remove -y --allow-change-held-packages \
libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev
ibverbs-utils \
libibverbs-dev \
libibverbs1 \
libmlx5-1 \
libnccl2 \
libnccl-dev

RUN rm -rf /opt/hpcx \
&& rm -rf /usr/local/mpi \
&& rm -f /etc/ld.so.conf.d/hpcx.conf \
&& ldconfig

ENV OPAL_PREFIX=

RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
apt-utils \
autoconf \
automake \
build-essential \
check \
cmake \
curl \
debhelper \
devscripts \
git \
gcc \
vim \
gdb \
kmod \
libsubunit-dev \
libtool \
openssh-client \
openssh-server \
build-essential \
curl \
autoconf \
libtool \
gdb \
automake \
pkg-config \
python3-distutils \
cmake \
apt-utils \
devscripts \
debhelper \
libsubunit-dev \
check \
pkg-config
vim

RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

Expand All @@ -52,12 +60,14 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \

#################################################
## Install NVIDIA GDRCopy
#RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \
# && cd /opt/gdrcopy \
# && make lib_install install \
# && cd /opt/gdrcopy/tests \
# && make \
# && mv copylat copybw sanity apiperf /usr/bin/
RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
&& cd /tmp/gdrcopy \
&& make prefix=/opt/gdrcopy install

ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH
ENV LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH
ENV CPATH /opt/gdrcopy/include:$CPATH
ENV PATH /opt/gdrcopy/bin:$PATH

#################################################
## Install EFA installer
Expand All @@ -70,36 +80,50 @@ RUN cd $HOME \

###################################################
## Install NCCL
RUN git clone -b v${NCCL_VERSION}-1 https://github.com/NVIDIA/nccl.git /opt/nccl \
RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
&& cd /opt/nccl \
&& make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"

###################################################
## Install AWS-OFI-NCCL plugin
RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y
RUN export OPAL_PREFIX="" \
&& git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
&& cd /opt/aws-ofi-nccl \
&& git checkout ${AWS_OFI_NCCL_VERSION} \
&& ./autogen.sh \
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
&& tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
&& cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-mpi=/opt/amazon/openmpi \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-platform-aws \
&& make -j $(nproc) && make install
&& make -j $(nproc) \
&& make install \
&& cd .. \
&& rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
&& rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz

###################################################
## Install NCCL-tests
RUN git clone -b v${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
&& cd /opt/nccl-tests \
&& make -j $(nproc) \
MPI=1 \
MPI_HOME=/opt/amazon/openmpi/ \
CUDA_HOME=/usr/local/cuda \
NCCL_HOME=/opt/nccl/build \
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"

RUN rm -rf /var/lib/apt/lists/*

## Set Open MPI variables to exclude network interface and conduit.
ENV OMPI_MCA_pml=^cm,ucx \
OMPI_MCA_btl=tcp,self \
OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\
OPAL_PREFIX=/opt/amazon/openmpi \
NCCL_SOCKET_IFNAME=^docker,lo

## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
ENV PMIX_MCA_gds=hash

## Set LD_PRELOAD for NCCL library
ENV LD_PRELOAD /opt/nccl/build/lib/libnccl.so
74 changes: 0 additions & 74 deletions micro-benchmarks/nccl-tests/slurm/nccl-3collectives.sbatch

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,25 +1,39 @@
#!/bin/bash
#SBATCH -N 2
#SBATCH --exclusive

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#SBATCH --job-name=nccl-all_reduce_perf # name of your job
#SBATCH --nodes=2 # number of nodes to use, 24 p4d(e) = 192 A100 GPUs
#SBATCH --ntasks-per-node 8 # Number of GPU per node
#SBATCH --output %x_%j.out
#SBATCH --error %x_%j.err
#SBATCH --exclusive

# This script is designed to run on the Deep Learning AMI, Ubuntu 20.04
# See https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-20-04/
set -ex

# Get Hostname to Instance ID mapping
mpirun -N 1 bash -c 'echo $(hostname) ➡️ $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")'


### NCCL_BUFFSIZE increase the send queue depth and can turn NCCL communications into non-blocking.
### https://www.usenix.org/system/files/atc23-choi.pdf

### NCCL_P2P_NET_CHUNKSIZE Improve performance by increasing buffer size for Send/Recv, Gather, Scatter and Alltoall communications
### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html

# run all_reduce test
mpirun -n $((8 * SLURM_JOB_NUM_NODES)) -N 8 \
-x FI_PROVIDER=efa \
-x FI_EFA_USE_DEVICE_RDMA=1 \
-x RDMAV_FORK_SAFE=1 \
-x FI_EFA_FORK_SAFE=1 \
-x LD_LIBRARY_PATH=/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/aws-ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \
-x NCCL_DEBUG=INFO \
--mca pml ^cm \
-x NCCL_BUFFSIZE=8388608 \
-x NCCL_P2P_NET_CHUNKSIZE=524288 \
--mca pml ^cm,ucx \
--mca btl tcp,self \
--mca btl_tcp_if_exclude lo,docker0,veth_def_agent \
--bind-to none /usr/local/cuda-12.2/efa/test-cuda-12.2/all_reduce_perf -b 8 -e 2G -f 2 -g 1 -c 1 -n 100
--bind-to none /usr/local/cuda-12.2/efa/test-cuda-12.2/all_reduce_perf -b 8 -e 16G -f 2 -g 1 -c 1 -n 100

Loading

0 comments on commit b1e89bb

Please sign in to comment.