Skip to content

Commit

Permalink
Merge pull request #270 from aws-samples/improvements/#269_nccl_optimization
Browse files Browse the repository at this point in the history

Improvements/#269 nccl optimization
  • Loading branch information
mhuguesaws authored Apr 29, 2024
2 parents 70d9937 + 5567b42 commit b1e89bb
Show file tree
Hide file tree
Showing 7 changed files with 253 additions and 278 deletions.
199 changes: 132 additions & 67 deletions micro-benchmarks/nccl-tests/README.md

Large diffs are not rendered by default.

37 changes: 18 additions & 19 deletions micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,49 +22,48 @@ spec:
value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
- name: PATH
value: $PATH:/opt/amazon/efa/bin:/usr/bin
- name: XLA_FLAGS
value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
- name: TF_XLA_FLAGS
value: "--tf_xla_cpu_global_jit"
- name: NCCL_DEBUG
value: INFO
command:
- /opt/amazon/openmpi/bin/mpirun
- --allow-run-as-root
- --tag-output
- -np
- "16"
- -bind-to
- -N
- "8"
- --bind-to
- none
- -map-by
- slot
- -x
- PATH
- -x
- LD_LIBRARY_PATH
- -x
- XLA_FLAGS
- FI_PROVIDER=efa
- -x
- TF_XLA_FLAGS
- FI_EFA_USE_DEVICE_RDMA=1
- -x
- FI_EFA_FORK_SAFE=1
- -x
- NCCL_DEBUG=INFO
- -x
- NCCL_NVLS_ENABLE=1
- NCCL_BUFFSIZE=8388608
- -x
- NCCL_P2P_NET_CHUNKSIZE=524288
- --mca
- pml
- ^cm
- ^cm,ucx
- --mca
- btl
- tcp,self
- --mca
- pml_rsh_agent=ssh
- --oversubscribe
- btl_tcp_if_exclude
- lo,docker0,veth_def_agent
- /opt/nccl-tests/build/all_reduce_perf
- -b
- "1"
- "8"
- -e
- 2G
- "16G"
- -f
- "2"
- -t
- "1"
- -g
- "1"
- -c
Expand Down
92 changes: 58 additions & 34 deletions micro-benchmarks/nccl-tests/nccl-tests.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,55 @@
# SPDX-License-Identifier: MIT-0
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

ARG GDRCOPY_VERSION=v2.4.1
ARG EFA_INSTALLER_VERSION=1.31.0
ARG AWS_OFI_NCCL_VERSION=v1.8.1-aws
ARG NCCL_TESTS_VERSION=2.13.9
ARG NCCL_VERSION=2.20.3
ARG NCCL_VERSION=v2.20.3-1
ARG NCCL_TESTS_VERSION=v2.13.9

RUN apt-get update -y
RUN apt-get remove -y --allow-change-held-packages \
libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev
ibverbs-utils \
libibverbs-dev \
libibverbs1 \
libmlx5-1 \
libnccl2 \
libnccl-dev

RUN rm -rf /opt/hpcx \
&& rm -rf /usr/local/mpi \
&& rm -f /etc/ld.so.conf.d/hpcx.conf \
&& ldconfig

ENV OPAL_PREFIX=

RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
apt-utils \
autoconf \
automake \
build-essential \
check \
cmake \
curl \
debhelper \
devscripts \
git \
gcc \
vim \
gdb \
kmod \
libsubunit-dev \
libtool \
openssh-client \
openssh-server \
build-essential \
curl \
autoconf \
libtool \
gdb \
automake \
pkg-config \
python3-distutils \
cmake \
apt-utils \
devscripts \
debhelper \
libsubunit-dev \
check \
pkg-config
vim

RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

Expand All @@ -52,12 +60,14 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \

#################################################
## Install NVIDIA GDRCopy
#RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \
# && cd /opt/gdrcopy \
# && make lib_install install \
# && cd /opt/gdrcopy/tests \
# && make \
# && mv copylat copybw sanity apiperf /usr/bin/
RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
&& cd /tmp/gdrcopy \
&& make prefix=/opt/gdrcopy install

ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH
ENV LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH
ENV CPATH /opt/gdrcopy/include:$CPATH
ENV PATH /opt/gdrcopy/bin:$PATH

#################################################
## Install EFA installer
Expand All @@ -70,36 +80,50 @@ RUN cd $HOME \

###################################################
## Install NCCL
RUN git clone -b v${NCCL_VERSION}-1 https://github.com/NVIDIA/nccl.git /opt/nccl \
RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
&& cd /opt/nccl \
&& make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"

###################################################
## Install AWS-OFI-NCCL plugin
RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y
RUN export OPAL_PREFIX="" \
&& git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
&& cd /opt/aws-ofi-nccl \
&& git checkout ${AWS_OFI_NCCL_VERSION} \
&& ./autogen.sh \
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
&& tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
&& cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-mpi=/opt/amazon/openmpi \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-platform-aws \
&& make -j $(nproc) && make install
&& make -j $(nproc) \
&& make install \
&& cd .. \
&& rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
&& rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz

###################################################
## Install NCCL-tests
RUN git clone -b v${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
&& cd /opt/nccl-tests \
&& make -j $(nproc) \
MPI=1 \
MPI_HOME=/opt/amazon/openmpi/ \
CUDA_HOME=/usr/local/cuda \
NCCL_HOME=/opt/nccl/build \
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"

RUN rm -rf /var/lib/apt/lists/*

## Set Open MPI variables to exclude network interface and conduit.
ENV OMPI_MCA_pml=^cm,ucx \
OMPI_MCA_btl=tcp,self \
OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\
OPAL_PREFIX=/opt/amazon/openmpi \
NCCL_SOCKET_IFNAME=^docker,lo

## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
ENV PMIX_MCA_gds=hash

## Set LD_PRELOAD for NCCL library
ENV LD_PRELOAD /opt/nccl/build/lib/libnccl.so
74 changes: 0 additions & 74 deletions micro-benchmarks/nccl-tests/slurm/nccl-3collectives.sbatch

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,25 +1,39 @@
#!/bin/bash
#SBATCH -N 2
#SBATCH --exclusive

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#SBATCH --job-name=nccl-all_reduce_perf # name of your job
#SBATCH --nodes=2 # number of nodes to use, 24 p4d(e) = 192 A100 GPUs
#SBATCH --ntasks-per-node 8 # Number of GPU per node
#SBATCH --output %x_%j.out
#SBATCH --error %x_%j.err
#SBATCH --exclusive

# This script is designed to run on the Deep Learning AMI, Ubuntu 20.04
# See https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-20-04/
set -ex

# Get Hostname to Instance ID mapping
mpirun -N 1 bash -c 'echo $(hostname) ➡️ $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")'


### NCCL_BUFFSIZE increase the send queue depth and can turn NCCL communications into non-blocking.
### https://www.usenix.org/system/files/atc23-choi.pdf

### NCCL_P2P_NET_CHUNKSIZE Improve performance by increasing buffer size for Send/Recv, Gather, Scatter and Alltoall communications
### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html

# run all_reduce test
mpirun -n $((8 * SLURM_JOB_NUM_NODES)) -N 8 \
-x FI_PROVIDER=efa \
-x FI_EFA_USE_DEVICE_RDMA=1 \
-x RDMAV_FORK_SAFE=1 \
-x FI_EFA_FORK_SAFE=1 \
-x LD_LIBRARY_PATH=/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/aws-ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \
-x NCCL_DEBUG=INFO \
--mca pml ^cm \
-x NCCL_BUFFSIZE=8388608 \
-x NCCL_P2P_NET_CHUNKSIZE=524288 \
--mca pml ^cm,ucx \
--mca btl tcp,self \
--mca btl_tcp_if_exclude lo,docker0,veth_def_agent \
--bind-to none /usr/local/cuda-12.2/efa/test-cuda-12.2/all_reduce_perf -b 8 -e 2G -f 2 -g 1 -c 1 -n 100
--bind-to none /usr/local/cuda-12.2/efa/test-cuda-12.2/all_reduce_perf -b 8 -e 16G -f 2 -g 1 -c 1 -n 100

Loading

0 comments on commit b1e89bb

Please sign in to comment.