Skip to content

Commit

Permalink
update versions
Browse files — browse the repository at this point in the history
  • Loading branch information
Pavani-Panakanti committed Nov 7, 2024
1 parent a564165 commit 930ef19
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ spec:
- -x
- NCCL_DEBUG=INFO
- -x
- NCCL_ALGO=RING
- NCCL_BUFFSIZE={{.NcclBuffSize}}
- -x
- NCCL_P2P_NET_CHUNKSIZE="524288"
- -x
- NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so
- --mca
Expand Down
4 changes: 4 additions & 0 deletions e2e2/test/cases/nvidia/mpi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ type ncclTestManifestTplVars struct {
NvidiaTestImage string
EfaInterfacePerNode int
MaxBytes string
NcclBuffSize string
}

func TestMPIJobPytorchTraining(t *testing.T) {
Expand Down Expand Up @@ -97,9 +98,11 @@ func TestMPIJobPytorchTraining(t *testing.T) {
t.Fatal(fmt.Errorf("nvidiaTestImage must be set to run unit test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/nvidia/Dockerfile to build the image and -nvidiaTestImage to set the image url"))
}
maxBytes := "2G"
ncclBuffSize := "4194304"
if slices.Contains(instanceSupportsRdmaRead, *nodeType) {
t.Log("Instance supports RDMA")
maxBytes = "16G"
ncclBuffSize = "8388608"
}
renderedMpiJobNcclTestMultiNodeManifest, err := fwext.RenderManifests(mpiJobNcclTestMultiNodeManifest, ncclTestManifestTplVars{
// one of the nodes will be used for the master pod
Expand All @@ -109,6 +112,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
NvidiaTestImage: *nvidiaTestImage,
EfaInterfacePerNode: efaPerNode,
MaxBytes: maxBytes,
NcclBuffSize: ncclBuffSize,
})
if err != nil {
t.Fatal(err)
Expand Down
11 changes: 5 additions & 6 deletions e2e2/test/images/nvidia/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ARG UBUNTU_MAJOR_VERSION=22

ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=5
ARG CUDA_MINOR_VERSION=6

# Start with the NVIDIA CUDA base image
FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.1-devel-ubuntu$UBUNTU_MAJOR_VERSION.04
Expand Down Expand Up @@ -41,7 +41,6 @@ RUN apt install -y \
cmake \
apt-utils \
libhwloc-dev \
cuda-demo-suite-$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION \
datacenter-gpu-manager

RUN mkdir -p /var/run/sshd \
Expand All @@ -64,11 +63,11 @@ RUN cd /tmp \
# Install NCCL
RUN apt update \
&& apt install -y \
libnccl2=2.22.3-1+cuda12.5 \
libnccl-dev=2.22.3-1+cuda12.5
libnccl2=2.23.4-1+cuda12.6 \
libnccl-dev=2.23.4-1+cuda12.6

# Install AWS-OFI-NCCL plugin
ARG AWS_OFI_NCCL_VERSION=1.11.0-aws
ARG AWS_OFI_NCCL_VERSION=1.12.1-aws
RUN cd tmp \
&& curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz \
&& cd aws-ofi-nccl-$AWS_OFI_NCCL_VERSION \
Expand All @@ -82,7 +81,7 @@ RUN cd tmp \
&& make install

# Install NCCL Tests
ARG NCCL_TESTS_VERSION=2.13.10
ARG NCCL_TESTS_VERSION=2.13.11
RUN cd /tmp \
&& curl -sL https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v$NCCL_TESTS_VERSION.tar.gz | tar xvz \
&& cd nccl-tests-$NCCL_TESTS_VERSION \
Expand Down

0 comments on commit 930ef19

Please sign in to comment.