From 930ef1970e9542be9128de0e16d10e7dbebd48f8 Mon Sep 17 00:00:00 2001 From: Pavani Panakanti Date: Thu, 7 Nov 2024 21:19:01 +0000 Subject: [PATCH] update versions --- .../manifests/mpi-job-nccl-test-multi-node.yaml | 4 +++- e2e2/test/cases/nvidia/mpi_test.go | 4 ++++ e2e2/test/images/nvidia/Dockerfile | 11 +++++------ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/e2e2/test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml b/e2e2/test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml index fe85fdd98..b0002391a 100644 --- a/e2e2/test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml +++ b/e2e2/test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml @@ -37,7 +37,9 @@ spec: - -x - NCCL_DEBUG=INFO - -x - - NCCL_ALGO=RING + - NCCL_BUFFSIZE={{.NcclBuffSize}} + - -x + - NCCL_P2P_NET_CHUNKSIZE="524288" - -x - NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so - --mca diff --git a/e2e2/test/cases/nvidia/mpi_test.go b/e2e2/test/cases/nvidia/mpi_test.go index 8deb8fda2..41a73f3fe 100644 --- a/e2e2/test/cases/nvidia/mpi_test.go +++ b/e2e2/test/cases/nvidia/mpi_test.go @@ -39,6 +39,7 @@ type ncclTestManifestTplVars struct { NvidiaTestImage string EfaInterfacePerNode int MaxBytes string + NcclBuffSize string } func TestMPIJobPytorchTraining(t *testing.T) { @@ -97,9 +98,11 @@ func TestMPIJobPytorchTraining(t *testing.T) { t.Fatal(fmt.Errorf("nvidiaTestImage must be set to run unit test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/nvidia/Dockerfile to build the image and -nvidiaTestImage to set the image url")) } maxBytes := "2G" + ncclBuffSize := "4194304" if slices.Contains(instanceSupportsRdmaRead, *nodeType) { t.Log("Instance supports RDMA") maxBytes = "16G" + ncclBuffSize = "8388608" } renderedMpiJobNcclTestMultiNodeManifest, err := fwext.RenderManifests(mpiJobNcclTestMultiNodeManifest, ncclTestManifestTplVars{ // one of the nodes will be used for the master pod @@ -109,6 +112,7 @@ func TestMPIJobPytorchTraining(t *testing.T) { NvidiaTestImage: *nvidiaTestImage, EfaInterfacePerNode: efaPerNode, MaxBytes: maxBytes, + NcclBuffSize: ncclBuffSize, }) if err != nil { t.Fatal(err) diff --git a/e2e2/test/images/nvidia/Dockerfile b/e2e2/test/images/nvidia/Dockerfile index eae6090c6..af59b5041 100644 --- a/e2e2/test/images/nvidia/Dockerfile +++ b/e2e2/test/images/nvidia/Dockerfile @@ -1,7 +1,7 @@ ARG UBUNTU_MAJOR_VERSION=22 ARG CUDA_MAJOR_VERSION=12 -ARG CUDA_MINOR_VERSION=5 +ARG CUDA_MINOR_VERSION=6 # Start with the NVIDIA CUDA base image FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.1-devel-ubuntu$UBUNTU_MAJOR_VERSION.04 @@ -41,7 +41,6 @@ RUN apt install -y \ cmake \ apt-utils \ libhwloc-dev \ - cuda-demo-suite-$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION \ datacenter-gpu-manager RUN mkdir -p /var/run/sshd \ @@ -64,11 +63,11 @@ RUN cd /tmp \ # Install NCCL RUN apt update \ && apt install -y \ - libnccl2=2.22.3-1+cuda12.5 \ - libnccl-dev=2.22.3-1+cuda12.5 + libnccl2=2.23.4-1+cuda12.6 \ + libnccl-dev=2.23.4-1+cuda12.6 # Install AWS-OFI-NCCL plugin -ARG AWS_OFI_NCCL_VERSION=1.11.0-aws +ARG AWS_OFI_NCCL_VERSION=1.12.1-aws RUN cd tmp \ && curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz \ && cd aws-ofi-nccl-$AWS_OFI_NCCL_VERSION \ @@ -82,7 +81,7 @@ RUN cd tmp \ && make install # Install NCCL Tests -ARG NCCL_TESTS_VERSION=2.13.10 +ARG NCCL_TESTS_VERSION=2.13.11 RUN cd /tmp \ && curl -sL https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v$NCCL_TESTS_VERSION.tar.gz | tar xvz \ && cd nccl-tests-$NCCL_TESTS_VERSION \