Add GPU unit test #456

Merged 1 commit on Jul 10, 2024
7 changes: 6 additions & 1 deletion .github/workflows/ci.yaml
@@ -25,4 +25,9 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - run: docker build --file e2e2/test/images/neuron/Dockerfile .
  build-nvidia:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - run: docker build --file e2e2/test/images/nvidia/Dockerfile .
28 changes: 28 additions & 0 deletions e2e2/test/cases/nvidia/manifests/job-unit-test-single-node.yaml
@@ -0,0 +1,28 @@
kind: Job
apiVersion: batch/v1
metadata:
  name: unit-test-job
  labels:
    app: unit-test-job
spec:
  template:
    metadata:
      labels:
        app: unit-test-job
    spec:
      containers:
        - name: unit-test-container
          image: {{.NvidiaTestImage}}
          command:
            - /bin/bash
            - ./gpu_unit_tests/unit_test
          imagePullPolicy: Always
          resources:
            limits:
              cpu: "4"
              memory: 4Gi
            requests:
              cpu: "1"
              memory: 1Gi
      restartPolicy: Never
  backoffLimit: 4
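
To run this Job by hand outside the test framework, the templated image field can be filled in before applying the manifest. A minimal sketch, where the image URI is a placeholder you supply:

```
# Substitute the templated image and submit the Job (image URI is a placeholder)
IMAGE="<your-nvidia-test-image>"
sed "s|{{.NvidiaTestImage}}|$IMAGE|" e2e2/test/cases/nvidia/manifests/job-unit-test-single-node.yaml | kubectl apply -f -
kubectl wait --for=condition=complete --timeout=15m job/unit-test-job
kubectl logs job/unit-test-job
```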
100 changes: 100 additions & 0 deletions e2e2/test/images/nvidia/Dockerfile
@@ -0,0 +1,100 @@
# Start with the NVIDIA CUDA base image
FROM nvidia/cuda:12.5.0-devel-ubuntu22.04

ARG EFA_INSTALLER_VERSION=latest
# 1.7.4+ is required to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
ARG AWS_OFI_NCCL_VERSION=1.9.1
ARG NCCL_TESTS_VERSION=master

# Install necessary dependencies
RUN apt-get update -y
RUN apt-get remove -y --allow-change-held-packages \
    libmlx5-1 \
    ibverbs-utils \
    libibverbs-dev \
    libibverbs1 \
    libnccl2 \
    libnccl-dev

RUN rm -rf /opt/hpcx \
    && rm -rf /usr/local/mpi \
    && rm -rf /usr/local/ucx \
    && rm -f /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig

RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    sudo \
    git \
    gcc \
    vim \
    kmod \
    openssh-client \
    openssh-server \
    build-essential \
    wget curl \
    autoconf \
    libtool \
    gdb \
    automake \
    python3-distutils \
    cmake \
    apt-utils \
    devscripts \
    debhelper \
    libsubunit-dev \
    check \
    pkg-config \
    libhwloc-dev \
    datacenter-gpu-manager \
    cloud-utils \
    cuda-demo-suite-12-5

RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
RUN cd $HOME \
    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && rm -rf $HOME/aws-efa-installer

# Install NCCL
RUN apt-key del 7fa2af80 \
    && curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
    && dpkg -i cuda-keyring_1.0-1_all.deb \
    && apt install -y libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2

# Install AWS-OFI-NCCL plugin
RUN export OPAL_PREFIX="" \
    && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
    && cd /opt/aws-ofi-nccl \
    && git checkout v${AWS_OFI_NCCL_VERSION}-aws \
    && ./autogen.sh \
    && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-libfabric=/opt/amazon/efa/ \
        --with-cuda=/usr/local/cuda \
        --with-mpi=/opt/amazon/openmpi/ \
    && make && make install

# Install NCCL Tests
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
    && cd /opt/nccl-tests \
    && git checkout ${NCCL_TESTS_VERSION} \
    && make MPI=1 \
        MPI_HOME=/opt/amazon/openmpi/ \
        CUDA_HOME=/usr/local/cuda


# Runtime environment defaults; adjust as per requirements
ENV NCCL_PROTO simple
RUN rm -rf /var/lib/apt/lists/*
ENV LD_PRELOAD /usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD

COPY e2e2/test/images/nvidia/gpu_unit_tests ./gpu_unit_tests
RUN chmod +x ./gpu_unit_tests/unit_test
48 changes: 48 additions & 0 deletions e2e2/test/images/nvidia/gpu_unit_tests/README.md
@@ -0,0 +1,48 @@
# What

gpu_unit_tests is a set of unit tests for GPU-enabled platforms. The idea is to provide a compact
set of tests that covers the most performance-critical aspects of GPU platforms. The tests are
designed to run on a single instance.
# Usage

```
# Run tests
./unit_test
```
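
The suite can also be run inside the container image built from `e2e2/test/images/nvidia/Dockerfile`. A minimal sketch, assuming the image is tagged `nvidia-unit-test` locally and the host has GPUs plus the NVIDIA Container Toolkit (run from the repository root):

```
# Build the test image and run the suite against all visible GPUs
docker build -t nvidia-unit-test -f e2e2/test/images/nvidia/Dockerfile .
docker run --rm --gpus all nvidia-unit-test /bin/bash ./gpu_unit_tests/unit_test
```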

**Generate test data for a new instance type**

Step 1: Copy the `gpu_unit_tests` folder to the EC2 instance where you want to generate the data.

Step 2: Execute the following command in the `gpu_unit_tests` directory on the EC2 instance:
```
GENERATE_DATA=1 ./unit_test
```
Step 3:
Copy the files from `tests/test_sysinfo.sh.data` (e.g., `tests/test_sysinfo.sh.data/p3.2xlarge`) to your local repository.
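
For example, if the data was generated on a remote EC2 instance, something like the following could copy it back into the repository checkout (hostname and source path are placeholders):

```
# Copy generated per-instance data back into the local clone (paths are illustrative)
scp -r ec2-user@<instance-host>:~/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge \
    e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/
```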

Step 4:
Create a PR with the new `tests/test_sysinfo.sh.data/xxx` directory.

# Test list

- test_sysinfo.sh :: Validate basic system configuration by comparing it with the stored test config
  - test_numa_topo_topo :: Check CPU/NUMA topology
  - test_nvidia_gpu_count :: Fail if one of the GPUs is broken or not visible (an illustrative sketch of this kind of check follows this list)
  - test_nvidia_fabric_status :: Fail if the fabric manager is not active
  - test_nvidia_smi_topo :: Fail if the nvidia-smi topology differs from the expected one
  - test_nvidia_persistence_status :: Validate the persistence mode state
  - test_nvidia_gpu_unused :: Check that no other processes are using the GPUs; a failure signals system misconfiguration


- 10_test_basic_cuda.sh :: Execute trivial CUDA binaries; fail if the CUDA subsystem is not healthy.
  Uses demo-suite binaries (https://docs.nvidia.com/cuda/demo-suite/index.html) and DCGM Diagnostics (https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests).
  If this test suite fails, the CUDA subsystem is not usable at all, usually as a side effect of system misconfiguration (the driver or fabric manager is not loaded).
  - test_01_device_query
  - test_02_vector_add
  - test_03_bandwidth
  - test_04_bus_grind
  - test_05_dcgm_diagnostics
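
As a rough illustration of the style of these checks (not the actual test code), a GPU-count assertion might look like the sketch below. `EXPECTED_GPU_COUNT` is a hypothetical stand-in for the value the real tests read from the per-instance data files:

```
#!/usr/bin/env bash
# Illustrative sketch only: compare the number of GPUs nvidia-smi reports
# against an expected count (EXPECTED_GPU_COUNT is a hypothetical stand-in
# for the per-instance test data).
set -euo pipefail

expected="${EXPECTED_GPU_COUNT:-8}"
actual="$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)"

if [ "$actual" -ne "$expected" ]; then
    echo "FAIL: expected $expected GPUs, nvidia-smi reports $actual" >&2
    exit 1
fi
echo "PASS: $actual GPUs visible"
```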

