-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
29 changed files
with
1,094 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
28 changes: 28 additions & 0 deletions
28
e2e2/test/cases/nvidia/manifests/job-unit-test-single-node.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Kubernetes Job that runs the GPU unit-test script from the test image. | ||
kind: Job | ||
apiVersion: batch/v1 | ||
metadata: | ||
name: unit-test-job | ||
labels: | ||
app: unit-test-job | ||
spec: | ||
template: | ||
metadata: | ||
labels: | ||
app: unit-test-job | ||
spec: | ||
containers: | ||
- name: unit-test-container | ||
# Go-template placeholder; presumably substituted by the test harness before the manifest is applied. | ||
image: {{.NvidiaTestImage}} | ||
# The script path is relative, so it resolves against the image's working directory. | ||
command: | ||
- /bin/bash | ||
- ./gpu_unit_tests/unit_test | ||
imagePullPolicy: Always | ||
resources: | ||
limits: | ||
cpu: "4" | ||
memory: 4Gi | ||
requests: | ||
cpu: "1" | ||
memory: 1Gi | ||
# NOTE(review): no nvidia.com/gpu resource is requested here - confirm GPUs are exposed to the pod some other way. | ||
restartPolicy: Never | ||
backoffLimit: 4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
# Start with the NVIDIA CUDA base image (CUDA 12.5.0 "devel" flavour on Ubuntu 22.04) | ||
FROM nvidia/cuda:12.5.0-devel-ubuntu22.04 | ||
|
||
# Build arguments selecting the versions of the components installed below. | ||
# NOTE(review): "latest" is a moving target, so rebuilds are not reproducible - consider pinning. | ||
ARG EFA_INSTALLER_VERSION=latest | ||
# 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0 | ||
ARG AWS_OFI_NCCL_VERSION=1.9.1 | ||
# NOTE(review): "master" is a moving branch, so rebuilds are not reproducible - consider pinning a tag or commit. | ||
ARG NCCL_TESTS_VERSION=master | ||
|
||
# Refresh the apt index, then remove the libibverbs/mlx5 and NCCL packages | ||
# (held packages included); presumably so the EFA installer and the pinned | ||
# NCCL installed later provide these libraries instead - TODO confirm. | ||
RUN apt-get update -y \ | ||
    && apt-get remove -y --allow-change-held-packages \ | ||
        ibverbs-utils \ | ||
        libibverbs-dev \ | ||
        libibverbs1 \ | ||
        libmlx5-1 \ | ||
        libnccl-dev \ | ||
        libnccl2 | ||
|
||
# Delete the HPC-X, MPI and UCX directories and the HPC-X linker config, | ||
# then rebuild the dynamic linker cache so no stale entries remain. | ||
RUN rm -rf /opt/hpcx /usr/local/mpi /usr/local/ucx \ | ||
    && rm -f /etc/ld.so.conf.d/hpcx.conf \ | ||
    && ldconfig | ||
|
||
# Install build tooling, SSH client/server, debugging utilities, DCGM | ||
# (datacenter-gpu-manager) and the CUDA demo suite. | ||
# The apt index is refreshed in this same layer: relying on an index fetched in | ||
# an earlier, separately cached layer can leave the install pointing at stale | ||
# package versions. The lists are intentionally NOT removed here because a | ||
# later step still installs packages from them. | ||
# NOTE(review): --allow-unauthenticated disables signature verification; | ||
# confirm it is still needed and drop it if every repository is signed. | ||
RUN apt-get update -y \ | ||
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ | ||
        apt-utils \ | ||
        autoconf \ | ||
        automake \ | ||
        build-essential \ | ||
        check \ | ||
        cloud-utils \ | ||
        cmake \ | ||
        cuda-demo-suite-12-5 \ | ||
        curl \ | ||
        datacenter-gpu-manager \ | ||
        debhelper \ | ||
        devscripts \ | ||
        gcc \ | ||
        gdb \ | ||
        git \ | ||
        kmod \ | ||
        libhwloc-dev \ | ||
        libsubunit-dev \ | ||
        libtool \ | ||
        openssh-client \ | ||
        openssh-server \ | ||
        pkg-config \ | ||
        python3-distutils \ | ||
        sudo \ | ||
        vim \ | ||
        wget | ||
|
||
# Create the /var/run/sshd directory. | ||
RUN mkdir -p /var/run/sshd | ||
# Relax SSH checks inside the container: | ||
#  - ssh_config: rewrite any StrictHostKeyChecking line (commented or not) to "no" | ||
#  - ssh_config: append "UserKnownHostsFile /dev/null" so host keys are never stored | ||
#  - sshd_config: uncomment StrictModes and set it to "no" | ||
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ | ||
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ | ||
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config | ||
# Put the Amazon Open MPI, EFA, aws-ofi-nccl and CUDA libraries/binaries ahead | ||
# of whatever the base image already defines. Uses the key=value form; the | ||
# space-separated "ENV key value" form is deprecated. | ||
ENV LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH | ||
ENV PATH=/usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH | ||
|
||
# Install EFA | ||
# Download the installer tarball for EFA_INSTALLER_VERSION, run it, and clean up. | ||
# curl -f makes an HTTP error fail the build instead of saving an error page as | ||
# the tarball; the tarball is deleted in this same layer so it does not stay in | ||
# the image. | ||
RUN cd $HOME \ | ||
    && curl -fsSL -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ | ||
    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ | ||
    && cd aws-efa-installer \ | ||
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ | ||
    && rm -rf $HOME/aws-efa-installer $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | ||
|
||
# Install NCCL | ||
# Replace the old NVIDIA repo key with the cuda-keyring package, then install a | ||
# pinned NCCL runtime and dev package. | ||
#  - apt-get with -y: the build is non-interactive, so any confirmation prompt | ||
#    would abort it; apt-get is also the stable scripting interface (not apt). | ||
#  - no sudo: build steps already run as root. | ||
#  - --allow-change-held-packages: harmless if nothing is held, and avoids a | ||
#    failure under -y if these packages are still marked held after removal. | ||
#  - the downloaded .deb is removed in the same layer. | ||
# NOTE(review): the keyring is fetched from the ubuntu1804 repo path while the | ||
# base image is Ubuntu 22.04 - confirm this is intentional. | ||
RUN apt-key del 7fa2af80 \ | ||
    && curl -fsSL -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \ | ||
    && dpkg -i cuda-keyring_1.0-1_all.deb \ | ||
    && rm -f cuda-keyring_1.0-1_all.deb \ | ||
    && apt-get install -y --allow-change-held-packages \ | ||
        libnccl2=2.18.5-1+cuda12.2 \ | ||
        libnccl-dev=2.18.5-1+cuda12.2 | ||
|
||
## Install AWS-OFI-NCCL plugin | ||
# Clone the plugin, check out tag v${AWS_OFI_NCCL_VERSION}-aws, and build it | ||
# against the libfabric under /opt/amazon/efa, CUDA under /usr/local/cuda and | ||
# Open MPI under /opt/amazon/openmpi. Installs into /opt/aws-ofi-nccl/install. | ||
# OPAL_PREFIX is set to empty for this step only; presumably to stop Open MPI | ||
# from using an inherited prefix - TODO confirm. | ||
RUN export OPAL_PREFIX="" \ | ||
&& git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \ | ||
&& cd /opt/aws-ofi-nccl \ | ||
&& git checkout v${AWS_OFI_NCCL_VERSION}-aws \ | ||
&& ./autogen.sh \ | ||
&& ./configure --prefix=/opt/aws-ofi-nccl/install \ | ||
--with-libfabric=/opt/amazon/efa/ \ | ||
--with-cuda=/usr/local/cuda \ | ||
--with-mpi=/opt/amazon/openmpi/ \ | ||
&& make && make install | ||
|
||
# Install NCCL Tests | ||
# Clone nccl-tests into /opt/nccl-tests, check out ${NCCL_TESTS_VERSION}, and | ||
# build with MPI support (MPI=1) against the Open MPI under /opt/amazon/openmpi | ||
# and CUDA under /usr/local/cuda. The build output stays in the source tree. | ||
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ | ||
&& cd /opt/nccl-tests \ | ||
&& git checkout ${NCCL_TESTS_VERSION} \ | ||
&& make MPI=1 \ | ||
MPI_HOME=/opt/amazon/openmpi/ \ | ||
CUDA_HOME=/usr/local/cuda | ||
|
||
|
||
# Runtime NCCL settings. No CMD/ENTRYPOINT is defined in this section; the | ||
# command is expected to be supplied by whoever runs the container. | ||
# Uses the key=value form; the space-separated "ENV key value" form is deprecated. | ||
ENV NCCL_PROTO=simple | ||
# NOTE: this only hides the apt lists in the final filesystem; lists written by | ||
# earlier layers still count toward the image size. | ||
RUN rm -rf /var/lib/apt/lists/* | ||
# Preload the NCCL library installed from the apt package ahead of any other copy. | ||
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD | ||
|
||
# Copy the GPU unit-test suite into the image and make its entry script executable. | ||
# The destination is relative, so it resolves against the image's working | ||
# directory (no WORKDIR is set in the visible part of this file). | ||
COPY e2e2/test/images/nvidia/gpu_unit_tests ./gpu_unit_tests | ||
RUN chmod +x ./gpu_unit_tests/unit_test |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# What | ||
|
||
gpu_unit_tests is a set of unit tests for GPU-enabled platforms. The idea is to provide a compact | ||
set of tests that covers most of the performance-critical aspects of GPU | ||
platforms. The tests are designed to run on a single instance. | ||
# Usage | ||
|
||
``` | ||
# Run tests | ||
./unit_test | ||
``` | ||
|
||
**Generate test data for new instance type** | ||
|
||
Step 1: Copy the `gpu_unit_tests` folder to the EC2 instance where you want to generate the data. | ||
|
||
Step 2: Execute the following command in the `gpu_unit_tests` directory on the EC2 instance: | ||
``` | ||
GENERATE_DATA=1 ./unit_test | ||
``` | ||
Step 3: | ||
Copy the files from `tests/test_sysinfo.sh.data` (e.g., `tests/test_sysinfo.sh.data/p3.2xlarge`) to your local repository. | ||
|
||
Step 4: | ||
Create PR with the new `tests/test_sysinfo.sh.data/xxx` | ||
|
||
# Test list | ||
|
||
- test_sysinfo.sh :: Validate basic system configuration by comparing it with test config | ||
- test_numa_topo_topo :: check cpu/numa topology | ||
- test_nvidia_gpu_count :: fail if one of the GPUs is broken or is not visible | ||
- test_nvidia_fabric_status :: fail if fabric manager is not active | ||
- test_nvidia_smi_topo :: fail if the nvidia-smi topology differs | ||
- test_nvidia_persistence_status :: validate persistence state | ||
- test_nvidia_gpu_unused :: Check that no other processes are using the GPUs; a failure signals a system misconfiguration. | ||
|
||
|
||
- 10_test_basic_cuda.sh :: Execute trivial cuda binaries, fail if cuda subsys is not healthy | ||
Use demo-suite binaries https://docs.nvidia.com/cuda/demo-suite/index.html and DCGM Diagnostics https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests | ||
If this test suite fails, it is a sign that the CUDA subsystem is not usable at all. | ||
Usually this is a side effect of system misconfiguration (the driver or fabric manager is not loaded). | ||
- test_01_device_query | ||
- test_02_vector_add | ||
- test_03_bandwidth | ||
- test_04_bus_grind | ||
- test_05_dcgm_diagnostics | ||
|
||
|
Oops, something went wrong.