diff --git a/e2e2/test/cases/nvidia/manifests/job-unit-test-single-node.yaml b/e2e2/test/cases/nvidia/manifests/job-unit-test-single-node.yaml
new file mode 100644
index 000000000..5bf9a8a5b
--- /dev/null
+++ b/e2e2/test/cases/nvidia/manifests/job-unit-test-single-node.yaml
@@ -0,0 +1,28 @@
+kind: Job
+apiVersion: batch/v1
+metadata:
+  name: unit-test-job
+  labels:
+    app: unit-test-job
+spec:
+  template:
+    metadata:
+      labels:
+        app: unit-test-job
+    spec:
+      containers:
+      - name: unit-test-container
+        image: public.ecr.aws/o5d5x8n6/weicongw:nvidia
+        command:
+        - /bin/bash
+        - ./gpu_unit_tests/unit_test
+        imagePullPolicy: Always
+        resources:
+          limits:
+            cpu: "4"
+            memory: 4Gi
+          requests:
+            cpu: "1"
+            memory: 1Gi
+      restartPolicy: Never
+  backoffLimit: 4
\ No newline at end of file
diff --git a/e2e2/test/images/nvidia/Dockerfile b/e2e2/test/images/nvidia/Dockerfile
new file mode 100644
index 000000000..7753c9f6b
--- /dev/null
+++ b/e2e2/test/images/nvidia/Dockerfile
@@ -0,0 +1,108 @@
+# GPU validation image: CUDA devel base plus EFA, NCCL, the aws-ofi-nccl plugin,
+# nccl-tests and the gpu_unit_tests suite.
+FROM nvidia/cuda:12.5.0-devel-ubuntu22.04
+
+# Versions can be overridden with --build-arg.
+ARG EFA_INSTALLER_VERSION=latest
+# 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
+ARG AWS_OFI_NCCL_VERSION=1.9.1
+ARG NCCL_TESTS_VERSION=master
+
+# Remove the verbs and NCCL packages before the EFA stack is installed; a pinned
+# NCCL is installed further down. `apt-get update` shares the layer with the
+# command that consumes the package lists so a cached update layer is never reused
+# with stale lists.
+RUN apt-get update -y \
+    && apt-get remove -y --allow-change-held-packages \
+    libmlx5-1 \
+    ibverbs-utils \
+    libibverbs-dev \
+    libibverbs1 \
+    libnccl2 \
+    libnccl-dev
+
+RUN rm -rf /opt/hpcx \
+    && rm -rf /usr/local/mpi \
+    && rm -rf /usr/local/ucx \
+    && rm -f /etc/ld.so.conf.d/hpcx.conf \
+    && ldconfig
+
+# Install necessary dependencies
+RUN apt-get update -y \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+    sudo \
+    git \
+    gcc \
+    vim \
+    kmod \
+    openssh-client \
+    openssh-server \
+    build-essential \
+    wget curl \
+    autoconf \
+    libtool \
+    gdb \
+    automake \
+    python3-distutils \
+    cmake \
+    apt-utils \
+    devscripts \
+    debhelper \
+    libsubunit-dev \
+    check \
+    pkg-config \
+    libhwloc-dev \
+    datacenter-gpu-manager \
+    cloud-utils \
+    cuda-demo-suite-12-5
+
+RUN mkdir -p /var/run/sshd
+RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
+    echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
+    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+ENV LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
+
+# Install EFA
+RUN cd $HOME \
+    && curl -fsSLO https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
+    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
+    && cd aws-efa-installer \
+    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
+    && rm -rf $HOME/aws-efa-installer
+
+# Install NCCL
+# No sudo (the other apt-get steps already run without it); -y keeps the step non-interactive.
+RUN apt-key del 7fa2af80 \
+    && curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
+    && dpkg -i cuda-keyring_1.0-1_all.deb \
+    && apt-get install -y libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2
+
+## Install AWS-OFI-NCCL plugin
+RUN export OPAL_PREFIX="" \
+    && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
+    && cd /opt/aws-ofi-nccl \
+    && git checkout v${AWS_OFI_NCCL_VERSION}-aws \
+    && ./autogen.sh \
+    && ./configure --prefix=/opt/aws-ofi-nccl/install \
+       --with-libfabric=/opt/amazon/efa/ \
+       --with-cuda=/usr/local/cuda \
+       --with-mpi=/opt/amazon/openmpi/ \
+    && make && make install
+
+# Install NCCL Tests
+RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
+    && cd /opt/nccl-tests \
+    && git checkout ${NCCL_TESTS_VERSION} \
+    && make MPI=1 \
+       MPI_HOME=/opt/amazon/openmpi/ \
+       CUDA_HOME=/usr/local/cuda
+
+
+# Runtime NCCL configuration
+ENV NCCL_PROTO=simple
+RUN rm -rf /var/lib/apt/lists/*
+ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD
+
+COPY e2e2/test/images/nvidia/gpu_unit_tests ./gpu_unit_tests
+RUN chmod +x ./gpu_unit_tests/unit_test
\ No newline at end of file
diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/README.md b/e2e2/test/images/nvidia/gpu_unit_tests/README.md
new file mode 100644
index 000000000..d40993549
--- /dev/null
+++ b/e2e2/test/images/nvidia/gpu_unit_tests/README.md
@@ -0,0 +1,48 @@
+# What
+
+gpu_unit_tests is a set of unit tests for GPU-enabled platforms. The idea is to create a compact
+set of tests that covers the most performance-critical aspects of GPU
+platforms. The tests are designed to run on a single instance.
+# Usage
+
+```
+# Run tests
+./unit_test
+```
+
+**Generate test data for new instance type**
+
+Step 1: Copy the `gpu_unit_tests` folder to the EC2 instance where you want to generate the data.
+
+Step 2: Execute the following command in the `gpu_unit_tests` directory on the EC2 instance:
+```
+GENERATE_DATA=1 ./unit_test
+```
+Step 3:
+Copy the files from `tests/test_sysinfo.sh.data` (e.g., `tests/test_sysinfo.sh.data/p3.2xlarge`) to your local repository.
+
+Step 4:
+Create PR with the new `tests/test_sysinfo.sh.data/xxx`
+
+# Test list
+
+- test_sysinfo.sh :: Validate basic system configuration by comparing it with test config
+  - test_numa_topo_topo :: check cpu/numa topology
+  - test_nvidia_gpu_count :: fail if one of the GPUs is broken or is not visible
+  - test_nvidia_smi_topo :: fail if the nvidia-smi topology differs
+  - test_nvidia_persistence_status :: validate persistence state
+  - test_nvidia_gpu_unused :: check that no other processes are using the GPUs; a failure signals system misconfiguration.
+  - test_nvidia_gpu_throttled :: fail if any GPU reports an active clock throttle reason other than idle
+
+
+- test_basic.sh :: Execute trivial CUDA binaries; fail if the CUDA subsystem is not healthy
+  Use demo-suite binaries https://docs.nvidia.com/cuda/demo-suite/index.html and DCGM Diagnostics https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests
+  If this test suite fails, it is a sign that the CUDA subsystem is not usable at all.
+  Usually this is a side effect of system misconfiguration (driver or fabric manager is not loaded)
+  - test_01_device_query
+  - test_02_vector_add
+  - test_03_bandwidth
+  - test_04_bus_grind
+  - test_05_dcgm_diagnostics
+
+
diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/bash_unit b/e2e2/test/images/nvidia/gpu_unit_tests/bash_unit
new file mode 100644
index 000000000..d341de2da
--- /dev/null
+++ b/e2e2/test/images/nvidia/gpu_unit_tests/bash_unit
@@ -0,0 +1,633 @@
+#!/usr/bin/env bash
+#
+# bash unit testing enterprise edition framework for professionals
+# Copyright (C) 2011-2016 Pascal Grange
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+#
+# https://github.com/pgrange/bash_unit
+
+VERSION=v2.1.0
+
+ESCAPE=$(printf "\033")
+NOCOLOR="${ESCAPE}[0m"
+RED="${ESCAPE}[91m"
+GREEN="${ESCAPE}[92m"
+YELLOW="${ESCAPE}[93m"
+BLUE="${ESCAPE}[94m"
+
+# Make bash_unit immune to some basic unix commands faking
+CAT="$(which cat)"
+SED="$(which sed)"
+GREP="$(which grep)"
+RM="$(which rm)"
+SHUF="$(which shuf)"
+
+# Report the current test as failed (optionally dumping captured stdout/stderr
+# files) together with a stack trace, then exit the current (sub)shell with status 1.
+fail() {
+  local message=${1:-}
+  local stdout=${2:-}
+  local stderr=${3:-}
+
+  notify_test_failed "$__bash_unit_current_test__" "$message"
+  [[ ! -z $stdout ]] && [ -s "$stdout" ] && notify_stdout < "$stdout"
+  [[ ! -z $stderr ]] && [ -s "$stderr" ] && notify_stderr < "$stderr"
+
+  stacktrace | notify_stack
+  exit 1
+}
+
+# Mark the current test as skipped. The reason is written to the marker file
+# that run_tests inspects, so the non-zero exit is not counted as a failure.
+skip() {
+  local message=${1:-}
+  notify_test_skipped "$__bash_unit_current_test__" "$message"
+  echo "skipped $message" > "$__bash_unit_test_skipped__"
+  exit 1
+}
+
+# Append a debug record (and optional captured stdout/stderr files) to the trace
+# file. $1 is the number of extra stack frames to skip when naming the caller.
+_notify_trace() {
+  local caller_shift=$1
+  local message=${2}
+  local stdout=${3:-}
+  local stderr=${4:-}
+
+  [ -z "$trace_file" ] && return
+
+  local caller_hdr=""
+  local cl=$((caller_shift + 2))
+
+  # Quoted on purpose: with an unquoted empty expansion `[ -n ]` is always true.
+  if [ -n "${BASH_SOURCE[$cl]:-}" ]
+  then
+    caller_hdr="${BASH_SOURCE[$cl]}:${BASH_LINENO[$((cl-1))]}"
+  fi
+  echo "trace:${caller_hdr}> $message" >> "$trace_file"
+  [[ ! -z $stdout ]] && [ -s "$stdout" ] && "$SED" 's:^:trace-out> :' < "$stdout" >> "$trace_file"
+  [[ ! -z $stderr ]] && [ -s "$stderr" ] && "$SED" 's:^:trace-err> :' < "$stderr" >> "$trace_file"
+}
+
+notify_trace_dbg() {
+  _notify_trace 0 "$1"
+}
+
+notify_trace_info() {
+  [ -z "$trace_file" ] && return
+
+  local message=${1:-}
+  echo "info> $message" >> "$trace_file"
+}
+
+# assert <command> [message]: fail unless <command> exits with status 0.
+assert() {
+  local assertion=$1
+  local message=${2:-}
+
+  _assert_expression \
+    "$assertion" \
+    "[ \$status == 0 ]" \
+    "\"$message\""
+}
+
+# assert_fails <command> [message]: fail unless <command> exits with a non-zero status.
+assert_fails() {
+  local assertion=$1
+  local message=${2:-}
+
+  _assert_expression \
+    "$assertion" \
+    "[ \$status != 0 ]" \
+    "\"$message\""
+}
+
+assert_fail() {
+  #deprecated, use assert_fails instead
+  assert_fails "$@"
+}
+
+# assert_status_code <status> <command> [message]: fail unless <command> exits with <status>.
+assert_status_code() {
+  local expected_status=$1
+  local assertion="$2"
+  local message="${3:-}"
+
+  _assert_expression \
+    "$assertion" \
+    "[ \$status == $expected_status ]" \
+    "\"$message\" expected status code $expected_status but was \$status"
+}
+
+# Run <assertion> in a subshell with its output captured to temp files, evaluate
+# <condition> against the exit status ($status) and fail with <message> if it is false.
+_assert_expression() {
+  local assertion=$1
+  local condition=$2
+  local message=$3
+  (
+    local stdout=$(mktemp)
+    local stderr=$(mktemp)
+    trap "$RM -f \"$stdout\" \"$stderr\"" EXIT
+
+    local status
+    eval "($assertion)" >"$stdout" 2>"$stderr" && status=$? || status=$?
+    _notify_trace 1 "assert_expression: exp: '$assertion', cond: '$condition', status: '$status'" "$stdout" "$stderr"
+
+    if ! eval "$condition"
+    then
+      fail "$(eval echo $message)" "$stdout" "$stderr"
+    fi
+  ) || exit $?
+}
+
+assert_equals() {
+  local expected=$1
+  local actual=$2
+  local message=${3:-}
+  [[ -z $message ]] || message="$message\n"
+
+  notify_trace_dbg "assert_equals '$expected' == '$actual'"
+  if [ "$expected" != "$actual" ]
+  then
+    fail "$message expected [$expected] but was [$actual]"
+  fi
+}
+
+assert_not_equals() {
+  local unexpected="$1"
+  local actual="$2"
+  local message=${3:-}
+  [[ -z $message ]] || message="$message\n"
+
+  notify_trace_dbg "assert_not_equals: '$unexpected' != '$actual'"
+  [ "$unexpected" != "$actual" ] || \
+    fail "$message expected different value than [$unexpected] but was the same"
+}
+
+assert_matches() {
+  local expected=$1
+  local actual=$2
+  local message=${3:-}
+  [[ -z $message ]] || message="$message\n"
+
+  notify_trace_dbg "assert_matches: '$actual' =~ '$expected'"
+  if [[ ! "${actual}" =~ ${expected} ]]; then
+    fail "$message expected regex [$expected] to match [$actual]"
+  fi
+}
+
+assert_not_matches() {
+  local unexpected=$1
+  local actual=$2
+  local message=${3:-}
+  [[ -z $message ]] || message="$message\n"
+
+  _notify_trace 0 "assert_not_matches: ! '$actual' =~ '$unexpected'"
+  if [[ "${actual}" =~ ${unexpected} ]]; then
+    fail "$message expected regex [$unexpected] should not match but matched [$actual]"
+  fi
+}
+
+# assert_within_delta <expected> <actual> <max_delta> [message]: integer comparison with tolerance.
+assert_within_delta() {
+  function abs() {
+    local value=$1
+    local sign=$(( value < 0 ? -1 : 1 ))
+    echo $((value * sign))
+  }
+  function is_number() {
+    local value=$1
+    test $value -eq $value 2>/dev/null
+  }
+  local expected=$1
+  local actual=$2
+  local max_delta=$3
+  # The message has to be set before the sanity checks below interpolate it.
+  local message=${4:-}
+  [[ -z $message ]] || message="$message\n"
+  assert "is_number $expected" "$message expected value [$expected] is not a number"
+  assert "is_number $actual" "$message actual value [$actual] is not a number"
+  assert "is_number $max_delta" "$message max_delta [$max_delta] is not a number"
+
+  local actual_delta="$(abs $(($expected - $actual)))"
+
+  if (( $actual_delta > $max_delta )); then
+    fail "$message expected value [$expected] to match [$actual] with a maximum delta of [$max_delta]"
+  fi
+}
+
+assert_no_diff() {
+  local expected=$1
+  local actual=$2
+  local message=${3:-}
+  [[ -z $message ]] || message="$message\n"
+
+  assert 'diff '"${expected}"' '"${actual}" \
+    "$message expected '"${actual}"' to be identical to '"${expected}"' but was different"
+}
+
+fake() {
+  local command=$1
+  shift
+  if [ $# -gt 0 ]
+  then
+    eval "function $command() { export FAKE_PARAMS=(\"\$@\") ; $@ ; }"
+  else
+    eval "function $command() { echo \"$($CAT)\" ; }"
+  fi
+  export -f $command
+}
+
+stacktrace() {
+  local i=1
+  while ! [ -z "${BASH_SOURCE[$i]:-}" ]
+  do
+    echo ${BASH_SOURCE[$i]}:${BASH_LINENO[$((i-1))]}:${FUNCNAME[$i]}\(\)
+    i=$((i + 1))
+  done | "$GREP" -v "^$BASH_SOURCE"
+}
+
+run_test_suite() {
+  local failure=0
+
+  if run_setup_suite
+  then
+    run_tests || failure=$?
+  else
+    failure=$?
+  fi
+  run_teardown_suite
+
+  return $failure
+}
+
+run_setup_suite() {
+  if declare -F | "$GREP" ' setup_suite$' >/dev/null
+  then
+    setup_suite
+  fi
+}
+
+maybe_shuffle() {
+  ((randomise)) && $SHUF || $CAT
+}
+
+# Report pending/todo functions, then run every test* function matching
+# $test_pattern in its own subshell (setup, test, teardown). Returns non-zero if any test failed.
+run_tests() {
+  local failure=0
+
+  for pending_test in $(set | "$GREP" -E '^(pending|todo).* \(\)' | "$GREP" -E "$test_pattern" | "$SED" -e 's: .*::')
+  do
+    notify_test_starting "$pending_test"
+    notify_test_pending "$pending_test"
+  done
+
+
+  for test in $(set | "$GREP" -E '^test.* \(\)' | "$GREP" -E "$test_pattern" | "$SED" -e 's: .*::' | maybe_shuffle)
+  do
+    (
+      local status=0
+      declare -F | "$GREP" ' setup$' >/dev/null && setup
+      __bash_unit_test_skipped__=$(mktemp)
+      # Remove the skip marker created above when this per-test subshell exits.
+      trap "$RM -f \"$__bash_unit_test_skipped__\"" EXIT
+      (__bash_unit_current_test__="$test" run_test) || status=$?
+      test -s "$__bash_unit_test_skipped__" && status=0
+      declare -F | "$GREP" ' teardown$' >/dev/null && teardown
+      exit $status
+    )
+    failure=$(( $? || failure))
+  done
+  return $failure
+}
+
+run_test() {
+  set -e
+  notify_test_starting "$__bash_unit_current_test__"
+  "$__bash_unit_current_test__" && notify_test_succeeded "$__bash_unit_current_test__"
+}
+
+run_teardown_suite() {
+  if declare -F | "$GREP" ' teardown_suite$' >/dev/null
+  then
+    teardown_suite
+  fi
+}
+
+usage() {
+  echo "$1" >&2
+  echo "$0 [-f <output format>] [-p <pattern1>] [-p <pattern2>] [-r] [-t <trace file>] ... <file1> <file2> ..." >&2
+  echo >&2
+  echo "Runs tests in test files that match <pattern>s" >&2
+  echo "<output format> is optional only supported value is tap" >&2
+  echo "-r to execute test cases in random order" >&2
+  echo "-t to write a detailed trace to <trace file>" >&2
+  echo "-v to get current version information" >&2
+  echo "See https://github.com/pgrange/bash_unit" >&2
+  exit 1
+}
+
+# Formating
+
+pretty_success() {
+  pretty_format "$GREEN" "\u2713" "${1:-}"
+}
+
+pretty_warning() {
+  pretty_format "$YELLOW" "\u2717" "$1"
+}
+
+pretty_failure() {
+  pretty_format "$RED" "\u2717" "${1:-}"
+}
+
+pretty_format() {
+  local color="$1"
+  local pretty_symbol="$2"
+  local alt_symbol="${3:-}"
+  local term_utf8=false
+#env
+  if is_terminal && [[ "${LANG:-}" =~ .*UTF-8.* ]]
+  then
+    term_utf8=true
+  fi
+  (
+    $CAT
+    if $term_utf8
+    then
+      echo -en " $pretty_symbol "
+    else
+      [[ ! -z "$alt_symbol" ]] && echo -en " $alt_symbol "
+    fi
+  ) | color "$color"
+}
+
+color() {
+  _start_color() {
+    if is_terminal ; then echo -en "$color" ; fi
+  }
+  _stop_color() {
+    if is_terminal ; then echo -en "$NOCOLOR" ; fi
+  }
+  local color=$1
+  shift
+  _start_color
+  if [ $# -gt 0 ]
+  then
+    echo $*
+  else
+    $CAT
+  fi
+  _stop_color
+}
+
+is_terminal() {
+  [ -t 1 ] || [[ "${FORCE_COLOR:-}" == true ]]
+}
+
+trace_suite_starting() {
+  local test_file="$1"
+  notify_trace_info "Running tests in $test_file"
+}
+trace_test_starting() {
+  local test="$1"
+  notify_trace_info "Running $test"
+}
+trace_test_pending() {
+  local test="$1"
+  notify_trace_info "Pending $test"
+}
+
+trace_test_skipped() {
+  local test="$1"
+  local message="$2"
+  notify_trace_info "Skip $test message: $message"
+}
+
+trace_test_succeeded() {
+  local test="$1"
+  notify_trace_info "Success $test"
+}
+trace_test_failed() {
+  local test="$1"
+  local message="$2"
+  notify_trace_info "$test with message: $message"
+}
+trace_suites_succeded() {
+  notify_trace_info "Overall result: SUCCESS"
+}
+trace_suites_failed() {
+  notify_trace_info "Overall result: FAILURE"
+}
+
+# Define the notify_* reporters for the default human-readable output.
+text_format() {
+  notify_suite_starting() {
+    local test_file="$1"
+    trace_suite_starting "$test_file"
+    echo "Running tests in $test_file"
+  }
+  notify_test_starting() {
+    local test="$1"
+    trace_test_starting "$test"
+    echo -e -n "\tRunning $test ... " | color "$BLUE"
+  }
+  notify_test_pending() {
+    local test="$1"
+    trace_test_pending "$test"
+    echo -n "PENDING" | pretty_warning
+    echo
+  }
+  notify_test_skipped() {
+    local test="$1"
+    local message="$2"
+    trace_test_skipped "$test" "$message"
+    echo -n "SKIPPED" | pretty_warning
+    [[ -z $message ]] || printf -- "$message\n"
+    echo
+  }
+
+  notify_test_succeeded() {
+    local test="$1"
+    trace_test_succeeded "$test"
+    echo -n "SUCCESS" | pretty_success
+    echo
+  }
+  notify_test_failed() {
+    local test="$1"
+    local message="$2"
+    trace_test_failed "$test" "$message"
+    echo -n "FAILURE" | pretty_failure
+    echo
+    [[ -z $message ]] || printf -- "$message\n"
+  }
+  notify_stdout() {
+    "$SED" 's:^:out> :' | color "$GREEN"
+  }
+  notify_stderr() {
+    "$SED" 's:^:err> :' | color "$RED"
+  }
+  notify_stack() {
+    color "$YELLOW"
+  }
+  notify_suites_succeded() {
+    trace_suites_succeded
+    echo -n "Overall result: SUCCESS" | pretty_success
+    echo
+  }
+  notify_suites_failed() {
+    trace_suites_failed
+    echo -n "Overall result: FAILURE" | pretty_failure
+    echo
+  }
+}
+
+# Define the notify_* reporters for TAP output (selected with -f tap).
+tap_format() {
+  notify_suite_starting() {
+    local test_file="$1"
+    trace_suite_starting "$test_file"
+    echo "# Running tests in $test_file"
+  }
+  notify_test_starting() {
+    trace_test_starting "$1"
+  }
+  notify_test_pending() {
+    local test="$1"
+    trace_test_pending "$test"
+    echo -n "ok" | pretty_warning -
+    echo -n "$test" | color "$BLUE"
+    echo " # skip test to be written" | color "$YELLOW"
+  }
+  notify_test_skipped() {
+    local test="$1"
+    local message="$2"
+    trace_test_skipped "$test" "$message"
+    echo -n "ok" | pretty_warning -
+    echo -n "$test" | color "$BLUE"
+    echo " # skip ${message}" | color "$YELLOW"
+  }
+
+  notify_test_succeeded() {
+    local test="$1"
+    trace_test_succeeded "$test"
+    echo -n "ok" | pretty_success -
+    echo "$test" | color "$BLUE"
+  }
+  notify_test_failed() {
+    local test="$1"
+    local message="$2"
+    trace_test_failed "$test" "$message"
+    echo -n "not ok" | pretty_failure -
+    echo "$test" | color "$BLUE"
+    [[ -z $message ]] || printf -- "$message\n" | "$SED" -u -e 's/^/# /'
+  }
+  notify_stdout() {
+    "$SED" 's:^:# out> :' | color "$GREEN"
+  }
+  notify_stderr() {
+    "$SED" 's:^:# err> :' | color "$RED"
+  }
+  notify_stack() {
+    "$SED" 's:^:# :' | color "$YELLOW"
+  }
+  notify_suites_succeded() {
+    trace_suites_succeded
+  }
+  notify_suites_failed() {
+    trace_suites_failed
+  }
+}
+
+output_format=text
+test_pattern=""
+trace_file=""
+separator=""
+randomise=0
+while getopts "vp:t:f:r" option
+do
+  case "$option" in
+    p)
+      test_pattern="${test_pattern}${separator}${OPTARG}"
+      separator="|"
+      ;;
+    t)
+      trace_file="$(realpath "${OPTARG}")"
+      truncate -s0 "$trace_file"
+      ;;
+    f)
+      output_format="${OPTARG}"
+      ;;
+    r)
+      randomise=1
+      ;;
+    v)
+      echo "bash_unit $VERSION"
+      exit
+      ;;
+    ?|:)
+      usage
+      ;;
+  esac
+done
+shift $((OPTIND-1))
+
+for test_file in "$@"
+do
+  test -e "$test_file" || usage "file does not exist: $test_file"
+  test -r "$test_file" || usage "can not read file: $test_file"
+done
+
+case "$output_format" in
+  text)
+    text_format
+    ;;
+  tap)
+    tap_format
+    ;;
+  *)
+    usage "unsupported output format: $output_format"
+    ;;
+esac
+
+#run tests received as parameters
+failure=0
+for test_file in "$@"
+do
+  notify_suite_starting "$test_file"
+  (
+    set -e # Ensure bash_unit will exit with failure
+           # in case of syntax error.
+    if [[ "${STICK_TO_CWD}" != true ]]
+    then
+      cd "$(dirname "$test_file")"
+      source "$(basename "$test_file")"
+    else
+      source "$test_file"
+    fi
+    set +e
+    run_test_suite
+  )
+  failure=$(( $? || failure))
+done
+
+if ((failure))
+then
+  notify_suites_failed
+else
+  notify_suites_succeded
+fi
+
+exit $failure
diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/common.sh b/e2e2/test/images/nvidia/gpu_unit_tests/tests/common.sh
new file mode 100644
index 000000000..5572dcdb9
--- /dev/null
+++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/common.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Shared helpers for the gpu_unit_tests suites; sourced from each suite's setup_suite.
+
+# Print the EC2 instance type. FORCE_INSTANCE_TYPE, when set, overrides the
+# IMDSv2 lookup so the suite can be pointed at a specific data directory.
+get_instance_type()
+{
+    if [ -n "$FORCE_INSTANCE_TYPE" ]
+    then
+        echo "$FORCE_INSTANCE_TYPE"
+        # Return here: falling through would append the IMDS answer to the override.
+        return
+    fi
+    # Retrieve instance metadata: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html#instance-metadata-retrieval-examples
+    # -s keeps the progress meter out of the test log; -f turns HTTP errors into
+    # a non-zero exit instead of printing the error body as the instance type.
+    local token
+    token=$(curl -sSf -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
+    curl -sSf -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-type
+}
+
+# Fail when nvidia-smi reports any compute process on any GPU.
+assert_gpu_unused()
+{
+    local cmd="nvidia-smi --query-compute-apps timestamp,gpu_bus_id,gpu_uuid,pid,name,used_memory --format csv,noheader"
+    assert_equals "" "$($cmd)" "gpu is busy by other task, system misconfig?"
+}
+
+# _assert_data <expected file> <command> [message]
+# Run <command>, store its stdout under $ACTUAL_RESULTS and fail when it differs
+# from <expected file> or when the comparison itself cannot be performed.
+_assert_data()
+{
+    local expected="$1"
+    local cmd="$2"
+    local message="${3:-}"
+    local cmd_out="$ACTUAL_RESULTS/$(basename "$expected")"
+    local diff_out diff_status=0
+    [[ -z $message ]] || message="$message\n"
+
+    eval "$cmd" > "$cmd_out"
+    # diff exits 1 on differences and 2 on trouble (e.g. no reference data for
+    # this instance type). Checking the status, with stderr captured, makes a
+    # missing reference file a failure instead of a silent pass.
+    diff_out="$(diff -up "$expected" "$cmd_out" 2>&1)" || diff_status=$?
+
+    notify_trace_dbg "_assert_data diff -up $expected $cmd_out, out: $diff_out"
+    if [ "$diff_status" -ne 0 ] || [ -n "$diff_out" ]
+    then
+        fail "$message test data value diff:\n$diff_out"
+    fi
+}
+
+# Compare a command's output with reference data. test_sysinfo.sh redefines
+# this function to call generate_data when GENERATE_DATA is set.
+assert_data() {
+    _assert_data "$1" "$2" "$3"
+}
+
+# Record <command>'s output as the new reference data, then run the normal comparison.
+generate_data()
+{
+    local expected="$1"
+    local cmd="$2"
+    local msg="$3"
+
+    eval "$cmd" > "$expected"
+    _assert_data "$expected" "$cmd" "$msg"
+}
\ No newline at end of file
diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh
new file mode 100644
index 000000000..db982df5a
--- /dev/null
+++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh
@@ -0,0 +1,41 @@
+# Trivial CUDA tests to validate that the GPU is functional
+# Use demo-suite binaries https://docs.nvidia.com/cuda/demo-suite/index.html
+# and DCGM Diagnostics https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests
+
+setup_suite()
+{
+    source common.sh
+    assert_gpu_unused
+    DEMU_SUITE_DIR=${DEMU_SUITE_DIR:-$(realpath /usr/local/cuda/extras/demo_suite)}
+}
+
+teardown_suite()
+{
+    assert_gpu_unused
+}
+
+test_01_device_query()
+{
+    assert_status_code 0 "$DEMU_SUITE_DIR/deviceQuery"
+}
+
+test_02_vector_add()
+{
+    assert_status_code 0 "$DEMU_SUITE_DIR/vectorAdd"
+}
+
+test_03_bandwidth()
+{
+    assert_status_code 0 "$DEMU_SUITE_DIR/bandwidthTest --device=all --csv"
+}
+
+test_04_bus_grind()
+{
+    assert_status_code 0 "$DEMU_SUITE_DIR/busGrind -a"
+}
+
+test_05_dcgm_diagnostics()
+{
+    # https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests
+    assert_status_code 0 "dcgmi diag -r 2"
+}
\ No newline at end
of file
diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh
new file mode 100644
index 000000000..bd0b1b278
--- /dev/null
+++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh
@@ -0,0 +1,76 @@
+# Validate basic system configuration by comparing with expected config
+#
+# Reference data lives in test_sysinfo.sh.data/<instance type>/; run with
+# GENERATE_DATA=1 to (re)create it for the current instance type.
+setup_suite()
+{
+    source common.sh
+
+    EC2_INSTANCE_TYPE=$(get_instance_type)
+    assert_not_equals "" "$EC2_INSTANCE_TYPE" "Unable to determine the EC2 instance type"
+    data=test_sysinfo.sh.data/$EC2_INSTANCE_TYPE
+    ACTUAL_RESULTS=$(mktemp -t -d test_sysinfo.sh.actual-data.XXX)
+    assert_not_equals "" "$ACTUAL_RESULTS"
+    notify_trace_info "ACTUAL_RESULTS: $ACTUAL_RESULTS"
+
+    if [ -n "$GENERATE_DATA" ]
+    then
+        echo "GENERATE_DATA is enabled..."
+        mkdir -p "$data"
+        function assert_data() {
+            generate_data "$@"
+        }
+    else
+        # Fail fast with an actionable message when this instance type has no reference data.
+        assert "test -d '$data'" "No reference data in [$data], generate it with GENERATE_DATA=1"
+    fi
+}
+
+teardown_suite()
+{
+    assert "test -z \"$GENERATE_DATA\"" "GENERATE_DATA was enabled, fail full suite"
+    assert_gpu_unused
+}
+
+
+test_numa_topo_topo()
+{
+    assert_data $data/numa_topo.txt "grep . /sys/devices/system/node/node*/{cpulist,distance}" "Unexpected cpu topology"
+}
+
+test_nvidia_gpu_count()
+{
+    # Just for logging purposes
+    assert_status_code 0 "nvidia-smi -q"
+    assert_data $data/gpu_count.txt "nvidia-smi --query-gpu=name,index,pci.bus_id --format csv" "Unexpected gpu count"
+}
+
+
+test_nvidia_smi_topo()
+{
+    assert_data $data/nvidia_smi_topo.txt "nvidia-smi topo -m | grep GPU | cut -f 1-11" \
+                "Unexpected gpu topology, likely broken nvlinks"
+}
+
+
+test_nvidia_persistence_status()
+{
+    assert_data $data/nvidia_persistence_status.txt "nvidia-smi --query-gpu=name,pci.bus_id,persistence_mode --format=csv" \
+                "Unexpected perfistance status, likely system configuration issue"
+}
+
+test_nvidia_gpu_unused()
+{
+    assert_gpu_unused
+}
+
+test_nvidia_gpu_throttled()
+{
+
+    # https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons
+    # The only bit allowed is nvmlClocksEventReasonGpuIdle 0x0000000000000001LL
+    # grep -v exits 1 when every GPU line carries an allowed value, hence the expected status 1.
+    local filter="grep -E -v -e '(0x0000000000000000|0x0000000000000001)'"
+    local cmd="nvidia-smi --query-gpu index,gpu_bus_id,gpu_uuid,clocks_throttle_reasons.active --format=csv,noheader"
+    assert_status_code 1 "$cmd | $filter" "Throttled gpu detected, possible reason https://tt.amazon.com/P115211285"
+}
diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/gpu_count.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/gpu_count.txt
new file mode 100644
index 000000000..bd419e4f3
--- /dev/null
+++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/gpu_count.txt
@@ -0,0 +1,9 @@
+name, index, pci.bus_id
+NVIDIA A10G, 0, 00000000:00:16.0
+NVIDIA A10G, 1, 00000000:00:17.0
+NVIDIA A10G, 2, 00000000:00:18.0
+NVIDIA A10G, 3, 00000000:00:19.0
+NVIDIA A10G, 4, 00000000:00:1A.0
+NVIDIA A10G, 5, 00000000:00:1B.0
+NVIDIA A10G, 6, 00000000:00:1C.0
+NVIDIA A10G, 7, 00000000:00:1D.0
diff --git
a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/numa_topo.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/numa_topo.txt new file mode 100644 index 000000000..389792102 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/numa_topo.txt @@ -0,0 +1,4 @@ +/sys/devices/system/node/node0/cpulist:0-47,96-143 +/sys/devices/system/node/node1/cpulist:48-95,144-191 +/sys/devices/system/node/node0/distance:10 32 +/sys/devices/system/node/node1/distance:32 10 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_persistence_status.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_persistence_status.txt new file mode 100644 index 000000000..1d4dc2737 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_persistence_status.txt @@ -0,0 +1,9 @@ +name, pci.bus_id, persistence_mode +NVIDIA A10G, 00000000:00:16.0, Enabled +NVIDIA A10G, 00000000:00:17.0, Enabled +NVIDIA A10G, 00000000:00:18.0, Enabled +NVIDIA A10G, 00000000:00:19.0, Enabled +NVIDIA A10G, 00000000:00:1A.0, Enabled +NVIDIA A10G, 00000000:00:1B.0, Enabled +NVIDIA A10G, 00000000:00:1C.0, Enabled +NVIDIA A10G, 00000000:00:1D.0, Enabled diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_smi_topo.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_smi_topo.txt new file mode 100644 index 000000000..b2d8effc8 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_smi_topo.txt @@ -0,0 +1,9 @@ + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity +GPU0 X PHB PHB PHB PHB PHB PHB PHB 0-191 0-1 +GPU1 PHB X PHB PHB PHB PHB PHB PHB 0-191 0-1 +GPU2 PHB PHB X PHB PHB PHB PHB PHB 0-191 0-1 +GPU3 PHB PHB PHB X PHB PHB PHB PHB 0-191 0-1 +GPU4 PHB 
PHB PHB PHB X PHB PHB PHB 0-191 0-1 +GPU5 PHB PHB PHB PHB PHB X PHB PHB 0-191 0-1 +GPU6 PHB PHB PHB PHB PHB PHB X PHB 0-191 0-1 +GPU7 PHB PHB PHB PHB PHB PHB PHB X 0-191 0-1 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/gpu_count.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/gpu_count.txt new file mode 100644 index 000000000..f7fdf0cfa --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/gpu_count.txt @@ -0,0 +1,2 @@ +name, index, pci.bus_id +Tesla V100-SXM2-16GB, 0, 00000000:00:1E.0 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/numa_topo.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/numa_topo.txt new file mode 100644 index 000000000..d72d887a6 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/numa_topo.txt @@ -0,0 +1,2 @@ +/sys/devices/system/node/node0/cpulist:0-7 +/sys/devices/system/node/node0/distance:10 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_persistence_status.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_persistence_status.txt new file mode 100644 index 000000000..52d4962a6 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_persistence_status.txt @@ -0,0 +1,2 @@ +name, pci.bus_id, persistence_mode +Tesla V100-SXM2-16GB, 00000000:00:1E.0, Enabled diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_smi_topo.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_smi_topo.txt new file mode 100644 index 000000000..e0cfd1955 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_smi_topo.txt @@ -0,0 +1,2 @@ + GPU0 CPU Affinity NUMA 
Affinity GPU NUMA ID +GPU0 X 0-7 0 N/A diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/gpu_count.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/gpu_count.txt new file mode 100644 index 000000000..cea7a3958 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/gpu_count.txt @@ -0,0 +1,9 @@ +name, index, pci.bus_id +NVIDIA A100-SXM4-40GB, 0, 00000000:10:1C.0 +NVIDIA A100-SXM4-40GB, 1, 00000000:10:1D.0 +NVIDIA A100-SXM4-40GB, 2, 00000000:20:1C.0 +NVIDIA A100-SXM4-40GB, 3, 00000000:20:1D.0 +NVIDIA A100-SXM4-40GB, 4, 00000000:90:1C.0 +NVIDIA A100-SXM4-40GB, 5, 00000000:90:1D.0 +NVIDIA A100-SXM4-40GB, 6, 00000000:A0:1C.0 +NVIDIA A100-SXM4-40GB, 7, 00000000:A0:1D.0 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/numa_topo.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/numa_topo.txt new file mode 100644 index 000000000..f1d33d089 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/numa_topo.txt @@ -0,0 +1,4 @@ +/sys/devices/system/node/node0/cpulist:0-23,48-71 +/sys/devices/system/node/node1/cpulist:24-47,72-95 +/sys/devices/system/node/node0/distance:10 21 +/sys/devices/system/node/node1/distance:21 10 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_persistence_status.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_persistence_status.txt new file mode 100644 index 000000000..9269616cf --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_persistence_status.txt @@ -0,0 +1,9 @@ +name, pci.bus_id, persistence_mode +NVIDIA A100-SXM4-40GB, 00000000:10:1C.0, Enabled +NVIDIA A100-SXM4-40GB, 00000000:10:1D.0, Enabled +NVIDIA A100-SXM4-40GB, 00000000:20:1C.0, Enabled +NVIDIA 
A100-SXM4-40GB, 00000000:20:1D.0, Enabled +NVIDIA A100-SXM4-40GB, 00000000:90:1C.0, Enabled +NVIDIA A100-SXM4-40GB, 00000000:90:1D.0, Enabled +NVIDIA A100-SXM4-40GB, 00000000:A0:1C.0, Enabled +NVIDIA A100-SXM4-40GB, 00000000:A0:1D.0, Enabled diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_smi_topo.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_smi_topo.txt new file mode 100644 index 000000000..20f8cee5b --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_smi_topo.txt @@ -0,0 +1,9 @@ + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity +GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 +GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 +GPU2 NV12 NV12 X NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 +GPU3 NV12 NV12 NV12 X NV12 NV12 NV12 NV12 0-23,48-71 0 +GPU4 NV12 NV12 NV12 NV12 X NV12 NV12 NV12 24-47,72-95 1 +GPU5 NV12 NV12 NV12 NV12 NV12 X NV12 NV12 24-47,72-95 1 +GPU6 NV12 NV12 NV12 NV12 NV12 NV12 X NV12 24-47,72-95 1 +GPU7 NV12 NV12 NV12 NV12 NV12 NV12 NV12 X 24-47,72-95 1 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/gpu_count.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/gpu_count.txt new file mode 100644 index 000000000..23568aba7 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/gpu_count.txt @@ -0,0 +1,9 @@ +name, index, pci.bus_id +NVIDIA A100-SXM4-80GB, 0, 00000000:10:1C.0 +NVIDIA A100-SXM4-80GB, 1, 00000000:10:1D.0 +NVIDIA A100-SXM4-80GB, 2, 00000000:20:1C.0 +NVIDIA A100-SXM4-80GB, 3, 00000000:20:1D.0 +NVIDIA A100-SXM4-80GB, 4, 00000000:90:1C.0 +NVIDIA A100-SXM4-80GB, 5, 00000000:90:1D.0 +NVIDIA A100-SXM4-80GB, 6, 00000000:A0:1C.0 +NVIDIA A100-SXM4-80GB, 7, 00000000:A0:1D.0 diff --git 
a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/numa_topo.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/numa_topo.txt new file mode 100644 index 000000000..f1d33d089 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/numa_topo.txt @@ -0,0 +1,4 @@ +/sys/devices/system/node/node0/cpulist:0-23,48-71 +/sys/devices/system/node/node1/cpulist:24-47,72-95 +/sys/devices/system/node/node0/distance:10 21 +/sys/devices/system/node/node1/distance:21 10 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_persistence_status.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_persistence_status.txt new file mode 100644 index 000000000..4b0cbfc52 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_persistence_status.txt @@ -0,0 +1,9 @@ +name, pci.bus_id, persistence_mode +NVIDIA A100-SXM4-80GB, 00000000:10:1C.0, Enabled +NVIDIA A100-SXM4-80GB, 00000000:10:1D.0, Enabled +NVIDIA A100-SXM4-80GB, 00000000:20:1C.0, Enabled +NVIDIA A100-SXM4-80GB, 00000000:20:1D.0, Enabled +NVIDIA A100-SXM4-80GB, 00000000:90:1C.0, Enabled +NVIDIA A100-SXM4-80GB, 00000000:90:1D.0, Enabled +NVIDIA A100-SXM4-80GB, 00000000:A0:1C.0, Enabled +NVIDIA A100-SXM4-80GB, 00000000:A0:1D.0, Enabled diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_smi_topo.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_smi_topo.txt new file mode 100644 index 000000000..20f8cee5b --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_smi_topo.txt @@ -0,0 +1,9 @@ + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity +GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 +GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 
0-23,48-71 0 +GPU2 NV12 NV12 X NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 +GPU3 NV12 NV12 NV12 X NV12 NV12 NV12 NV12 0-23,48-71 0 +GPU4 NV12 NV12 NV12 NV12 X NV12 NV12 NV12 24-47,72-95 1 +GPU5 NV12 NV12 NV12 NV12 NV12 X NV12 NV12 24-47,72-95 1 +GPU6 NV12 NV12 NV12 NV12 NV12 NV12 X NV12 24-47,72-95 1 +GPU7 NV12 NV12 NV12 NV12 NV12 NV12 NV12 X 24-47,72-95 1 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/gpu_count.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/gpu_count.txt new file mode 100644 index 000000000..474e15281 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/gpu_count.txt @@ -0,0 +1,9 @@ +name, index, pci.bus_id +NVIDIA H100 80GB HBM3, 0, 00000000:53:00.0 +NVIDIA H100 80GB HBM3, 1, 00000000:64:00.0 +NVIDIA H100 80GB HBM3, 2, 00000000:75:00.0 +NVIDIA H100 80GB HBM3, 3, 00000000:86:00.0 +NVIDIA H100 80GB HBM3, 4, 00000000:97:00.0 +NVIDIA H100 80GB HBM3, 5, 00000000:A8:00.0 +NVIDIA H100 80GB HBM3, 6, 00000000:B9:00.0 +NVIDIA H100 80GB HBM3, 7, 00000000:CA:00.0 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/numa_topo.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/numa_topo.txt new file mode 100644 index 000000000..389792102 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/numa_topo.txt @@ -0,0 +1,4 @@ +/sys/devices/system/node/node0/cpulist:0-47,96-143 +/sys/devices/system/node/node1/cpulist:48-95,144-191 +/sys/devices/system/node/node0/distance:10 32 +/sys/devices/system/node/node1/distance:32 10 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/nvidia_persistence_status.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/nvidia_persistence_status.txt new file mode 100644 index 000000000..795c863e8 --- /dev/null +++ 
b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/nvidia_persistence_status.txt @@ -0,0 +1,9 @@ +name, pci.bus_id, persistence_mode +NVIDIA H100 80GB HBM3, 00000000:53:00.0, Enabled +NVIDIA H100 80GB HBM3, 00000000:64:00.0, Enabled +NVIDIA H100 80GB HBM3, 00000000:75:00.0, Enabled +NVIDIA H100 80GB HBM3, 00000000:86:00.0, Enabled +NVIDIA H100 80GB HBM3, 00000000:97:00.0, Enabled +NVIDIA H100 80GB HBM3, 00000000:A8:00.0, Enabled +NVIDIA H100 80GB HBM3, 00000000:B9:00.0, Enabled +NVIDIA H100 80GB HBM3, 00000000:CA:00.0, Enabled diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/nvidia_smi_topo.txt b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/nvidia_smi_topo.txt new file mode 100644 index 000000000..3df283ce9 --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/nvidia_smi_topo.txt @@ -0,0 +1,9 @@ + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity +GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 0-47,96-143 0 +GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 0-47,96-143 0 +GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 0-47,96-143 0 +GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 0-47,96-143 0 +GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 48-95,144-191 1 +GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 48-95,144-191 1 +GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 48-95,144-191 1 +GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X 48-95,144-191 1 diff --git a/e2e2/test/images/nvidia/gpu_unit_tests/unit_test b/e2e2/test/images/nvidia/gpu_unit_tests/unit_test new file mode 100644 index 000000000..06ec9463e --- /dev/null +++ b/e2e2/test/images/nvidia/gpu_unit_tests/unit_test @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +TRACE_LOG=trace.log +TEST_TIMEOUT=1800 +BASH="/usr/bin/bash" +CURRENT_DIR=$(pwd) + +timeout -k 10 ${TEST_TIMEOUT} ${BASH} gpu_unit_tests/bash_unit -f tap -t gpu_unit_tests/${TRACE_LOG} gpu_unit_tests/tests/*test*.sh \ No newline at end of 
file