Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor DSS provider tests (New) #1601

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
4 changes: 2 additions & 2 deletions .github/workflows/testflinger-contrib-dss-regression.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
-e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \
-e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \
-e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \
${GITHUB_WORKSPACE}/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/job.yaml
- name: Build job file from template with oemscript provisioning
if: ${{ matrix.queue == 'dell-precision-5680-c31665' }}
Expand All @@ -51,7 +51,7 @@ jobs:
-e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \
-e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \
-e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \
${GITHUB_WORKSPACE}/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/job.yaml
- name: Submit testflinger job
uses: canonical/testflinger/.github/actions/submit@main
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env bash

# -e: exit on first unhandled error; -u: unset variables are errors;
# -x: trace each command (useful in CI logs); -o pipefail: a pipeline
# fails if any stage fails.
set -euxo pipefail

# IMPORTANT: for any test using the dss command:
#
# - Clear PYTHON shell vars to prevent conflicts between dss
# and checkbox python environments
# - Run from ${HOME} as dss writes logs to its working directory,
# and as a snap does not have permissions to write to the default
# working directory for checkbox tests
export -n PYTHONHOME PYTHONPATH PYTHONUSERBASE

# Initialize dss against the local microk8s cluster.
# Outputs: success message on stdout; dss writes logs to $HOME.
# Note: if "sudo microk8s config" fails, the substitution yields an empty
# kubeconfig and it is dss's exit status that set -e acts on.
check_dss_can_be_initialized() {
# TODO: we actually seem to initialize dss here; maybe split it out
cd "${HOME}"
dss initialize --kubeconfig="$(sudo microk8s config)"
echo "Test success: dss initialized."
}

# Verify that a namespace named exactly "dss" exists in the cluster.
# Returns 0 on success; exits 1 (with a message on stderr) otherwise.
check_dss_namespace_is_deployed() {
    # Anchor the match to the start of the line so namespaces that merely
    # contain "dss" as a substring (e.g. "dss-extra", "notdss") cannot
    # produce a false positive; the namespace name is the first column.
    if microk8s.kubectl get ns | grep -qE '^dss[[:space:]]'; then
        echo "Test success: 'dss' namespace is deployed!"
    else
        >&2 echo "Test failure: no namespace named 'dss' deployed."
        exit 1
    fi
}

# Check that 'dss status' reports the MLflow deployment as Ready.
# Exits 1 with a message on stderr when MLflow is not ready.
check_mlflow_status_is_ready() {
    # dss must run from $HOME (the snap writes logs to its working dir).
    cd "${HOME}"
    local status_output
    # Capture the full output first to avoid a broken-pipe error.
    status_output=$(dss status)
    if ! grep -q "MLflow deployment: Ready" <<< "${status_output}"; then
        >&2 echo "Test failure: 'dss status' does not show ready status for mlflow."
        exit 1
    fi
    echo "Test success: 'dss status' shows ready status for mlflow."
}

# Check that the first service in the 'dss' namespace is 'mlflow'.
# Exits 1 with a message on stderr otherwise.
check_mlflow_is_deployed_as_first_service() {
    # TODO: enable mlflow to be a service in any position
    local first_service
    first_service=$(microk8s.kubectl get service -n dss -o jsonpath='{.items[0].metadata.name}')
    if [[ "${first_service}" != "mlflow" ]]; then
        >&2 echo "Test failure: expected service name 'mlflow' but got ${first_service}"
        exit 1
    fi
    echo "Test success: 'mlflow' service is deployed!"
}

# Check that 'dss status' reports Intel GPU acceleration as Enabled.
# Exits 1 with a message on stderr otherwise.
check_dss_has_intel_gpu_acceleration_enabled() {
    # dss must run from $HOME (the snap writes logs to its working dir).
    cd "${HOME}"
    local status_output
    # Capture the full output first to avoid a broken-pipe error.
    status_output=$(dss status)
    if ! grep -q "Intel GPU acceleration: Enabled" <<< "${status_output}"; then
        >&2 echo "Test failure: 'dss status' does not report that Intel GPU acceleration is enabled."
        exit 1
    fi
    echo "Test success: 'dss status' correctly reports Intel GPU status."
}

# Create an ITEX 2.15 notebook via dss.
# Exits 1 with a message on stderr when creation fails.
check_dss_can_create_itex_215_notebook() {
    # dss must run from $HOME (the snap writes logs to its working dir).
    cd "${HOME}"
    local image="intel/intel-extension-for-tensorflow:2.15.0-xpu-idp-jupyter"
    if ! dss create itex-215-notebook --image="${image}"; then
        >&2 echo "Test failure: failed to create an ITEX 2.15 notebook."
        exit 1
    fi
    echo "Test success: successfully created an ITEX 2.15 notebook."
}

# Create an IPEX 2.1.20 notebook via dss.
# Exits 1 with a message on stderr when creation fails.
check_dss_can_create_ipex_2120_notebook() {
    # dss must run from $HOME (the snap writes logs to its working dir).
    cd "${HOME}"
    local image="intel/intel-extension-for-pytorch:2.1.20-xpu-idp-jupyter"
    if ! dss create ipex-2120-notebook --image="${image}"; then
        >&2 echo "Test failure: failed to create an IPEX 2.1.20 notebook."
        exit 1
    fi
    echo "Test success: successfully created an IPEX 2.1.20 notebook."
}

# Print usage and the list of implemented test cases to stdout.
help_function() {
    printf '%s\n' "This script is used for generic tests related to DSS"
    printf '%s\n' "Usage: check_dss.sh <test_case>"
    printf '\n'
    printf '%s\n' "Test cases currently implemented:"
    # printf repeats the format for each argument: one tab-indented
    # "<case>: handler" line per entry.
    printf '\t%s\n' \
        "<dss_can_be_initialized>: check_dss_can_be_initialized" \
        "<dss_namespace_is_deployed>: check_dss_namespace_is_deployed" \
        "<mlflow_status_is_ready>: check_mlflow_status_is_ready" \
        "<mlflow_is_deployed_as_first_service>: check_mlflow_is_deployed_as_first_service" \
        "<intel_gpu_acceleration_is_enabled>: check_dss_has_intel_gpu_acceleration_enabled" \
        "<can_create_itex_215_notebook>: check_dss_can_create_itex_215_notebook" \
        "<can_create_ipex_2120_notebook>: check_dss_can_create_ipex_2120_notebook"
}

# Dispatch the requested test case; unknown or missing args print help.
main() {
    # ${1:-} (instead of ${1}) keeps 'set -u' from aborting with an
    # unbound-variable error when the script is run with no arguments;
    # an empty selector falls through to help_function.
    case "${1:-}" in
        dss_can_be_initialized) check_dss_can_be_initialized ;;
        dss_namespace_is_deployed) check_dss_namespace_is_deployed ;;
        mlflow_status_is_ready) check_mlflow_status_is_ready ;;
        mlflow_is_deployed_as_first_service) check_mlflow_is_deployed_as_first_service ;;
        intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;;
        can_create_itex_215_notebook) check_dss_can_create_itex_215_notebook ;;
        can_create_ipex_2120_notebook) check_dss_can_create_ipex_2120_notebook ;;
        *) help_function ;;
    esac
}

# Entry point: forward all script arguments to main.
main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/usr/bin/env bash

# -e: exit on error; -u: unset vars are errors; -x: trace commands;
# -o pipefail: a pipeline fails if any stage fails.
set -euxo pipefail

# Verify that at least one Intel GPU is visible on the host.
# Uses 'intel_gpu_top -L' and looks for Intel's PCI vendor ID (8086).
# Exits 1 with a message on stderr when none is found.
check_host_has_intel_gpus() {
    local result
    result=$(intel_gpu_top -L)
    if [[ ${result} == *"pci:vendor=8086"* ]]; then
        echo "Test success: Intel GPU available on host: ${result}"
    else
        # The inner double quotes around the command name were previously
        # unescaped, which silently dropped them from the message; use
        # single quotes so the command reads as intended.
        >&2 echo "Test failure: 'intel_gpu_top -L' reports no Intel GPUs: ${result}"
        exit 1
    fi
}

# Install the Intel K8s GPU device plugin (NFD, feature rules, plugin),
# then wait for both daemonsets to roll out.
check_intel_gpu_plugin_can_be_installed() {
    # Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453

    # TODO: make version a param
    local version=v0.30.0
    local base=https://github.com/intel/intel-device-plugins-for-kubernetes/deployments
    # hack as redirecting stdout anywhere but /dev/null throws a permission denied error
    # see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4
    kubectl kustomize "${base}/nfd?ref=${version}" | tee /tmp/node_feature_discovery.yaml >/dev/null
    kubectl kustomize "${base}/nfd/overlays/node-feature-rules?ref=${version}" | tee /tmp/node_feature_rules.yaml >/dev/null
    kubectl kustomize "${base}/gpu_plugin/overlays/nfd_labeled_nodes?ref=${version}" | tee /tmp/gpu_plugin.yaml >/dev/null
    # Inject the shared-dev-num argument so several pods can share a GPU.
    sed -i 's/enable-monitoring/enable-monitoring\n - -shared-dev-num=10/' /tmp/gpu_plugin.yaml
    local manifest
    for manifest in node_feature_discovery node_feature_rules gpu_plugin; do
        kubectl apply -f "/tmp/${manifest}.yaml"
    done
    local sleep_secs=15
    echo "[INFO]: sleeping for ${sleep_secs} seconds before checking rollout status."
    sleep "${sleep_secs}"
    kubectl -n node-feature-discovery rollout status ds/nfd-worker
    kubectl -n default rollout status ds/intel-gpu-plugin
    echo "[INFO]: sleeping for ${sleep_secs} seconds to allow pod status to update for subsequent tests."
    sleep "${sleep_secs}"
    echo "Test success: Intel K8s GPU Device Plugin deployed."
}

# Check that the first daemonset in the default namespace is the Intel
# GPU plugin. Exits 1 with a message on stderr otherwise.
check_intel_gpu_plugin_daemonset_is_deployed() {
    local ds_name
    ds_name=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].metadata.name}')
    if [[ "${ds_name}" != "intel-gpu-plugin" ]]; then
        >&2 echo "Test failure: expected daemonset name 'intel-gpu-plugin' but got ${ds_name}"
        exit 1
    fi
    echo "Test success: 'intel-gpu-plugin' daemonset is deployed!"
}

# Check that exactly one GPU-plugin daemonset reports numberAvailable=1.
# Exits 1 with a message on stderr otherwise.
check_one_intel_gpu_plugin_daemonset_is_available() {
    local num_available
    num_available=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].status.numberAvailable}')
    if [[ "${num_available}" != "1" ]]; then
        >&2 echo "Test failure: expected numberAvailable to be 1 but got ${num_available}"
        exit 1
    fi
    echo "Test success: 1 daemonset in numberAvailable status."
}

# Check that exactly one GPU-plugin daemonset reports numberReady=1.
# Exits 1 with a message on stderr otherwise.
check_one_intel_gpu_plugin_daemonset_is_ready() {
    local num_ready
    num_ready=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].status.numberReady}')
    if [[ "${num_ready}" != "1" ]]; then
        >&2 echo "Test failure: expected numberReady to be 1 but got ${num_ready}"
        exit 1
    fi
    echo "Test success: 1 daemonset in numberReady status."
}

# Check that NFD attached the intel.feature.node.kubernetes.io/gpu=true
# label to the (first) node. Exits 1 with a message on stderr otherwise.
check_intel_gpu_node_label_is_attached() {
    local label_value
    label_value=$(microk8s.kubectl get node -o jsonpath='{.items[0].metadata.labels.intel\.feature\.node\.kubernetes\.io/gpu}')
    if [[ "${label_value}" != "true" ]]; then
        >&2 echo "Test failure: expected 'true' but got ${label_value}"
        exit 1
    fi
    echo "Test success: found expected label: 'intel.feature.node.kubernetes.io/gpu': 'true'"
}

# Count Intel GPUs by summing the gpu.intel.com/device-id.*.count node
# labels; fail unless at least one GPU is present.
check_at_least_one_intel_gpu_is_available() {
    local num_gpus
    num_gpus=$(microk8s.kubectl get node -o json \
        | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
        | awk '{cnt+=$1} END{print cnt}')
    if [ "${num_gpus}" -ge 1 ]; then
        echo "Test success: Found ${num_gpus} GPUs on system."
    else
        >&2 echo "Test failure: expected at least 1 GPU but got ${num_gpus}"
        exit 1
    fi
}

# Verify the node's gpu.intel.com/i915 *capacity* equals
# (number of GPUs) x (shared slots per GPU).
check_capacity_slots_for_intel_gpus_match() {
    local num_gpus capacity_slots expected_slots
    num_gpus=$(microk8s.kubectl get node -o json \
        | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
        | awk '{cnt+=$1} END{print cnt}')
    capacity_slots=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.capacity.gpu\.intel\.com/i915}')
    # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
    local -r slots_per_gpu=10
    expected_slots=$((num_gpus * slots_per_gpu))
    if [ "${expected_slots}" -ne "${capacity_slots}" ]; then
        >&2 echo "Test failure: expected ${expected_slots} GPU capacity slots but got ${capacity_slots}"
        exit 1
    fi
    echo "Test success: Found ${capacity_slots} GPU capacity slots on k8s node."
}

# Verify the node's gpu.intel.com/i915 *allocatable* count equals
# (number of GPUs) x (shared slots per GPU).
check_allocatable_slots_for_intel_gpus_match() {
    local num_gpus allocatable_slots expected_slots
    num_gpus=$(microk8s.kubectl get node -o json \
        | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
        | awk '{cnt+=$1} END{print cnt}')
    allocatable_slots=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.allocatable.gpu\.intel\.com/i915}')
    # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
    local -r slots_per_gpu=10
    expected_slots=$((num_gpus * slots_per_gpu))
    if [ "${expected_slots}" -ne "${allocatable_slots}" ]; then
        >&2 echo "Test failure: expected ${expected_slots} GPU allocatable slots but got ${allocatable_slots}"
        exit 1
    fi
    echo "Test success: Found ${allocatable_slots} GPU allocatable slots on k8s node."
}

# Print usage and the list of implemented test cases to stdout.
help_function() {
    printf '%s\n' "This script is used for tests related to Intel GPUs"
    printf '%s\n' "Usage: check.sh <test_case>"
    printf '\n'
    printf '%s\n' "Test cases currently implemented:"
    # printf repeats the format for each argument: one tab-indented
    # "<case>: handler" line per entry.
    printf '\t%s\n' \
        "<host_has_intel_gpus>: check_host_has_intel_gpus" \
        "<gpu_plugin_can_be_installed>: check_intel_gpu_plugin_can_be_installed" \
        "<gpu_plugin_daemonset_is_deployed>: check_intel_gpu_plugin_daemonset_is_deployed" \
        "<one_daemonset_is_available>: check_one_intel_gpu_plugin_daemonset_is_available" \
        "<one_daemonset_is_ready>: check_one_intel_gpu_plugin_daemonset_is_ready" \
        "<gpu_node_label_is_attached>: check_intel_gpu_node_label_is_attached" \
        "<at_least_one_gpu_is_available>: check_at_least_one_intel_gpu_is_available" \
        "<capacity_slots_for_gpus_match>: check_capacity_slots_for_intel_gpus_match" \
        "<allocatable_slots_for_gpus_match>: check_allocatable_slots_for_intel_gpus_match"
}

# Dispatch the requested test case; unknown or missing args print help.
main() {
    # ${1:-} (instead of ${1}) keeps 'set -u' from aborting with an
    # unbound-variable error when the script is run with no arguments;
    # an empty selector falls through to help_function.
    case "${1:-}" in
        host_has_intel_gpus) check_host_has_intel_gpus ;;
        gpu_plugin_can_be_installed) check_intel_gpu_plugin_can_be_installed ;;
        gpu_plugin_daemonset_is_deployed) check_intel_gpu_plugin_daemonset_is_deployed ;;
        one_daemonset_is_available) check_one_intel_gpu_plugin_daemonset_is_available ;;
        one_daemonset_is_ready) check_one_intel_gpu_plugin_daemonset_is_ready ;;
        gpu_node_label_is_attached) check_intel_gpu_node_label_is_attached ;;
        at_least_one_gpu_is_available) check_at_least_one_intel_gpu_is_available ;;
        capacity_slots_for_gpus_match) check_capacity_slots_for_intel_gpus_match ;;
        allocatable_slots_for_gpus_match) check_allocatable_slots_for_intel_gpus_match ;;
        *) help_function ;;
    esac
}

# Entry point: forward all script arguments to main.
main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env bash

# -e: exit on error; -u: unset vars are errors; -x: trace commands;
# -o pipefail: a pipeline fails if any stage fails.
set -euxo pipefail

# Absolute path of this script's directory; used to locate the helper
# python snippets shipped alongside it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Check that the IPEX stack (intel_extension_for_pytorch, torch, jupyter)
# imports cleanly inside the notebook pod.
# Arguments: $1 - name of the running notebook pod.
check_ipex_can_be_imported() {
    echo "Starting ipex import test"
    local probe="import intel_extension_for_pytorch as ipex; import torch; import jupyter"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "${probe}"; then
        >&2 echo "FAIL: Did not find IPEX python module"
        exit 1
    fi
    echo "PASS: Found module"
    exit 0
}

# Run pytorch_can_use_xpu.py inside the notebook pod and check that a
# GPU device (dev_type=gpu) appears in its output.
# Arguments: $1 - name of the running notebook pod.
check_pytorch_can_use_xpu() {
    echo "Starting ipex GPU check test"
    script="$(cat "$SCRIPT_DIR/pytorch_can_use_xpu.py")"
    # Apply 2>&1 to the python invocation (not to grep, as before) so
    # device listings printed on stderr are also searched. Guard with
    # '|| true': under 'set -e -o pipefail' a non-matching grep would
    # otherwise abort the whole script before the FAIL branch below
    # could report anything.
    gpu_grep_out=$(microk8s.kubectl -n dss exec "$1" -- python3 -c "$script" 2>&1 | grep "dev_type=.gpu" || true)
    if [[ -z ${gpu_grep_out} ]]; then
        >&2 echo "FAIL: No GPU found"
        exit 1
    else
        echo "PASS: GPU found"
        exit 0
    fi
}

# Print usage and the list of implemented test cases to stdout.
help_function() {
    echo "This script is used for tests related to IPEX"
    # Use the real script name; it was previously hard-coded to
    # "check_dss.sh", copied from another script.
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    # Fixed: this entry previously named check_itex_can_be_imported (a
    # copy-paste from the ITEX script); the function dispatched here is
    # check_ipex_can_be_imported.
    echo -e "\t<can_be_imported>: check_ipex_can_be_imported"
    echo -e "\t<pytorch_can_use_xpu>: check_pytorch_can_use_xpu"
}

# Locate the running IPEX notebook pod, then dispatch the requested test.
main() {
    local pod
    # --field-selector limits the listing to Running pods; grep extracts
    # the ipex notebook pod's full name.
    pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'ipex-2120-notebook\S*')
    echo "Found PyTorch pod: ${pod}"
    # ${1:-} (instead of ${1}) keeps 'set -u' from aborting with an
    # unbound-variable error when run with no arguments; an empty
    # selector falls through to help_function.
    case "${1:-}" in
        can_be_imported) check_ipex_can_be_imported "$pod" ;;
        pytorch_can_use_xpu) check_pytorch_can_use_xpu "$pod" ;;
        *) help_function ;;
    esac
}

# Entry point: forward all script arguments to main.
main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash

# -e: exit on error; -u: unset vars are errors; -x: trace commands;
# -o pipefail: a pipeline fails if any stage fails.
set -euxo pipefail

# Absolute path of this script's directory; used to locate the helper
# python snippets shipped alongside it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Check that the ITEX stack (intel_extension_for_tensorflow, tensorflow,
# jupyter) imports cleanly inside the notebook pod.
# Arguments: $1 - name of the running notebook pod.
check_itex_can_be_imported() {
    echo "Starting itex import test"
    local probe="import intel_extension_for_tensorflow as itex; import tensorflow; import jupyter"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "${probe}"; then
        >&2 echo "FAIL: Did not find ITEX python module"
        exit 1
    fi
    echo "PASS: Found module"
    exit 0
}

# Run tensorflow_can_use_xpu.py inside the notebook pod; the helper
# script's own exit status decides pass/fail.
# Arguments: $1 - name of the running notebook pod.
check_tensorflow_can_use_xpu() {
    echo "Starting itex GPU check test"
    local probe
    probe="$(cat "$SCRIPT_DIR/tensorflow_can_use_xpu.py")"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "${probe}"; then
        >&2 echo "FAIL: No XPU found"
        exit 1
    fi
    echo "PASS: XPU found"
    exit 0
}

# Print usage and the list of implemented test cases to stdout.
help_function() {
    echo "This script is used for tests related to ITEX"
    # Use the real script name; it was previously hard-coded to
    # "check_dss.sh", copied from another script.
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    echo -e "\t<can_be_imported>: check_itex_can_be_imported"
    echo -e "\t<tensorflow_can_use_xpu>: check_tensorflow_can_use_xpu"
}

# Locate the running ITEX notebook pod, then dispatch the requested test.
main() {
    local pod
    # --field-selector limits the listing to Running pods; grep extracts
    # the itex notebook pod's full name.
    pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'itex-215-notebook\S*')
    echo "Found Tensorflow pod: ${pod}"
    # ${1:-} (instead of ${1}) keeps 'set -u' from aborting with an
    # unbound-variable error when run with no arguments; an empty
    # selector falls through to help_function.
    case "${1:-}" in
        can_be_imported) check_itex_can_be_imported "$pod" ;;
        tensorflow_can_use_xpu) check_tensorflow_can_use_xpu "$pod" ;;
        *) help_function ;;
    esac
}

# Entry point: forward all script arguments to main.
main "$@"
Loading
Loading