Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor DSS provider tests (New) #1601

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
4 changes: 2 additions & 2 deletions .github/workflows/testflinger-contrib-dss-regression.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
-e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \
-e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \
-e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \
${GITHUB_WORKSPACE}/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/job.yaml
- name: Build job file from template with oemscript provisioning
if: ${{ matrix.queue == 'dell-precision-5680-c31665' }}
Expand All @@ -51,7 +51,7 @@ jobs:
-e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \
-e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \
-e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \
${GITHUB_WORKSPACE}/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/job.yaml
- name: Submit testflinger job
uses: canonical/testflinger/.github/actions/submit@main
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env bash

# -e: exit on first unhandled error; -u: unset variables are errors;
# -x: trace each command (useful in CI logs); -o pipefail: a pipeline
# fails if any stage fails.
set -euxo pipefail

# IMPORTANT: for any test using the dss command:
#
# - Clear PYTHON shell vars to prevent conflicts between dss
# and checkbox python environments
# - Run from ${HOME} as dss writes logs to its working directory,
# and as a snap does not have permissions to write to the default
# working directory for checkbox tests
export -n PYTHONHOME PYTHONPATH PYTHONUSERBASE

# Initialize dss against the local microk8s cluster.
# Outputs: success message on stdout; dss writes logs to $HOME.
# Note: if "sudo microk8s config" fails, the substitution yields an empty
# kubeconfig and it is dss's exit status that set -e acts on.
check_dss_can_be_initialized() {
# TODO: we actually seem to initialize dss here; maybe split it out
cd "${HOME}"
dss initialize --kubeconfig="$(sudo microk8s config)"
echo "Test success: dss initialized."
}

# Verify that a namespace named exactly "dss" exists in the cluster.
# Returns 0 on success; exits 1 (with a message on stderr) otherwise.
check_dss_namespace_is_deployed() {
    # Anchor the match to the start of the line so namespaces that merely
    # contain "dss" as a substring (e.g. "dss-extra", "notdss") cannot
    # produce a false positive; the namespace name is the first column.
    if microk8s.kubectl get ns | grep -qE '^dss[[:space:]]'; then
        echo "Test success: 'dss' namespace is deployed!"
    else
        >&2 echo "Test failure: no namespace named 'dss' deployed."
        exit 1
    fi
}

# Check that 'dss status' reports the MLflow deployment as Ready.
# Exits 1 with a message on stderr when MLflow is not ready.
check_mlflow_status_is_ready() {
    # dss must run from $HOME (the snap writes logs to its working dir).
    cd "${HOME}"
    local status_output
    # Capture the full output first to avoid a broken-pipe error.
    status_output=$(dss status)
    if ! grep -q "MLflow deployment: Ready" <<< "${status_output}"; then
        >&2 echo "Test failure: 'dss status' does not show ready status for mlflow."
        exit 1
    fi
    echo "Test success: 'dss status' shows ready status for mlflow."
}

# Check that the first service in the 'dss' namespace is 'mlflow'.
# Exits 1 with a message on stderr otherwise.
check_mlflow_is_deployed_as_first_service() {
    # TODO: enable mlflow to be a service in any position
    local first_service
    first_service=$(microk8s.kubectl get service -n dss -o jsonpath='{.items[0].metadata.name}')
    if [[ "${first_service}" != "mlflow" ]]; then
        >&2 echo "Test failure: expected service name 'mlflow' but got ${first_service}"
        exit 1
    fi
    echo "Test success: 'mlflow' service is deployed!"
}

# Check that 'dss status' reports Intel GPU acceleration as Enabled.
# Exits 1 with a message on stderr otherwise.
check_dss_has_intel_gpu_acceleration_enabled() {
    # dss must run from $HOME (the snap writes logs to its working dir).
    cd "${HOME}"
    local status_output
    # Capture the full output first to avoid a broken-pipe error.
    status_output=$(dss status)
    if ! grep -q "Intel GPU acceleration: Enabled" <<< "${status_output}"; then
        >&2 echo "Test failure: 'dss status' does not report that Intel GPU acceleration is enabled."
        exit 1
    fi
    echo "Test success: 'dss status' correctly reports Intel GPU status."
}

# Create an ITEX 2.15 notebook via dss.
# Exits 1 with a message on stderr when creation fails.
check_dss_can_create_itex_215_notebook() {
    # dss must run from $HOME (the snap writes logs to its working dir).
    cd "${HOME}"
    local image="intel/intel-extension-for-tensorflow:2.15.0-xpu-idp-jupyter"
    if ! dss create itex-215-notebook --image="${image}"; then
        >&2 echo "Test failure: failed to create an ITEX 2.15 notebook."
        exit 1
    fi
    echo "Test success: successfully created an ITEX 2.15 notebook."
}

# Create an IPEX 2.1.20 notebook via dss.
# Exits 1 with a message on stderr when creation fails.
check_dss_can_create_ipex_2120_notebook() {
    # dss must run from $HOME (the snap writes logs to its working dir).
    cd "${HOME}"
    local image="intel/intel-extension-for-pytorch:2.1.20-xpu-idp-jupyter"
    if ! dss create ipex-2120-notebook --image="${image}"; then
        >&2 echo "Test failure: failed to create an IPEX 2.1.20 notebook."
        exit 1
    fi
    echo "Test success: successfully created an IPEX 2.1.20 notebook."
}

# Print usage and the list of implemented test cases to stdout.
help_function() {
    printf '%s\n' "This script is used for generic tests related to DSS"
    printf '%s\n' "Usage: check_dss.sh <test_case>"
    printf '\n'
    printf '%s\n' "Test cases currently implemented:"
    # printf repeats the format for each argument: one tab-indented
    # "<case>: handler" line per entry.
    printf '\t%s\n' \
        "<dss_can_be_initialized>: check_dss_can_be_initialized" \
        "<dss_namespace_is_deployed>: check_dss_namespace_is_deployed" \
        "<mlflow_status_is_ready>: check_mlflow_status_is_ready" \
        "<mlflow_is_deployed_as_first_service>: check_mlflow_is_deployed_as_first_service" \
        "<intel_gpu_acceleration_is_enabled>: check_dss_has_intel_gpu_acceleration_enabled" \
        "<can_create_itex_215_notebook>: check_dss_can_create_itex_215_notebook" \
        "<can_create_ipex_2120_notebook>: check_dss_can_create_ipex_2120_notebook"
}

# Dispatch the requested test case; unknown or missing args print help.
main() {
    # ${1:-} (instead of ${1}) keeps 'set -u' from aborting with an
    # unbound-variable error when the script is run with no arguments;
    # an empty selector falls through to help_function.
    case "${1:-}" in
        dss_can_be_initialized) check_dss_can_be_initialized ;;
        dss_namespace_is_deployed) check_dss_namespace_is_deployed ;;
        mlflow_status_is_ready) check_mlflow_status_is_ready ;;
        mlflow_is_deployed_as_first_service) check_mlflow_is_deployed_as_first_service ;;
        intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;;
        can_create_itex_215_notebook) check_dss_can_create_itex_215_notebook ;;
        can_create_ipex_2120_notebook) check_dss_can_create_ipex_2120_notebook ;;
        *) help_function ;;
    esac
}

# Entry point: forward all script arguments to main.
main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/usr/bin/env bash

# -e: exit on error; -u: unset vars are errors; -x: trace commands;
# -o pipefail: a pipeline fails if any stage fails.
set -euxo pipefail

# Verify that at least one Intel GPU is visible on the host.
# Uses 'intel_gpu_top -L' and looks for Intel's PCI vendor ID (8086).
# Exits 1 with a message on stderr when none is found.
check_host_has_intel_gpus() {
    local result
    result=$(intel_gpu_top -L)
    if [[ ${result} == *"pci:vendor=8086"* ]]; then
        echo "Test success: Intel GPU available on host: ${result}"
    else
        # The inner double quotes around the command name were previously
        # unescaped, which silently dropped them from the message; use
        # single quotes so the command reads as intended.
        >&2 echo "Test failure: 'intel_gpu_top -L' reports no Intel GPUs: ${result}"
        exit 1
    fi
}

# Install the Intel K8s GPU device plugin (NFD, feature rules, plugin),
# then wait for both daemonsets to roll out.
check_intel_gpu_plugin_can_be_installed() {
    # Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453

    # TODO: make version a param
    local version=v0.30.0
    local base=https://github.com/intel/intel-device-plugins-for-kubernetes/deployments
    # hack as redirecting stdout anywhere but /dev/null throws a permission denied error
    # see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4
    kubectl kustomize "${base}/nfd?ref=${version}" | tee /tmp/node_feature_discovery.yaml >/dev/null
    kubectl kustomize "${base}/nfd/overlays/node-feature-rules?ref=${version}" | tee /tmp/node_feature_rules.yaml >/dev/null
    kubectl kustomize "${base}/gpu_plugin/overlays/nfd_labeled_nodes?ref=${version}" | tee /tmp/gpu_plugin.yaml >/dev/null
    # Inject the shared-dev-num argument so several pods can share a GPU.
    sed -i 's/enable-monitoring/enable-monitoring\n - -shared-dev-num=10/' /tmp/gpu_plugin.yaml
    local manifest
    for manifest in node_feature_discovery node_feature_rules gpu_plugin; do
        kubectl apply -f "/tmp/${manifest}.yaml"
    done
    local sleep_secs=15
    echo "[INFO]: sleeping for ${sleep_secs} seconds before checking rollout status."
    sleep "${sleep_secs}"
    kubectl -n node-feature-discovery rollout status ds/nfd-worker
    kubectl -n default rollout status ds/intel-gpu-plugin
    echo "[INFO]: sleeping for ${sleep_secs} seconds to allow pod status to update for subsequent tests."
    sleep "${sleep_secs}"
    echo "Test success: Intel K8s GPU Device Plugin deployed."
}

# Check that the first daemonset in the default namespace is the Intel
# GPU plugin. Exits 1 with a message on stderr otherwise.
check_intel_gpu_plugin_daemonset_is_deployed() {
    local ds_name
    ds_name=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].metadata.name}')
    if [[ "${ds_name}" != "intel-gpu-plugin" ]]; then
        >&2 echo "Test failure: expected daemonset name 'intel-gpu-plugin' but got ${ds_name}"
        exit 1
    fi
    echo "Test success: 'intel-gpu-plugin' daemonset is deployed!"
}

# Check that exactly one GPU-plugin daemonset reports numberAvailable=1.
# Exits 1 with a message on stderr otherwise.
check_one_intel_gpu_plugin_daemonset_is_available() {
    local num_available
    num_available=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].status.numberAvailable}')
    if [[ "${num_available}" != "1" ]]; then
        >&2 echo "Test failure: expected numberAvailable to be 1 but got ${num_available}"
        exit 1
    fi
    echo "Test success: 1 daemonset in numberAvailable status."
}

# Check that exactly one GPU-plugin daemonset reports numberReady=1.
# Exits 1 with a message on stderr otherwise.
check_one_intel_gpu_plugin_daemonset_is_ready() {
    local num_ready
    num_ready=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].status.numberReady}')
    if [[ "${num_ready}" != "1" ]]; then
        >&2 echo "Test failure: expected numberReady to be 1 but got ${num_ready}"
        exit 1
    fi
    echo "Test success: 1 daemonset in numberReady status."
}

# Check that NFD attached the intel.feature.node.kubernetes.io/gpu=true
# label to the (first) node. Exits 1 with a message on stderr otherwise.
check_intel_gpu_node_label_is_attached() {
    local label_value
    label_value=$(microk8s.kubectl get node -o jsonpath='{.items[0].metadata.labels.intel\.feature\.node\.kubernetes\.io/gpu}')
    if [[ "${label_value}" != "true" ]]; then
        >&2 echo "Test failure: expected 'true' but got ${label_value}"
        exit 1
    fi
    echo "Test success: found expected label: 'intel.feature.node.kubernetes.io/gpu': 'true'"
}

# Count Intel GPUs by summing the gpu.intel.com/device-id.*.count node
# labels; fail unless at least one GPU is present.
check_at_least_one_intel_gpu_is_available() {
    local num_gpus
    num_gpus=$(microk8s.kubectl get node -o json \
        | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
        | awk '{cnt+=$1} END{print cnt}')
    if [ "${num_gpus}" -ge 1 ]; then
        echo "Test success: Found ${num_gpus} GPUs on system."
    else
        >&2 echo "Test failure: expected at least 1 GPU but got ${num_gpus}"
        exit 1
    fi
}

# Verify the node's gpu.intel.com/i915 *capacity* equals
# (number of GPUs) x (shared slots per GPU).
check_capacity_slots_for_intel_gpus_match() {
    local num_gpus capacity_slots expected_slots
    num_gpus=$(microk8s.kubectl get node -o json \
        | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
        | awk '{cnt+=$1} END{print cnt}')
    capacity_slots=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.capacity.gpu\.intel\.com/i915}')
    # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
    local -r slots_per_gpu=10
    expected_slots=$((num_gpus * slots_per_gpu))
    if [ "${expected_slots}" -ne "${capacity_slots}" ]; then
        >&2 echo "Test failure: expected ${expected_slots} GPU capacity slots but got ${capacity_slots}"
        exit 1
    fi
    echo "Test success: Found ${capacity_slots} GPU capacity slots on k8s node."
}

# Verify the node's gpu.intel.com/i915 *allocatable* count equals
# (number of GPUs) x (shared slots per GPU).
check_allocatable_slots_for_intel_gpus_match() {
    local num_gpus allocatable_slots expected_slots
    num_gpus=$(microk8s.kubectl get node -o json \
        | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' \
        | awk '{cnt+=$1} END{print cnt}')
    allocatable_slots=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.allocatable.gpu\.intel\.com/i915}')
    # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
    local -r slots_per_gpu=10
    expected_slots=$((num_gpus * slots_per_gpu))
    if [ "${expected_slots}" -ne "${allocatable_slots}" ]; then
        >&2 echo "Test failure: expected ${expected_slots} GPU allocatable slots but got ${allocatable_slots}"
        exit 1
    fi
    echo "Test success: Found ${allocatable_slots} GPU allocatable slots on k8s node."
}

# Print usage and the list of implemented test cases to stdout.
help_function() {
    printf '%s\n' "This script is used for tests related to Intel GPUs"
    printf '%s\n' "Usage: check.sh <test_case>"
    printf '\n'
    printf '%s\n' "Test cases currently implemented:"
    # printf repeats the format for each argument: one tab-indented
    # "<case>: handler" line per entry.
    printf '\t%s\n' \
        "<host_has_intel_gpus>: check_host_has_intel_gpus" \
        "<gpu_plugin_can_be_installed>: check_intel_gpu_plugin_can_be_installed" \
        "<gpu_plugin_daemonset_is_deployed>: check_intel_gpu_plugin_daemonset_is_deployed" \
        "<one_daemonset_is_available>: check_one_intel_gpu_plugin_daemonset_is_available" \
        "<one_daemonset_is_ready>: check_one_intel_gpu_plugin_daemonset_is_ready" \
        "<gpu_node_label_is_attached>: check_intel_gpu_node_label_is_attached" \
        "<at_least_one_gpu_is_available>: check_at_least_one_intel_gpu_is_available" \
        "<capacity_slots_for_gpus_match>: check_capacity_slots_for_intel_gpus_match" \
        "<allocatable_slots_for_gpus_match>: check_allocatable_slots_for_intel_gpus_match"
}

# Dispatch the requested test case; unknown or missing args print help.
main() {
    # ${1:-} (instead of ${1}) keeps 'set -u' from aborting with an
    # unbound-variable error when the script is run with no arguments;
    # an empty selector falls through to help_function.
    case "${1:-}" in
        host_has_intel_gpus) check_host_has_intel_gpus ;;
        gpu_plugin_can_be_installed) check_intel_gpu_plugin_can_be_installed ;;
        gpu_plugin_daemonset_is_deployed) check_intel_gpu_plugin_daemonset_is_deployed ;;
        one_daemonset_is_available) check_one_intel_gpu_plugin_daemonset_is_available ;;
        one_daemonset_is_ready) check_one_intel_gpu_plugin_daemonset_is_ready ;;
        gpu_node_label_is_attached) check_intel_gpu_node_label_is_attached ;;
        at_least_one_gpu_is_available) check_at_least_one_intel_gpu_is_available ;;
        capacity_slots_for_gpus_match) check_capacity_slots_for_intel_gpus_match ;;
        allocatable_slots_for_gpus_match) check_allocatable_slots_for_intel_gpus_match ;;
        *) help_function ;;
    esac
}

# Entry point: forward all script arguments to main.
main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env bash

# -e: exit on error; -u: unset vars are errors; -x: trace commands;
# -o pipefail: a pipeline fails if any stage fails.
set -euxo pipefail

# Absolute path of this script's directory; used to locate the helper
# python snippets shipped alongside it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Check that the IPEX stack (intel_extension_for_pytorch, torch, jupyter)
# imports cleanly inside the notebook pod.
# Arguments: $1 - name of the running notebook pod.
check_ipex_can_be_imported() {
    echo "Starting ipex import test"
    local probe="import intel_extension_for_pytorch as ipex; import torch; import jupyter"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "${probe}"; then
        >&2 echo "FAIL: Did not find IPEX python module"
        exit 1
    fi
    echo "PASS: Found module"
    exit 0
}

# Run pytorch_can_use_xpu.py inside the notebook pod and check that a
# GPU device (dev_type=gpu) appears in its output.
# Arguments: $1 - name of the running notebook pod.
check_pytorch_can_use_xpu() {
    echo "Starting ipex GPU check test"
    script="$(cat "$SCRIPT_DIR/pytorch_can_use_xpu.py")"
    # Apply 2>&1 to the python invocation (not to grep, as before) so
    # device listings printed on stderr are also searched. Guard with
    # '|| true': under 'set -e -o pipefail' a non-matching grep would
    # otherwise abort the whole script before the FAIL branch below
    # could report anything.
    gpu_grep_out=$(microk8s.kubectl -n dss exec "$1" -- python3 -c "$script" 2>&1 | grep "dev_type=.gpu" || true)
    if [[ -z ${gpu_grep_out} ]]; then
        >&2 echo "FAIL: No GPU found"
        exit 1
    else
        echo "PASS: GPU found"
        exit 0
    fi
}

# Print usage and the list of implemented test cases to stdout.
help_function() {
    echo "This script is used for tests related to IPEX"
    # Use the real script name; it was previously hard-coded to
    # "check_dss.sh", copied from another script.
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    # Fixed: this entry previously named check_itex_can_be_imported (a
    # copy-paste from the ITEX script); the function dispatched here is
    # check_ipex_can_be_imported.
    echo -e "\t<can_be_imported>: check_ipex_can_be_imported"
    echo -e "\t<pytorch_can_use_xpu>: check_pytorch_can_use_xpu"
}

# Locate the running IPEX notebook pod, then dispatch the requested test.
main() {
    local pod
    # --field-selector limits the listing to Running pods; grep extracts
    # the ipex notebook pod's full name.
    pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'ipex-2120-notebook\S*')
    echo "Found PyTorch pod: ${pod}"
    # ${1:-} (instead of ${1}) keeps 'set -u' from aborting with an
    # unbound-variable error when run with no arguments; an empty
    # selector falls through to help_function.
    case "${1:-}" in
        can_be_imported) check_ipex_can_be_imported "$pod" ;;
        pytorch_can_use_xpu) check_pytorch_can_use_xpu "$pod" ;;
        *) help_function ;;
    esac
}

# Entry point: forward all script arguments to main.
main "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash

# -e: exit on error; -u: unset vars are errors; -x: trace commands;
# -o pipefail: a pipeline fails if any stage fails.
set -euxo pipefail

# Absolute path of this script's directory; used to locate the helper
# python snippets shipped alongside it.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Check that the ITEX stack (intel_extension_for_tensorflow, tensorflow,
# jupyter) imports cleanly inside the notebook pod.
# Arguments: $1 - name of the running notebook pod.
check_itex_can_be_imported() {
    echo "Starting itex import test"
    local probe="import intel_extension_for_tensorflow as itex; import tensorflow; import jupyter"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "${probe}"; then
        >&2 echo "FAIL: Did not find ITEX python module"
        exit 1
    fi
    echo "PASS: Found module"
    exit 0
}

# Run tensorflow_can_use_xpu.py inside the notebook pod; the helper
# script's own exit status decides pass/fail.
# Arguments: $1 - name of the running notebook pod.
check_tensorflow_can_use_xpu() {
    echo "Starting itex GPU check test"
    local probe
    probe="$(cat "$SCRIPT_DIR/tensorflow_can_use_xpu.py")"
    if ! microk8s.kubectl -n dss exec "$1" -- python3 -c "${probe}"; then
        >&2 echo "FAIL: No XPU found"
        exit 1
    fi
    echo "PASS: XPU found"
    exit 0
}

# Print usage and the list of implemented test cases to stdout.
help_function() {
    echo "This script is used for tests related to ITEX"
    # Use the real script name; it was previously hard-coded to
    # "check_dss.sh", copied from another script.
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    echo -e "\t<can_be_imported>: check_itex_can_be_imported"
    echo -e "\t<tensorflow_can_use_xpu>: check_tensorflow_can_use_xpu"
}

# Locate the running ITEX notebook pod, then dispatch the requested test.
main() {
    local pod
    # --field-selector limits the listing to Running pods; grep extracts
    # the itex notebook pod's full name.
    pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'itex-215-notebook\S*')
    echo "Found Tensorflow pod: ${pod}"
    # ${1:-} (instead of ${1}) keeps 'set -u' from aborting with an
    # unbound-variable error when run with no arguments; an empty
    # selector falls through to help_function.
    case "${1:-}" in
        can_be_imported) check_itex_can_be_imported "$pod" ;;
        tensorflow_can_use_xpu) check_tensorflow_can_use_xpu "$pod" ;;
        *) help_function ;;
    esac
}

# Entry point: forward all script arguments to main.
main "$@"
Loading
Loading