Skip to content

Commit

Permalink
Re-enabled all components on failure scenarios
Browse files Browse the repository at this point in the history
Signed-off-by: Shiva Krishna, Merla <[email protected]>
  • Loading branch information
shivamerla committed May 20, 2021
1 parent 90633ed commit 09d6ad7
Showing 1 changed file with 18 additions and 7 deletions.
25 changes: 18 additions & 7 deletions deployments/gpu-operator/reconfigure-mig.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,22 @@ function exit_success() {
__set_state_and_exit "success" 0
}

function exit_failed_no_restart_gpu_clients() {
__set_state_and_exit "failed" 1
}

function exit_failed() {
echo "Restarting all GPU clients previouly shutdown by reenabling their component-specific nodeSelector labels"
kubectl label --overwrite \
node ${NODE_NAME} \
nvidia.com/gpu.deploy.device-plugin=true \
nvidia.com/gpu.deploy.gpu-feature-discovery=true \
nvidia.com/gpu.deploy.dcgm-exporter=true
if [ "${?}" != "0" ]; then
echo "Unable to bring up GPU operator components by setting their daemonset labels"
fi
__set_state_and_exit "failed" 1
}
}

echo "Asserting that the requested configuration is present in the configuration file"
nvidia-mig-parted assert --valid-config -f ${MIG_CONFIG_FILE} -c ${SELECTED_MIG_CONFIG}
Expand Down Expand Up @@ -172,7 +185,9 @@ fi

echo "Applying the selected MIG config to the node"
nvidia-mig-parted -d apply -f ${MIG_CONFIG_FILE} -c ${SELECTED_MIG_CONFIG}
apply_exit_code="${?}"
if [ "${?}" != "0" ]; then
exit_failed
fi

echo "Restarting all GPU clients previouly shutdown by reenabling their component-specific nodeSelector labels"
kubectl label --overwrite \
Expand All @@ -182,11 +197,7 @@ kubectl label --overwrite \
nvidia.com/gpu.deploy.dcgm-exporter=true
if [ "${?}" != "0" ]; then
echo "Unable to bring up GPU operator components by setting their daemonset labels"
exit_failed
fi

if [ "${apply_exit_code}" != "0" ]; then
exit_failed
exit_failed_no_restart_gpu_clients
fi

echo "Restarting validator pod to re-run all validations"
Expand Down

0 comments on commit 09d6ad7

Please sign in to comment.