Skip to content

Commit

Permalink
Include GPU check validation
Browse files Browse the repository at this point in the history
Do not exit for log issues, only for pod failure.
  • Loading branch information
yarunachalam committed Jan 14, 2025
1 parent 19505d3 commit d924fcf
Showing 1 changed file with 44 additions and 16 deletions.
60 changes: 44 additions & 16 deletions tests/publiccloud/create_aistack_env.pm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SUSE's openQA tests
# SUSE's openQA tests
#
# Copyright 2024 SUSE LLC
# SPDX-License-Identifier: FSFAP
Expand Down Expand Up @@ -159,10 +159,16 @@ sub install_aistack_chart {
# Once max_retries is reached, record the details of any pod that is still not running
my $max_retries = 15;
my @failed_pods;
my @issue_logs_pod;
my $sleep_interval = 20;
my $ollama_pod;
my @out = split(' ', script_output("kubectl get pods --namespace $namespace -o custom-columns=':metadata.name'"));
record_info("Pod names", join(" ", @out));
POD_LOOP: foreach my $pod (@out) {

if ($pod =~ /^ollama/) {
$ollama_pod = $pod;
}
my $counter = 0;
my $start_time = time();
while ($counter++ < $max_retries) {
Expand All @@ -177,8 +183,8 @@ sub install_aistack_chart {
next POD_LOOP;
} else {
if ($logs =~ /ERROR|FAILURE|Exception|Failed/) {
record_info("$pod failed due to error in log: $logs \n ");
push @failed_pods, {name => $pod, status => $status};
record_info("$pod has error in log: $logs \n ");
push @issue_logs_pod, {name => $pod, status => $status};
next POD_LOOP;
} # if log
sleep $sleep_interval;
Expand All @@ -188,6 +194,25 @@ sub install_aistack_chart {
} # pod loop

assert_script_run("kubectl get all --namespace $namespace");

# GPU check for NVIDIA_GPU_AISTACK test
if (check_var('PUBLIC_CLOUD_NVIDIA_GPU_AISTACK', 1)) {
my $ollama_log = script_output("kubectl logs $ollama_pod -n $namespace", proceed_on_failure => 1);
if ($ollama_log =~ /looking for compatible GPUs/) {
record_info("GPU compatible check in pod log $ollama_pod.");
}
if ($ollama_log =~ /no gpus found/) {
die "No GPU found for $ollama_pod\n";
}
}

# If any pod logs contain ERROR, FAILURE, or Exception, record a message indicating
# that the logs have failure details and that further inspection is needed
if (@issue_logs_pod) {
record_info("@issue_logs_pod log has ERROR|FAILURE|Exception check log for more details ");
}

# Exit if there are failed pods
if (@failed_pods) {
die "Failed pods:\n" . join("\n", map { "$_->{name}: $_->{status}" } @failed_pods) . "\n";
}
Expand Down Expand Up @@ -234,22 +259,22 @@ sub test_openwebui_service {

# get endpoints
assert_script_run("kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'");
my $endpoint_cmd = "kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'";
my $endpoint_result = script_output($endpoint_cmd);
my $endpoint_result = script_output("kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'");
record_info("Endpoint code: $endpoint_result \n");
if (!$endpoint_result) {
die "No healthy endpoints found for the open-webui service in $namespace\n";
die "No healthy endpoints found for the open-webui service in $namespace\n";
} else {
# connect open-webui service
assert_script_run("curl --output /dev/null --silent --head --write-out \"%{http_code}\n\" -k -L https://$host_name");
my $curl_cmd = "curl --output /dev/null --silent --head --write-out \"%{http_code}\n\" -k -L https://$host_name";
my $curl_result = script_output($curl_cmd);
record_info("http code: $curl_result \n");
if ($curl_result == 200) {
record_info("Successfully connected to the open-webui service at $curl_cmd \n");
} else {
die "Received unexpected HTTP error code $curl_result for $curl_cmd\n";
}
# connect open-webui service
my $curl_code = script_output('curl -v --trace --output /dev/null --silent --head --write-out "%{http_code}\n" -k -L https://' . $host_name);
my $curl_cmd_test = 'test $(' .
'curl -v --trace --output /dev/null --silent --head --write-out "%{http_code}\n" -k -L https://' . $host_name') -eq 200';
record_info("http code: $curl_result \n");
$curl_code = $script_retry($curl_cmd_test, retry => 5, delay => 60);
if ($curl_code == 200) {
record_info("Successfully connected to the open-webui service at $curl_cmd \n");
} else {
die "Received unexpected HTTP error code $curl_result for $curl_cmd\n";
}
}

# create Admin user
Expand Down Expand Up @@ -304,6 +329,8 @@ sub run {

my $instance = $self->{my_instance} = $args->{my_instance};
my $provider = $self->{provider} = $args->{my_provider};
my $webip = $instance->public_ip;
record_info 'Instance', join(' ', 'IP: ', $instance->public_ip);

# Install dependency package, configure kubectl and dependency components
install_dependency_package($instance);
Expand All @@ -317,6 +344,7 @@ sub run {

# Install private_ai_stack chart
install_aistack_chart($instance, $ai_ns);


# OpenWebUI service test
test_openwebui_service($instance, $ai_ns);
Expand Down

0 comments on commit d924fcf

Please sign in to comment.