Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include GPU check validation #20840

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 43 additions & 17 deletions tests/publiccloud/create_aistack_env.pm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SUSE's openQA tests
)# SUSE's openQA tests
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Running tidy before pushing is always useful: remove this leftover before the next commit and VR.

#
# Copyright 2024 SUSE LLC
# SPDX-License-Identifier: FSFAP
Expand Down Expand Up @@ -26,6 +26,7 @@ use publiccloud::utils;
use transactional qw(process_reboot trup_install trup_shell);
use File::Basename;
use version_utils;
use utils 'script_retry';
use Data::Dumper;

sub test_flags {
Expand Down Expand Up @@ -159,10 +160,16 @@ sub install_aistack_chart {
# Record the details of any pod that is still not running after max_retries attempts
my $max_retries = 15;
my @failed_pods;
my @issue_logs_pod;
my $sleep_interval = 20;
my $ollama_pod;
my @out = split(' ', script_output("kubectl get pods --namespace $namespace -o custom-columns=':metadata.name'"));
record_info("Pod names", join(" ", @out));
POD_LOOP: foreach my $pod (@out) {

if ($pod =~ /^ollama/) {
$ollama_pod = $pod;
yarunachalam marked this conversation as resolved.
Show resolved Hide resolved
}
my $counter = 0;
my $start_time = time();
while ($counter++ < $max_retries) {
Expand All @@ -177,8 +184,8 @@ sub install_aistack_chart {
next POD_LOOP;
} else {
if ($logs =~ /ERROR|FAILURE|Exception|Failed/) {
record_info("$pod failed due to error in log: $logs \n ");
push @failed_pods, {name => $pod, status => $status};
record_info("$pod has error in log: $logs \n ");
push @issue_logs_pod, {name => $pod, status => $status};
next POD_LOOP;
} # if log
sleep $sleep_interval;
Expand All @@ -188,6 +195,25 @@ sub install_aistack_chart {
} # pod loop
yarunachalam marked this conversation as resolved.
Show resolved Hide resolved

assert_script_run("kubectl get all --namespace $namespace");

# GPU check for NVIDIA_GPU_AISTACK test
if (check_var('PUBLIC_CLOUD_NVIDIA_GPU_AISTACK', 1)) {
my $ollama_log = script_output("kubectl logs $ollama_pod -n $namespace", proceed_on_failure => 1);
if ($ollama_log =~ /looking for compatible GPUs/) {
record_info("GPU compatible check in pod log $ollama_pod.");
}
if ($ollama_log =~ /no gpus found/) {
die "No GPU found for $ollama_pod\n";
}
}

# If any pod logs contain ERROR, FAILURE, Exception or Failed, record a message indicating
# that the logs have failure details and further inspection is needed
if (@issue_logs_pod) {
record_info("@issue_logs_pod log has ERROR|FAILURE|Exception check log for more details ");
}

# Exit if there are failed pods
if (@failed_pods) {
die "Failed pods:\n" . join("\n", map { "$_->{name}: $_->{status}" } @failed_pods) . "\n";
}
Expand Down Expand Up @@ -233,23 +259,20 @@ sub test_openwebui_service {
record_info("Added $ipaddr to /etc/hosts with hostname $host_name");

# get endpoints
assert_script_run("kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'");
my $endpoint_cmd = "kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'";
my $endpoint_result = script_output($endpoint_cmd);
my $endpoint_result = script_output("kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'");
record_info("Endpoint code: $endpoint_result \n");
if (!$endpoint_result) {
die "No healthy endpoints found for the open-webui service in $namespace\n";
die "No healthy endpoints found for the open-webui service in $namespace\n";
} else {
# connect open-webui service
assert_script_run("curl --output /dev/null --silent --head --write-out \"%{http_code}\n\" -k -L https://$host_name");
my $curl_cmd = "curl --output /dev/null --silent --head --write-out \"%{http_code}\n\" -k -L https://$host_name";
my $curl_result = script_output($curl_cmd);
record_info("http code: $curl_result \n");
if ($curl_result == 200) {
record_info("Successfully connected to the open-webui service at $curl_cmd \n");
} else {
die "Received unexpected HTTP error code $curl_result for $curl_cmd\n";
}
# connect open-webui service
my $curl_cmd = 'curl -v --trace --output /dev/null --silent --head --write-out "%{http_code}\n" -k -L https://' . $host_name;
my $curl_code = script_retry($curl_cmd, retry => 5, delay => 60);
record_info("http code: $curl_code \n");
if ($curl_code == 200) {
record_info("Successfully connected to the open-webui service at $curl_cmd \n");
} else {
die "Received unexpected HTTP error code $curl_result for $curl_cmd\n";
}
}

# create Admin user
Expand Down Expand Up @@ -304,6 +327,8 @@ sub run {

my $instance = $self->{my_instance} = $args->{my_instance};
my $provider = $self->{provider} = $args->{my_provider};
my $webip = $instance->public_ip;
record_info 'Instance', join(' ', 'IP: ', $instance->public_ip);

# Install dependency packages, configure kubectl and dependency components
install_dependency_package($instance);
Expand All @@ -317,6 +342,7 @@ sub run {

# Install private_ai_stack chart
install_aistack_chart($instance, $ai_ns);


# OpenWebUI service test
test_openwebui_service($instance, $ai_ns);
Expand Down
Loading