From 227d35c39789c44ccb8a02554876e4d2864fe853 Mon Sep 17 00:00:00 2001 From: yarunachalam Date: Wed, 18 Dec 2024 10:20:02 -0800 Subject: [PATCH] Include GPU check validation Do not exit on log issues, only on pod failure. --- tests/publiccloud/create_aistack_env.pm | 80 ++++++++++++++++--------- 1 file changed, 53 insertions(+), 27 deletions(-) diff --git a/tests/publiccloud/create_aistack_env.pm b/tests/publiccloud/create_aistack_env.pm index af1656e6bf33..e41482c85f13 100644 --- a/tests/publiccloud/create_aistack_env.pm +++ b/tests/publiccloud/create_aistack_env.pm @@ -74,9 +74,9 @@ sub install_dependency_components { my $ing_ver = get_var('ING_VERSION'); # Add Ingress Controller to open-webui endpoint - assert_script_run("helm repo add $ingress_repo"); - assert_script_run("helm repo update"); - assert_script_run("helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx --namespace ingress-nginx --set controller.service.type=ClusterIP --version $ing_ver --create-namespace", timeout => 120); + #assert_script_run("helm repo add $ingress_repo"); + #assert_script_run("helm repo update"); +#assert_script_run("helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx --namespace ingress-nginx --set controller.service.type=ClusterIP --version $ing_ver --create-namespace", timeout => 120); # Add cert-manager repo,install assert_script_run("helm repo add $cert_repo"); @@ -143,10 +143,10 @@ sub install_aistack_chart { assert_script_run("helm registry login dp.apps.rancher.io/charts -u $docker_user_name -p $SECRET_application_collection"); if (check_var('PUBLIC_CLOUD_NVIDIA_GPU_AISTACK', 1)) { assert_script_run("curl " . data_url("aistack/$openwebui_gpu_values") . 
" -o $openwebui_gpu_values", 60); - assert_script_run("helm install open-webui -f $openwebui_gpu_values -n $namespace $openwebui_helm_repo --set open-webui.ingress.class=nginx", timeout => 100); + assert_script_run("helm install open-webui -f $openwebui_gpu_values -n $namespace $openwebui_helm_repo", timeout => 100); } else { assert_script_run("curl " . data_url("aistack/$openwebui_values") . " -o $openwebui_values", 60); - assert_script_run("helm install open-webui -f $openwebui_values -n $namespace $openwebui_helm_repo --set open-webui.ingress.class=nginx", timeout => 100); + assert_script_run("helm install open-webui -f $openwebui_values -n $namespace $openwebui_helm_repo", timeout => 100); } assert_script_run("kubectl get all --namespace $namespace"); @@ -159,10 +159,16 @@ sub install_aistack_chart { # After reaching max_retries , record the pod details which does not run after reaching max_retries my $max_retries = 15; my @failed_pods; + my @issue_logs_pod; my $sleep_interval = 20; + my $ollama_pod; my @out = split(' ', script_output("kubectl get pods --namespace $namespace -o custom-columns=':metadata.name'")); record_info("Pod names", join(" ", @out)); POD_LOOP: foreach my $pod (@out) { + + if ($pod =~ /^ollama/) { + $ollama_pod = $pod; + } my $counter = 0; my $start_time = time(); while ($counter++ < $max_retries) { @@ -177,8 +183,8 @@ sub install_aistack_chart { next POD_LOOP; } else { if ($logs =~ /ERROR|FAILURE|Exception|Failed/) { - record_info("$pod failed due to error in log: $logs \n "); - push @failed_pods, {name => $pod, status => $status}; + record_info("$pod has error in log: $logs \n "); + push @issue_logs_pod, {name => $pod, status => $status}; next POD_LOOP; } # if log sleep $sleep_interval; @@ -188,6 +194,25 @@ sub install_aistack_chart { } # pod loop assert_script_run("kubectl get all --namespace $namespace"); + + # GPU check for NVIDIA_GPU_AISTACK test + if (check_var('PUBLIC_CLOUD_NVIDIA_GPU_AISTACK', 1)) { + my $ollama_log = 
script_output("kubectl logs $ollama_pod -n $namespace", proceed_on_failure => 1); + if ($ollama_log =~ /looking for compatible GPUs/) { + record_info("GPU compatible check in pod log $ollama_pod."); + } + if ($ollama_log =~ /no gpus found/) { + die "No GPU found for $ollama_pod\n"; + } + } + + # pod logs containing ERROR, FAILURE, or Exception, log a message indicating + # that the log has failure details and further inspection is needed + if (@issue_logs_pod) { + record_info("@issue_logs_pod log has ERROR|FAILURE|Exception check log for more details "); + } + + # Exit if there is failed pods if (@failed_pods) { die "Failed pods:\n" . join("\n", map { "$_->{name}: $_->{status}" } @failed_pods) . "\n"; } @@ -220,36 +245,35 @@ sub totest_install { } sub test_openwebui_service { - my ($instance, $namespace) = @_; + my ($instance, $namespace, $ipaddr) = @_; my $sr_name = 'open-webui'; my $host_name = get_var('OPENWEBUI_HOSTNAME'); # After successfull installation, Get open-webUI ipaddress and add in /etc/host and verify connectivity record_info('OpenWebUI service'); - assert_script_run("kubectl get ingress --namespace $namespace -o json"); - my $ipaddr = script_output("kubectl get ingress -n $namespace -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}'"); + #assert_script_run("kubectl get ingress --namespace $namespace -o json"); + #my $ipaddr = script_output("kubectl get ingress -n $namespace -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}'"); assert_script_run("echo \"$ipaddr $host_name\" | sudo tee -a /etc/hosts > /dev/null"); set_var('OPENWEBUI_IP', "$ipaddr"); record_info("Added $ipaddr to /etc/hosts with hostname $host_name"); # get endpoints - assert_script_run("kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'"); - my $endpoint_cmd = "kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'"; - my $endpoint_result = script_output($endpoint_cmd); - record_info("Endpoint 
code: $endpoint_result \n"); - if (!$endpoint_result) { - die "No healthy endpoints found for the open-webui service in $namespace\n"; + # assert_script_run("kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'"); + #my $endpoint_cmd = "kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'"; + #my $endpoint_result = script_output($endpoint_cmd); + #record_info("Endpoint code: $endpoint_result \n"); + #if (!$endpoint_result) { + # die "No healthy endpoints found for the open-webui service in $namespace\n"; + #} else { + # connect open-webui service + assert_script_run("curl --output /dev/null --silent --head --write-out \"%{http_code}\n\" -k -L https://$host_name"); + my $curl_cmd = "curl --output /dev/null --silent --head --write-out \"%{http_code}\n\" -k -L https://$host_name"; + my $curl_result = script_output($curl_cmd); + record_info("http code: $curl_result \n"); + if ($curl_result == 200) { + record_info("Successfully connected to the open-webui service at $curl_cmd \n"); } else { - # connect open-webui service - assert_script_run("curl --output /dev/null --silent --head --write-out \"%{http_code}\n\" -k -L https://$host_name"); - my $curl_cmd = "curl --output /dev/null --silent --head --write-out \"%{http_code}\n\" -k -L https://$host_name"; - my $curl_result = script_output($curl_cmd); - record_info("http code: $curl_result \n"); - if ($curl_result == 200) { - record_info("Successfully connected to the open-webui service at $curl_cmd \n"); - } else { - die "Received unexpected HTTP error code $curl_result for $curl_cmd\n"; - } + die "Received unexpected HTTP error code $curl_result for $curl_cmd\n"; } # create Admin user @@ -304,6 +328,8 @@ sub run { my $instance = $self->{my_instance} = $args->{my_instance}; my $provider = $self->{provider} = $args->{my_provider}; + my $webip = $instance->public_ip; + record_info 'Instance', join(' ', 'IP: ', $instance->public_ip); # Install dependency 
package, config kubectl and depnedency components install_dependency_package($instance); @@ -319,7 +345,7 @@ sub run { install_aistack_chart($instance, $ai_ns); # OpenWebUI service test - test_openwebui_service($instance, $ai_ns); + test_openwebui_service($instance, $ai_ns, $webip); record_info('End of AISTACK_BASIC'); }