diff --git a/README.md b/README.md index aa49524..14f1a38 100644 --- a/README.md +++ b/README.md @@ -151,10 +151,20 @@ through its `status` field. Below are the status types and reasons that are avai | `PVCAvailable` | `PVCNotFound` | `PersistentVolumeClaim` not found. | | `PVCAvailable` | `PVCFound` | `PersistentVolumeClaim` found. | +#### Database Status + +| Status Type | Status Reason | Description | +|---------------|-------------------------|---------------------------------------------------| +| `DBAvailable` | `DBCredentialsNotFound` | Database credentials secret not found | +| `DBAvailable` | `DBCredentialsError` | Database credentials malformed (e.g. missing key) | +| `DBAvailable` | `DBConnectionError` | Service error connecting to the database | +| `DBAvailable` | `DBAvailable` | Successfully connected to the database | + #### Status Behavior - If a PVC is not available, the `Ready` status of `TrustyAIService` will be set to `False`. +- In database mode, any `DBAvailable` status reason other than `DBAvailable` will set the `TrustyAIService` to `Not Ready`. - However, if `InferenceServices` are not found, the `Ready` status of `TrustyAIService` will not be affected, _i.e._, it is `Ready` by all other conditions, it will remain so. 
## Contributing diff --git a/config/base/params.env b/config/base/params.env index a0f5419..d745ad2 100644 --- a/config/base/params.env +++ b/config/base/params.env @@ -1,4 +1,4 @@ trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 -kServeServerless=disabled \ No newline at end of file +kServeServerless=disabled diff --git a/config/overlays/odh/params.env b/config/overlays/odh/params.env index 908f07c..c67b2b5 100644 --- a/config/overlays/odh/params.env +++ b/config/overlays/odh/params.env @@ -1,4 +1,4 @@ trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.19.0 trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.25.0 oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 -kServeServerless=enabled \ No newline at end of file +kServeServerless=enabled diff --git a/controllers/constants.go b/controllers/constants.go index 2c7081b..18fdb4c 100644 --- a/controllers/constants.go +++ b/controllers/constants.go @@ -46,6 +46,7 @@ const ( StatusTypePVCAvailable = "PVCAvailable" StatusTypeRouteAvailable = "RouteAvailable" StatusTypeAvailable = "Available" + StatusTypeDBAvailable = "DBAvailable" ) // Status reasons @@ -58,6 +59,10 @@ const ( StatusReasonRouteFound = "RouteFound" StatusAvailable = "AllComponentsReady" StatusNotAvailable = "NotAllComponentsReady" + StatusDBCredentialsNotFound = "DBCredentialsNotFound" + StatusDBCredentialsError = "DBCredentialsError" + StatusDBConnectionError = "DBConnectionError" + StatusDBAvailable = "DBAvailable" ) // Event reasons @@ -67,4 +72,14 @@ const ( EventReasonServiceMonitorCreated = "ServiceMonitorCreated" ) +const ( + StateReasonCrashLoopBackOff = "CrashLoopBackOff" +) + +// Phases +const ( + PhaseReady = "Ready" + PhaseNotReady = "Not Ready" +) + const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration" diff --git a/controllers/database.go 
b/controllers/database.go
new file mode 100644
index 0000000..96679d0
--- /dev/null
+++ b/controllers/database.go
@@ -0,0 +1,76 @@
+package controllers
+
+import (
+	"context"
+	"strings"
+
+	trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// checkDatabaseAccessible reports whether a "trustyai-service" container of
+// the instance's deployment is currently running.
+//
+// The deployment is fetched by the instance's name and namespace. For every
+// Available=True condition it carries, the pods matching the deployment's
+// selector labels are listed and each "trustyai-service" container status is
+// inspected in order; the first decisive status ends the check:
+//   - running: (true, nil)
+//   - last terminated with reason "Error" and a message containing
+//     "Socket fail to connect to host:address": (false, nil)
+//   - waiting with reason CrashLoopBackOff: (false, nil)
+//
+// (false, nil) is also returned when the deployment is not found, has no
+// Available=True condition, or no status is decisive. Any other error from
+// fetching the deployment or listing the pods is returned unchanged.
+func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
+	key := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}
+	deployment := &appsv1.Deployment{}
+	if err := r.Get(ctx, key, deployment); err != nil {
+		if errors.IsNotFound(err) {
+			return false, nil
+		}
+		return false, err
+	}
+
+	for _, cond := range deployment.Status.Conditions {
+		if cond.Type != appsv1.DeploymentAvailable || cond.Status != corev1.ConditionTrue {
+			continue
+		}
+
+		pods := &corev1.PodList{}
+		selectors := []client.ListOption{
+			client.InNamespace(instance.Namespace),
+			client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
+		}
+		if err := r.List(ctx, pods, selectors...); err != nil {
+			return false, err
+		}
+
+		for _, pod := range pods.Items {
+			for _, cs := range pod.Status.ContainerStatuses {
+				if cs.Name != "trustyai-service" {
+					continue
+				}
+
+				terminated := cs.LastTerminationState.Terminated
+				waiting := cs.State.Waiting
+				switch {
+				case cs.State.Running != nil:
+					return true, nil
+				case terminated != nil && terminated.Reason == "Error" &&
+					strings.Contains(terminated.Message, "Socket fail to connect to host:address"):
+					return false, nil
+				case waiting != nil && waiting.Reason == StateReasonCrashLoopBackOff:
+					return false, nil
+				}
+			}
+		}
+	}
+
+	return false, nil
+}
diff --git a/controllers/deployment.go b/controllers/deployment.go index 81be18a..3b8d8f5 100644 --- a/controllers/deployment.go +++ b/controllers/deployment.go @@ -2,16 +2,18 @@ package controllers import ( "context" - templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" "reflect" "strconv" + templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -75,7 +77,7 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context, _, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace) if err != nil { deploymentConfig.UseDBTLSCerts = false - log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found") + log.FromContext(ctx).Info("Using insecure database connection. 
Certificates " + instance.Name + "-db-tls not found") } else { deploymentConfig.UseDBTLSCerts = true log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls") @@ -201,6 +203,7 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan return nil } +// checkDeploymentReady verifies that a TrustyAI service deployment is ready func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) { deployment := &appsv1.Deployment{} @@ -215,6 +218,26 @@ func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, in for _, cond := range deployment.Status.Conditions { if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue { if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas { + podList := &corev1.PodList{} + listOpts := []client.ListOption{ + client.InNamespace(instance.Namespace), + client.MatchingLabels(deployment.Spec.Selector.MatchLabels), + } + if err := r.List(ctx, podList, listOpts...); err != nil { + return false, err + } + + for _, pod := range podList.Items { + for _, cs := range pod.Status.ContainerStatuses { + if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff { + return false, nil + } + if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 { + return false, nil + } + } + } + return true, nil } } diff --git a/controllers/statuses.go b/controllers/statuses.go index b88ba47..9236e81 100644 --- a/controllers/statuses.go +++ b/controllers/statuses.go @@ -13,7 +13,8 @@ import ( // IsAllReady checks if all the necessary readiness fields are true for the specific mode func (rs *AvailabilityStatus) IsAllReady(mode string) bool { - return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || (rs.DeploymentReady && rs.RouteReady && mode == STORAGE_DATABASE) + return (rs.PVCReady && 
rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || + (rs.DeploymentReady && rs.RouteReady && rs.DBReady && mode == STORAGE_DATABASE) } // AvailabilityStatus has the readiness status of various resources. @@ -22,6 +23,7 @@ type AvailabilityStatus struct { DeploymentReady bool RouteReady bool InferenceServiceReady bool + DBReady bool } func (r *TrustyAIServiceReconciler) updateStatus(ctx context.Context, original *trustyaiopendatahubiov1alpha1.TrustyAIService, update func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService), @@ -53,25 +55,17 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { // Check for PVC readiness status.PVCReady, err = r.checkPVCReady(ctx, instance) - if err != nil || !status.PVCReady { - // PVC not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "PVC not ready") - } } // Check for deployment readiness status.DeploymentReady, err = r.checkDeploymentReady(ctx, instance) - if err != nil || !status.DeploymentReady { - // Deployment not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Deployment not ready") + + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + status.DBReady, _ = r.checkDatabaseAccessible(ctx, instance) } // Check for route readiness status.RouteReady, err = r.checkRouteReady(ctx, instance) - if err != nil || !status.RouteReady { - // Route not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Route not ready") - } // Check if InferenceServices present status.InferenceServiceReady, err = r.checkInferenceServicesPresent(ctx, instance.Namespace) @@ -89,9 +83,15 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { UpdatePVCAvailable(saved) } + UpdateRouteAvailable(saved) + + if instance.Spec.Storage.IsStorageDatabase() || 
instance.IsMigration() { + UpdateDBAvailable(saved) + } + UpdateTrustyAIServiceAvailable(saved) - saved.Status.Phase = "Ready" + saved.Status.Phase = PhaseReady saved.Status.Ready = v1.ConditionTrue }) if updateErr != nil { @@ -114,13 +114,23 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta } } + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + // Report the probed database state instead of assuming a connection error. + if status.DBReady { + UpdateDBAvailable(saved) + } else { + UpdateDBConnectionError(saved) + } + } + if status.RouteReady { UpdateRouteAvailable(saved) } else { UpdateRouteNotAvailable(saved) } + UpdateTrustyAIServiceNotAvailable(saved) - saved.Status.Phase = "Ready" + saved.Status.Phase = PhaseNotReady saved.Status.Ready = v1.ConditionFalse }) if updateErr != nil { @@ -143,7 +153,7 @@ func UpdateInferenceServicePresent(saved *trustyaiopendatahubiov1alpha1.TrustyAI func UpdatePVCNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { saved.SetStatus(StatusTypePVCAvailable, StatusReasonPVCNotFound, "PersistentVolumeClaim not found", v1.ConditionFalse) - saved.Status.Phase = "Not Ready" + saved.Status.Phase = PhaseNotReady saved.Status.Ready = v1.ConditionFalse } @@ -165,4 +175,28 @@ func UpdateTrustyAIServiceAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyA func UpdateTrustyAIServiceNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { saved.SetStatus(StatusTypeAvailable, StatusNotAvailable, "Not all components available", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBCredentialsNotFound(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsNotFound, "Database credentials not found", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBCredentialsError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsError, "Error with database 
credentials", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBConnectionError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBConnectionError, "Error connecting to database", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBAvailable, "Database available", v1.ConditionTrue) } diff --git a/controllers/statuses_test.go b/controllers/statuses_test.go index 9e395df..f6ad7fe 100644 --- a/controllers/statuses_test.go +++ b/controllers/statuses_test.go @@ -35,7 +35,7 @@ func setupAndTestStatusNoComponent(instance *trustyaiopendatahubiov1alpha1.Trust // Call the reconcileStatuses function _, _ = reconciler.reconcileStatuses(ctx, instance) - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Ready condition should be true") @@ -127,7 +127,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -191,7 +191,7 @@ var _ = Describe("Status and condition tests", func() { }, 
instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -260,8 +260,7 @@ var _ = Describe("Status and condition tests", func() { Namespace: instance.Namespace, }, instance) }, "failed to get updated instance") - - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -344,7 +343,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") diff --git a/controllers/suite_test.go b/controllers/suite_test.go index e61f4ac..a2f16ba 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -20,11 +20,12 @@ import ( "context" "encoding/json" "fmt" - rbacv1 "k8s.io/api/rbac/v1" "path/filepath" "testing" "time" + rbacv1 "k8s.io/api/rbac/v1" + "github.com/google/uuid" kservev1alpha1 
"github.com/kserve/kserve/pkg/apis/serving/v1alpha1" kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1" @@ -358,14 +359,75 @@ func makeDeploymentReady(ctx context.Context, k8sClient client.Client, instance Reason: "DeploymentReady", Message: "The deployment is ready", }, + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "NewReplicaSetAvailable", + Message: "ReplicaSet is progressing", + }, } if deployment.Spec.Replicas != nil { - deployment.Status.ReadyReplicas = 1 - deployment.Status.Replicas = 1 + deployment.Status.ReadyReplicas = *deployment.Spec.Replicas + deployment.Status.Replicas = *deployment.Spec.Replicas + deployment.Status.AvailableReplicas = *deployment.Spec.Replicas } - return k8sClient.Update(ctx, deployment) + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name + "-pod", + Namespace: instance.Namespace, + Labels: deployment.Spec.Selector.MatchLabels, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "trustyai-service", + Image: "quay.io/trustyai/trustyai-service:latest", + Ports: []corev1.ContainerPort{ + { + ContainerPort: 8080, + }, + }, + }, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }, + { + Type: corev1.ContainersReady, + Status: corev1.ConditionTrue, + }, + }, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "trustyai-service", + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{ + StartedAt: metav1.Now(), + }, + }, + Ready: true, + RestartCount: 0, + }, + }, + }, + } + + if err := k8sClient.Create(ctx, pod); err != nil { + return err + } + + if err := k8sClient.Status().Update(ctx, deployment); err != nil { + return err + } + + return nil } func makeRouteReady(ctx context.Context, k8sClient client.Client, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { diff --git 
a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go
index d37fb26..edb60f2 100644
--- a/controllers/trustyaiservice_controller.go
+++ b/controllers/trustyaiservice_controller.go
@@ -25,6 +25,7 @@ import (
 	kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1"
 	trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
 	appsv1 "k8s.io/api/apps/v1"
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
@@ -156,10 +157,30 @@ func (r *TrustyAIServiceReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 		// Get database configuration
 		secret, err := r.findDatabaseSecret(ctx, instance)
 		if err != nil {
+			// Record the lookup failure on the status; if that write fails, report its own error.
+			_, updateErr := r.updateStatus(ctx, instance, func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
+				UpdateDBCredentialsNotFound(saved)
+				UpdateTrustyAIServiceNotAvailable(saved)
+				saved.Status.Phase = PhaseNotReady
+				saved.Status.Ready = v1.ConditionFalse
+			})
+			if updateErr != nil {
+				return RequeueWithErrorMessage(ctx, updateErr, "Failed to update status")
+			}
 			return RequeueWithErrorMessage(ctx, err, "Service configured to use database storage but no database configuration found.")
 		}
 		err = r.validateDatabaseSecret(secret)
 		if err != nil {
+			// Record the validation failure on the status; if that write fails, report its own error.
+			_, updateErr := r.updateStatus(ctx, instance, func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
+				UpdateDBCredentialsError(saved)
+				UpdateTrustyAIServiceNotAvailable(saved)
+				saved.Status.Phase = PhaseNotReady
+				saved.Status.Ready = v1.ConditionFalse
+			})
+			if updateErr != nil {
+				return RequeueWithErrorMessage(ctx, updateErr, "Failed to update status")
+			}
 			return RequeueWithErrorMessage(ctx, err, "Database configuration contains errors.")
 		}
 	}