Skip to content

Commit

Permalink
RHOAIENG-13625: Add DBAvailable status to CR (#304)
Browse files Browse the repository at this point in the history
* Add DBAvailable status to CR

* Remove probes
  • Loading branch information
ruivieira authored Sep 30, 2024
1 parent f366b6e commit c072d51
Show file tree
Hide file tree
Showing 10 changed files with 249 additions and 29 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,20 @@ through its `status` field. Below are the status types and reasons that are avai
| `PVCAvailable` | `PVCNotFound` | `PersistentVolumeClaim` not found. |
| `PVCAvailable` | `PVCFound` | `PersistentVolumeClaim` found. |

#### Database Status

| Status Type | Status Reason | Description |
|---------------|-------------------------|---------------------------------------------------|
| `DBAvailable` | `DBCredentialsNotFound` | Database credentials secret not found |
| `DBAvailable` | `DBCredentialsError` | Database credentials malformed (e.g. missing key) |
| `DBAvailable` | `DBConnectionError` | Service error connecting to the database |
| `DBAvailable` | `DBAvailable` | Successfully connected to the database |


#### Status Behavior

- If a PVC is not available, the `Ready` status of `TrustyAIService` will be set to `False`.
- If on database mode, any `DBAvailable` reason other than `DBAvailable` will set the `TrustyAIService` to `Not Ready`
- However, if `InferenceServices` are not found, the `Ready` status of `TrustyAIService` will not be affected, _i.e._, it is `Ready` by all other conditions, it will remain so.

## Contributing
Expand Down
2 changes: 1 addition & 1 deletion config/base/params.env
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest
trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest
oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0
kServeServerless=disabled
kServeServerless=disabled
2 changes: 1 addition & 1 deletion config/overlays/odh/params.env
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.19.0
trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.25.0
oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0
kServeServerless=enabled
kServeServerless=enabled
15 changes: 15 additions & 0 deletions controllers/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const (
StatusTypePVCAvailable = "PVCAvailable"
StatusTypeRouteAvailable = "RouteAvailable"
StatusTypeAvailable = "Available"
StatusTypeDBAvailable = "DBAvailable"
)

// Status reasons
Expand All @@ -58,6 +59,10 @@ const (
StatusReasonRouteFound = "RouteFound"
StatusAvailable = "AllComponentsReady"
StatusNotAvailable = "NotAllComponentsReady"
StatusDBCredentialsNotFound = "DBCredentialsNotFound"
StatusDBCredentialsError = "DBCredentialsError"
StatusDBConnectionError = "DBConnectionError"
StatusDBAvailable = "DBAvailable"
)

// Event reasons
Expand All @@ -67,4 +72,14 @@ const (
EventReasonServiceMonitorCreated = "ServiceMonitorCreated"
)

const (
StateReasonCrashLoopBackOff = "CrashLoopBackOff"
)

// Phases
const (
PhaseReady = "Ready"
PhaseNotReady = "Not Ready"
)

const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration"
63 changes: 63 additions & 0 deletions controllers/database.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package controllers

import (
"context"
"strings"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// checkDatabaseAccessible checks if the TrustyAI service pod failed with database issues.
func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
deployment := &appsv1.Deployment{}
err := r.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, deployment)
if err != nil {
if errors.IsNotFound(err) {
return false, nil
}
return false, err
}

for _, cond := range deployment.Status.Conditions {
if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue {
podList := &corev1.PodList{}
listOpts := []client.ListOption{
client.InNamespace(instance.Namespace),
client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
}
if err := r.List(ctx, podList, listOpts...); err != nil {
return false, err
}

for _, pod := range podList.Items {
for _, cs := range pod.Status.ContainerStatuses {
if cs.Name == "trustyai-service" {
if cs.State.Running != nil {
return true, nil
}

if cs.LastTerminationState.Terminated != nil {
termination := cs.LastTerminationState.Terminated
if termination.Reason == "Error" && termination.Message != "" {
if strings.Contains(termination.Message, "Socket fail to connect to host:address") {
return false, nil
}
}
}

if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff {
return false, nil
}
}
}
}
}
}

return false, nil
}
27 changes: 25 additions & 2 deletions controllers/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@ package controllers

import (
"context"
templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates"
"reflect"
"strconv"

templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)

Expand Down Expand Up @@ -75,7 +77,7 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context,
_, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace)
if err != nil {
deploymentConfig.UseDBTLSCerts = false
log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found")
log.FromContext(ctx).Info("Using insecure database connection. Certificates " + instance.Name + "-db-tls not found")
} else {
deploymentConfig.UseDBTLSCerts = true
log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls")
Expand Down Expand Up @@ -201,6 +203,7 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan
return nil
}

// checkDeploymentReady verifies that a TrustyAI service deployment is ready
func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
deployment := &appsv1.Deployment{}

Expand All @@ -215,6 +218,26 @@ func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, in
for _, cond := range deployment.Status.Conditions {
if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue {
if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas {
podList := &corev1.PodList{}
listOpts := []client.ListOption{
client.InNamespace(instance.Namespace),
client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
}
if err := r.List(ctx, podList, listOpts...); err != nil {
return false, err
}

for _, pod := range podList.Items {
for _, cs := range pod.Status.ContainerStatuses {
if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff {
return false, nil
}
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
return false, nil
}
}
}

return true, nil
}
}
Expand Down
59 changes: 44 additions & 15 deletions controllers/statuses.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ import (

// IsAllReady checks if all the necessary readiness fields are true for the specific mode
func (rs *AvailabilityStatus) IsAllReady(mode string) bool {
return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || (rs.DeploymentReady && rs.RouteReady && mode == STORAGE_DATABASE)
return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) ||
(rs.DeploymentReady && rs.RouteReady && rs.DBReady && mode == STORAGE_DATABASE)
}

// AvailabilityStatus has the readiness status of various resources.
Expand All @@ -22,6 +23,7 @@ type AvailabilityStatus struct {
DeploymentReady bool
RouteReady bool
InferenceServiceReady bool
DBReady bool
}

func (r *TrustyAIServiceReconciler) updateStatus(ctx context.Context, original *trustyaiopendatahubiov1alpha1.TrustyAIService, update func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService),
Expand Down Expand Up @@ -53,25 +55,17 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta
if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() {
// Check for PVC readiness
status.PVCReady, err = r.checkPVCReady(ctx, instance)
if err != nil || !status.PVCReady {
// PVC not ready, requeue
return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "PVC not ready")
}
}

// Check for deployment readiness
status.DeploymentReady, err = r.checkDeploymentReady(ctx, instance)
if err != nil || !status.DeploymentReady {
// Deployment not ready, requeue
return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Deployment not ready")

if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() {
status.DBReady, _ = r.checkDatabaseAccessible(ctx, instance)
}

// Check for route readiness
status.RouteReady, err = r.checkRouteReady(ctx, instance)
if err != nil || !status.RouteReady {
// Route not ready, requeue
return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Route not ready")
}

// Check if InferenceServices present
status.InferenceServiceReady, err = r.checkInferenceServicesPresent(ctx, instance.Namespace)
Expand All @@ -89,9 +83,15 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta
if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() {
UpdatePVCAvailable(saved)
}

UpdateRouteAvailable(saved)

if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() {
UpdateDBAvailable(saved)
}

UpdateTrustyAIServiceAvailable(saved)
saved.Status.Phase = "Ready"
saved.Status.Phase = PhaseReady
saved.Status.Ready = v1.ConditionTrue
})
if updateErr != nil {
Expand All @@ -114,13 +114,18 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta
}
}

if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() {
UpdateDBConnectionError(saved)
}

if status.RouteReady {
UpdateRouteAvailable(saved)
} else {
UpdateRouteNotAvailable(saved)
}

UpdateTrustyAIServiceNotAvailable(saved)
saved.Status.Phase = "Ready"
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
})
if updateErr != nil {
Expand All @@ -143,7 +148,7 @@ func UpdateInferenceServicePresent(saved *trustyaiopendatahubiov1alpha1.TrustyAI

func UpdatePVCNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypePVCAvailable, StatusReasonPVCNotFound, "PersistentVolumeClaim not found", v1.ConditionFalse)
saved.Status.Phase = "Not Ready"
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
}

Expand All @@ -165,4 +170,28 @@ func UpdateTrustyAIServiceAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyA

func UpdateTrustyAIServiceNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypeAvailable, StatusNotAvailable, "Not all components available", v1.ConditionFalse)
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
}

func UpdateDBCredentialsNotFound(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsNotFound, "Database credentials not found", v1.ConditionFalse)
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
}

func UpdateDBCredentialsError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsError, "Error with database credentials", v1.ConditionFalse)
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
}

func UpdateDBConnectionError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypeDBAvailable, StatusDBConnectionError, "Error connecting to database", v1.ConditionFalse)
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
}

func UpdateDBAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypeDBAvailable, StatusDBAvailable, "Database available", v1.ConditionTrue)
}
11 changes: 5 additions & 6 deletions controllers/statuses_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func setupAndTestStatusNoComponent(instance *trustyaiopendatahubiov1alpha1.Trust
// Call the reconcileStatuses function
_, _ = reconciler.reconcileStatuses(ctx, instance)

readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
if readyCondition != nil {
Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Ready condition should be true")
Expand Down Expand Up @@ -127,7 +127,7 @@ var _ = Describe("Status and condition tests", func() {
}, instance)
}, "failed to get updated instance")

readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
if readyCondition != nil {
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
Expand Down Expand Up @@ -191,7 +191,7 @@ var _ = Describe("Status and condition tests", func() {
}, instance)
}, "failed to get updated instance")

readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
if readyCondition != nil {
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
Expand Down Expand Up @@ -260,8 +260,7 @@ var _ = Describe("Status and condition tests", func() {
Namespace: instance.Namespace,
}, instance)
}, "failed to get updated instance")

readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
if readyCondition != nil {
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
Expand Down Expand Up @@ -344,7 +343,7 @@ var _ = Describe("Status and condition tests", func() {
}, instance)
}, "failed to get updated instance")

readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
if readyCondition != nil {
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
Expand Down
Loading

0 comments on commit c072d51

Please sign in to comment.