Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RHOAIENG-13625: Add DBAvailable status to CR #304

Merged
merged 2 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,20 @@ through its `status` field. Below are the status types and reasons that are avai
| `PVCAvailable` | `PVCNotFound` | `PersistentVolumeClaim` not found. |
| `PVCAvailable` | `PVCFound` | `PersistentVolumeClaim` found. |

#### Database Status

| Status Type | Status Reason | Description |
|---------------|-------------------------|---------------------------------------------------|
| `DBAvailable` | `DBCredentialsNotFound` | Database credentials secret not found |
| `DBAvailable` | `DBCredentialsError` | Database credentials malformed (e.g. missing key) |
| `DBAvailable` | `DBConnectionError` | Service error connecting to the database |
| `DBAvailable` | `DBAvailable` | Successfully connected to the database |


#### Status Behavior

- If a PVC is not available, the `Ready` status of `TrustyAIService` will be set to `False`.
- If on database mode, any `DBAvailable` reason other than `DBAvailable` will set the `TrustyAIService` to `Not Ready`
- However, if `InferenceServices` are not found, the `Ready` status of `TrustyAIService` will not be affected, _i.e._, it is `Ready` by all other conditions, it will remain so.

## Contributing
Expand Down
2 changes: 1 addition & 1 deletion config/base/params.env
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest
trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest
oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0
kServeServerless=disabled
kServeServerless=disabled
2 changes: 1 addition & 1 deletion config/overlays/odh/params.env
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.19.0
trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.25.0
oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0
kServeServerless=enabled
kServeServerless=enabled
15 changes: 15 additions & 0 deletions controllers/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const (
StatusTypePVCAvailable = "PVCAvailable"
StatusTypeRouteAvailable = "RouteAvailable"
StatusTypeAvailable = "Available"
StatusTypeDBAvailable = "DBAvailable"
)

// Status reasons
Expand All @@ -58,6 +59,10 @@ const (
StatusReasonRouteFound = "RouteFound"
StatusAvailable = "AllComponentsReady"
StatusNotAvailable = "NotAllComponentsReady"
StatusDBCredentialsNotFound = "DBCredentialsNotFound"
StatusDBCredentialsError = "DBCredentialsError"
StatusDBConnectionError = "DBConnectionError"
StatusDBAvailable = "DBAvailable"
)

// Event reasons
Expand All @@ -67,4 +72,14 @@ const (
EventReasonServiceMonitorCreated = "ServiceMonitorCreated"
)

const (
StateReasonCrashLoopBackOff = "CrashLoopBackOff"
)

// Phases
const (
PhaseReady = "Ready"
PhaseNotReady = "Not Ready"
)

const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration"
63 changes: 63 additions & 0 deletions controllers/database.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package controllers

import (
"context"
"strings"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// checkDatabaseAccessible checks if the TrustyAI service pod failed with database issues.
func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
deployment := &appsv1.Deployment{}
err := r.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, deployment)
if err != nil {
if errors.IsNotFound(err) {
return false, nil
}
return false, err
}

for _, cond := range deployment.Status.Conditions {
if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue {
podList := &corev1.PodList{}
listOpts := []client.ListOption{
client.InNamespace(instance.Namespace),
client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
}
if err := r.List(ctx, podList, listOpts...); err != nil {
return false, err
}

for _, pod := range podList.Items {
for _, cs := range pod.Status.ContainerStatuses {
if cs.Name == "trustyai-service" {
if cs.State.Running != nil {
return true, nil
}

if cs.LastTerminationState.Terminated != nil {
termination := cs.LastTerminationState.Terminated
if termination.Reason == "Error" && termination.Message != "" {
if strings.Contains(termination.Message, "Socket fail to connect to host:address") {
return false, nil
}
}
}

if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff {
return false, nil
}
}
}
}
}
}

return false, nil
}
27 changes: 25 additions & 2 deletions controllers/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@ package controllers

import (
"context"
templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates"
"reflect"
"strconv"

templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)

Expand Down Expand Up @@ -75,7 +77,7 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context,
_, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace)
if err != nil {
deploymentConfig.UseDBTLSCerts = false
log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found")
log.FromContext(ctx).Info("Using insecure database connection. Certificates " + instance.Name + "-db-tls not found")
} else {
deploymentConfig.UseDBTLSCerts = true
log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls")
Expand Down Expand Up @@ -201,6 +203,7 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan
return nil
}

// checkDeploymentReady verifies that a TrustyAI service deployment is ready
func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
deployment := &appsv1.Deployment{}

Expand All @@ -215,6 +218,26 @@ func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, in
for _, cond := range deployment.Status.Conditions {
if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue {
if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas {
podList := &corev1.PodList{}
listOpts := []client.ListOption{
client.InNamespace(instance.Namespace),
client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
}
if err := r.List(ctx, podList, listOpts...); err != nil {
return false, err
}

for _, pod := range podList.Items {
for _, cs := range pod.Status.ContainerStatuses {
if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff {
return false, nil
}
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
return false, nil
}
}
}

return true, nil
}
}
Expand Down
59 changes: 44 additions & 15 deletions controllers/statuses.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ import (

// IsAllReady checks if all the necessary readiness fields are true for the specific mode
func (rs *AvailabilityStatus) IsAllReady(mode string) bool {
return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || (rs.DeploymentReady && rs.RouteReady && mode == STORAGE_DATABASE)
return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) ||
(rs.DeploymentReady && rs.RouteReady && rs.DBReady && mode == STORAGE_DATABASE)
}

// AvailabilityStatus has the readiness status of various resources.
Expand All @@ -22,6 +23,7 @@ type AvailabilityStatus struct {
DeploymentReady bool
RouteReady bool
InferenceServiceReady bool
DBReady bool
}

func (r *TrustyAIServiceReconciler) updateStatus(ctx context.Context, original *trustyaiopendatahubiov1alpha1.TrustyAIService, update func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService),
Expand Down Expand Up @@ -53,25 +55,17 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta
if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() {
// Check for PVC readiness
status.PVCReady, err = r.checkPVCReady(ctx, instance)
if err != nil || !status.PVCReady {
// PVC not ready, requeue
return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "PVC not ready")
}
}

// Check for deployment readiness
status.DeploymentReady, err = r.checkDeploymentReady(ctx, instance)
if err != nil || !status.DeploymentReady {
// Deployment not ready, requeue
return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Deployment not ready")

if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() {
status.DBReady, _ = r.checkDatabaseAccessible(ctx, instance)
}

// Check for route readiness
status.RouteReady, err = r.checkRouteReady(ctx, instance)
if err != nil || !status.RouteReady {
// Route not ready, requeue
return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Route not ready")
}

// Check if InferenceServices present
status.InferenceServiceReady, err = r.checkInferenceServicesPresent(ctx, instance.Namespace)
Expand All @@ -89,9 +83,15 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta
if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() {
UpdatePVCAvailable(saved)
}

UpdateRouteAvailable(saved)

if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() {
UpdateDBAvailable(saved)
}

UpdateTrustyAIServiceAvailable(saved)
saved.Status.Phase = "Ready"
saved.Status.Phase = PhaseReady
saved.Status.Ready = v1.ConditionTrue
})
if updateErr != nil {
Expand All @@ -114,13 +114,18 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta
}
}

if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() {
UpdateDBConnectionError(saved)
}

if status.RouteReady {
UpdateRouteAvailable(saved)
} else {
UpdateRouteNotAvailable(saved)
}

UpdateTrustyAIServiceNotAvailable(saved)
saved.Status.Phase = "Ready"
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
})
if updateErr != nil {
Expand All @@ -143,7 +148,7 @@ func UpdateInferenceServicePresent(saved *trustyaiopendatahubiov1alpha1.TrustyAI

func UpdatePVCNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypePVCAvailable, StatusReasonPVCNotFound, "PersistentVolumeClaim not found", v1.ConditionFalse)
saved.Status.Phase = "Not Ready"
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
}

Expand All @@ -165,4 +170,28 @@ func UpdateTrustyAIServiceAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyA

func UpdateTrustyAIServiceNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypeAvailable, StatusNotAvailable, "Not all components available", v1.ConditionFalse)
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
}

func UpdateDBCredentialsNotFound(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsNotFound, "Database credentials not found", v1.ConditionFalse)
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
}

func UpdateDBCredentialsError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsError, "Error with database credentials", v1.ConditionFalse)
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
}

func UpdateDBConnectionError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypeDBAvailable, StatusDBConnectionError, "Error connecting to database", v1.ConditionFalse)
saved.Status.Phase = PhaseNotReady
saved.Status.Ready = v1.ConditionFalse
}

func UpdateDBAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
saved.SetStatus(StatusTypeDBAvailable, StatusDBAvailable, "Database available", v1.ConditionTrue)
}
11 changes: 5 additions & 6 deletions controllers/statuses_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func setupAndTestStatusNoComponent(instance *trustyaiopendatahubiov1alpha1.Trust
// Call the reconcileStatuses function
_, _ = reconciler.reconcileStatuses(ctx, instance)

readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
if readyCondition != nil {
Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Ready condition should be true")
Expand Down Expand Up @@ -127,7 +127,7 @@ var _ = Describe("Status and condition tests", func() {
}, instance)
}, "failed to get updated instance")

readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
if readyCondition != nil {
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
Expand Down Expand Up @@ -191,7 +191,7 @@ var _ = Describe("Status and condition tests", func() {
}, instance)
}, "failed to get updated instance")

readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
if readyCondition != nil {
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
Expand Down Expand Up @@ -260,8 +260,7 @@ var _ = Describe("Status and condition tests", func() {
Namespace: instance.Namespace,
}, instance)
}, "failed to get updated instance")

readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
if readyCondition != nil {
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
Expand Down Expand Up @@ -344,7 +343,7 @@ var _ = Describe("Status and condition tests", func() {
}, instance)
}, "failed to get updated instance")

readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
if readyCondition != nil {
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
Expand Down
Loading
Loading