From 4c8550ed13a65be7c90228fbfceb31f4bf32fe89 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Fri, 12 Jul 2024 21:47:45 +0100 Subject: [PATCH 01/24] feat: Initial database support (#246) * Initial database support - Add status checking - Add better storage flags - Add spec.storage.format validation - Add DDL -Add HIBERNATE format to DB (test) - Update service image - Revert identifier to DATABASE - Update CR options (remove mandatory data) * Remove default DDL generation env var * Update service image to latest tag * Add migration awareness * Add updating pods for migration * Change JDBC url from mysql to mariadb * Fix TLS mount * Revert images * Remove redundant logic * Fix comments --- api/v1alpha1/trustyaiservice_types.go | 39 +- ...styai.opendatahub.io_trustyaiservices.yaml | 12 +- controllers/constants.go | 11 +- controllers/deployment.go | 100 +- controllers/deployment_test.go | 877 ++++++++++++------ controllers/monitor_test.go | 2 +- controllers/route.go | 1 + controllers/route_test.go | 102 +- controllers/secrets.go | 60 ++ controllers/service_accounts_test.go | 4 +- controllers/statuses.go | 44 +- controllers/statuses_test.go | 185 +++- controllers/storage_test.go | 16 +- controllers/suite_test.go | 87 +- .../templates/service/deployment.tmpl.yaml | 73 +- controllers/trustyaiservice_controller.go | 63 +- 16 files changed, 1270 insertions(+), 406 deletions(-) create mode 100644 controllers/secrets.go diff --git a/api/v1alpha1/trustyaiservice_types.go b/api/v1alpha1/trustyaiservice_types.go index 59f2bb7..7ac65ae 100644 --- a/api/v1alpha1/trustyaiservice_types.go +++ b/api/v1alpha1/trustyaiservice_types.go @@ -34,14 +34,17 @@ type TrustyAIService struct { } type StorageSpec struct { - Format string `json:"format"` - Folder string `json:"folder"` - Size string `json:"size"` + // Format only supports "PVC" or "DATABASE" values + // +kubebuilder:validation:Enum=PVC;DATABASE + Format string `json:"format"` + Folder string `json:"folder,omitempty"` + Size string `json:"size,omitempty"` + DatabaseConfigurations string `json:"databaseConfigurations,omitempty"` } type DataSpec struct { - Filename string `json:"filename"` - Format string `json:"format"` + Filename string `json:"filename,omitempty"` + Format string `json:"format,omitempty"` } type MetricsSpec struct { @@ -55,7 +58,7 @@ type TrustyAIServiceSpec struct { // +optional Replicas *int32 `json:"replicas"` Storage StorageSpec `json:"storage"` - Data DataSpec `json:"data"` + Data DataSpec `json:"data,omitempty"` Metrics MetricsSpec `json:"metrics"` } @@ -90,6 +93,30 @@ func init() { SchemeBuilder.Register(&TrustyAIService{}, &TrustyAIServiceList{}) } +// IsDatabaseConfigurationsSet returns true if the DatabaseConfigurations field is set. +func (s *StorageSpec) IsDatabaseConfigurationsSet() bool { + return s.DatabaseConfigurations != "" +} + +// IsStoragePVC returns true if the storage is set to PVC. +func (s *StorageSpec) IsStoragePVC() bool { + return s.Format == "PVC" +} + +// IsStorageDatabase returns true if the storage is set to database. +func (s *StorageSpec) IsStorageDatabase() bool { + return s.Format == "DATABASE" +} + +// IsMigration returns true if the migration fields are set. +func (t *TrustyAIService) IsMigration() bool { + if t.Spec.Storage.Format == "DATABASE" && t.Spec.Storage.Folder != "" && t.Spec.Data.Filename != "" { + return true + } else { + return false + } +} + // SetStatus sets the status of the TrustyAIService func (t *TrustyAIService) SetStatus(condType, reason, message string, status corev1.ConditionStatus) { now := metav1.Now() diff --git a/config/crd/bases/trustyai.opendatahub.io_trustyaiservices.yaml b/config/crd/bases/trustyai.opendatahub.io_trustyaiservices.yaml index 5692159..076a808 100644 --- a/config/crd/bases/trustyai.opendatahub.io_trustyaiservices.yaml +++ b/config/crd/bases/trustyai.opendatahub.io_trustyaiservices.yaml @@ -41,9 +41,6 @@ spec: type: string format: type: string - required: - - filename - - format type: object metrics: properties: @@ -60,19 +57,22 @@ spec: type: integer storage: properties: + databaseConfigurations: + type: string folder: type: string format: + description: Format only supports "PVC" or "DATABASE" values + enum: + - PVC + - DATABASE type: string size: type: string required: - - folder - format - - size type: object required: - - data - metrics - storage type: object diff --git a/controllers/constants.go b/controllers/constants.go index adb130f..f55da3a 100644 --- a/controllers/constants.go +++ b/controllers/constants.go @@ -12,7 +12,14 @@ const ( modelMeshLabelKey = "modelmesh-service" modelMeshLabelValue = "modelmesh-serving" volumeMountName = "volume" - defaultRequeueDelay = 10 * time.Second + defaultRequeueDelay = 30 * time.Second + dbCredentialsSuffix = "-db-credentials" +) + +// Allowed storage formats +const ( + STORAGE_PVC = "PVC" + STORAGE_DATABASE = "DATABASE" ) // Configuration constants @@ -56,3 +63,5 @@ const ( EventReasonInferenceServiceConfigured = "InferenceServiceConfigured" EventReasonServiceMonitorCreated = "ServiceMonitorCreated" ) + +const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration" diff --git a/controllers/deployment.go b/controllers/deployment.go index d498a2a..816c9c1 100644 --- a/controllers/deployment.go +++ b/controllers/deployment.go @@ -37,13 +37,14 @@ type DeploymentConfig struct { PVCClaimName string CustomCertificatesBundle CustomCertificatesBundle Version string + BatchSize int } // createDeploymentObject returns a Deployment for the TrustyAI Service instance func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService, serviceImage string, caBunble CustomCertificatesBundle) (*appsv1.Deployment, error) { var batchSize int - // If not batch size is provided, assume the default one + // If no batch size is provided, assume the default one if instance.Spec.Metrics.BatchSize == nil { batchSize = defaultBatchSize } else { @@ -66,6 +67,7 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context, PVCClaimName: pvcName, CustomCertificatesBundle: caBunble, Version: Version, + BatchSize: batchSize, } var deployment *appsv1.Deployment @@ -81,42 +83,78 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context, // reconcileDeployment returns a Deployment object with the same name/namespace as the cr func (r *TrustyAIServiceReconciler) createDeployment(ctx context.Context, cr *trustyaiopendatahubiov1alpha1.TrustyAIService, imageName string, caBundle CustomCertificatesBundle) error { - pvcName := generatePVCName(cr) + if !cr.Spec.Storage.IsDatabaseConfigurationsSet() { - pvc := &corev1.PersistentVolumeClaim{} - pvcerr := r.Get(ctx, types.NamespacedName{Name: pvcName, Namespace: cr.Namespace}, pvc) - if pvcerr != nil { - log.FromContext(ctx).Error(pvcerr, "PVC not found") - return pvcerr - } - if pvcerr == nil { - // The PVC is ready. We can now create the Deployment. - deployment, err := r.createDeploymentObject(ctx, cr, imageName, caBundle) - if err != nil { - // Error creating the deployment resource object - return err - } + pvcName := generatePVCName(cr) - if err := ctrl.SetControllerReference(cr, deployment, r.Scheme); err != nil { - log.FromContext(ctx).Error(err, "Error setting TrustyAIService as owner of Deployment.") - return err + pvc := &corev1.PersistentVolumeClaim{} + pvcerr := r.Get(ctx, types.NamespacedName{Name: pvcName, Namespace: cr.Namespace}, pvc) + if pvcerr != nil { + log.FromContext(ctx).Error(pvcerr, "PVC not found") + return pvcerr } - log.FromContext(ctx).Info("Creating Deployment.") - err = r.Create(ctx, deployment) - if err != nil { - log.FromContext(ctx).Error(err, "Error creating Deployment.") - return err + } + + // We can now create the Deployment. + deployment, err := r.createDeploymentObject(ctx, cr, imageName, caBundle) + if err != nil { + // Error creating the deployment resource object + return err + } + + if err := ctrl.SetControllerReference(cr, deployment, r.Scheme); err != nil { + log.FromContext(ctx).Error(err, "Error setting TrustyAIService as owner of Deployment.") + return err + } + log.FromContext(ctx).Info("Creating Deployment.") + err = r.Create(ctx, deployment) + if err != nil { + log.FromContext(ctx).Error(err, "Error creating Deployment.") + return err + } + // Created successfully + return nil + +} + +// updateDeployment returns a Deployment object with the same name/namespace as the cr +func (r *TrustyAIServiceReconciler) updateDeployment(ctx context.Context, cr *trustyaiopendatahubiov1alpha1.TrustyAIService, imageName string, caBundle CustomCertificatesBundle) error { + + if !cr.Spec.Storage.IsDatabaseConfigurationsSet() { + + pvcName := generatePVCName(cr) + + pvc := &corev1.PersistentVolumeClaim{} + pvcerr := r.Get(ctx, types.NamespacedName{Name: pvcName, Namespace: cr.Namespace}, pvc) + if pvcerr != nil { + log.FromContext(ctx).Error(pvcerr, "PVC not found") + return pvcerr } - // Created successfully - return nil + } - } else { - return ErrPVCNotReady + // We can now create the Deployment object. + deployment, err := r.createDeploymentObject(ctx, cr, imageName, caBundle) + if err != nil { + // Error creating the deployment resource object + return err + } + + if err := ctrl.SetControllerReference(cr, deployment, r.Scheme); err != nil { + log.FromContext(ctx).Error(err, "Error setting TrustyAIService as owner of Deployment.") + return err + } + log.FromContext(ctx).Info("Updating Deployment.") + err = r.Update(ctx, deployment) + if err != nil { + log.FromContext(ctx).Error(err, "Error updating Deployment.") + return err } + // Created successfully + return nil } -func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService, caBundle CustomCertificatesBundle) error { +func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService, caBundle CustomCertificatesBundle, migration bool) error { // Get image and tag from ConfigMap // If there's a ConfigMap with custom images, it is only applied when the operator is first deployed @@ -138,6 +176,12 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan // Some other error occurred when trying to get the Deployment return err } + // Deployment exists, but we are migrating + if migration { + log.FromContext(ctx).Info("Found migration annotation. Migrating.") + return r.updateDeployment(ctx, instance, image, caBundle) + } + // Deployment is ready and using the PVC return nil } diff --git a/controllers/deployment_test.go b/controllers/deployment_test.go index b513ede..de80a1c 100644 --- a/controllers/deployment_test.go +++ b/controllers/deployment_test.go @@ -27,6 +27,297 @@ func printKubeObject(obj interface{}) { } } +func setupAndTestDeploymentDefault(instance *trustyaiopendatahubiov1alpha1.TrustyAIService, namespace string) { + Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) + caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) + + Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) + Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) + Expect(reconciler.ensureDeployment(ctx, instance, caBundle, false)).To(Succeed()) + + deployment := &appsv1.Deployment{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, deployment) + Expect(err).ToNot(HaveOccurred()) + Expect(deployment).ToNot(BeNil()) + + Expect(*deployment.Spec.Replicas).Should(Equal(int32(1))) + Expect(deployment.Namespace).Should(Equal(namespace)) + Expect(deployment.Name).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app"]).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app.kubernetes.io/name"]).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app.kubernetes.io/instance"]).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) + Expect(deployment.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) + + Expect(len(deployment.Spec.Template.Spec.Containers)).Should(Equal(2)) + Expect(deployment.Spec.Template.Spec.Containers[0].Image).Should(Equal("quay.io/trustyai/trustyai-service:latest")) + Expect(deployment.Spec.Template.Spec.Containers[1].Image).Should(Equal("registry.redhat.io/openshift4/ose-oauth-proxy:latest")) + + WaitFor(func() error { + service, _ := reconciler.reconcileService(ctx, instance) + return reconciler.Create(ctx, service) + }, "failed to create service") + + service := &corev1.Service{} + WaitFor(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: defaultServiceName, Namespace: namespace}, service) + }, "failed to get Service") + + Expect(service.Annotations["prometheus.io/path"]).Should(Equal("/q/metrics")) + Expect(service.Annotations["prometheus.io/scheme"]).Should(Equal("http")) + Expect(service.Annotations["prometheus.io/scrape"]).Should(Equal("true")) + Expect(service.Namespace).Should(Equal(namespace)) + + WaitFor(func() error { + err := reconciler.reconcileOAuthService(ctx, instance, caBundle) + return err + }, "failed to create oauth service") + + desiredOAuthService, err := generateTrustyAIOAuthService(ctx, instance, caBundle) + Expect(err).ToNot(HaveOccurred()) + + oauthService := &corev1.Service{} + WaitFor(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: desiredOAuthService.Name, Namespace: namespace}, oauthService) + }, "failed to get OAuth Service") + + // Check if the OAuth service has the expected labels + Expect(oauthService.Labels["app"]).Should(Equal(instance.Name)) + Expect(oauthService.Labels["app.kubernetes.io/instance"]).Should(Equal(instance.Name)) + Expect(oauthService.Labels["app.kubernetes.io/name"]).Should(Equal(instance.Name)) + Expect(oauthService.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) + Expect(oauthService.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) + Expect(oauthService.Labels["trustyai-service-name"]).Should(Equal(instance.Name)) + +} + +func setupAndTestDeploymentConfigMap(instance *trustyaiopendatahubiov1alpha1.TrustyAIService, namespace string) { + serviceImage := "custom-service-image:foo" + oauthImage := "custom-oauth-proxy:bar" + Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) + + WaitFor(func() error { + configMap := createConfigMap(operatorNamespace, oauthImage, serviceImage) + return k8sClient.Create(ctx, configMap) + }, "failed to create ConfigMap") + + caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) + + Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) + Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) + WaitFor(func() error { + return reconciler.ensureDeployment(ctx, instance, caBundle, false) + }, "failed to reconcile deployment") + + deployment := &appsv1.Deployment{} + WaitFor(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: namespace}, deployment) + }, "failed to get updated deployment") + Expect(deployment).ToNot(BeNil()) + + Expect(*deployment.Spec.Replicas).Should(Equal(int32(1))) + Expect(deployment.Namespace).Should(Equal(namespace)) + Expect(deployment.Name).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app"]).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app.kubernetes.io/name"]).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app.kubernetes.io/instance"]).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) + Expect(deployment.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) + + Expect(len(deployment.Spec.Template.Spec.Containers)).Should(Equal(2)) + Expect(deployment.Spec.Template.Spec.Containers[0].Image).Should(Equal(serviceImage)) + Expect(deployment.Spec.Template.Spec.Containers[1].Image).Should(Equal(oauthImage)) + + WaitFor(func() error { + service, _ := reconciler.reconcileService(ctx, instance) + return reconciler.Create(ctx, service) + }, "failed to create service") + + service := &corev1.Service{} + WaitFor(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: defaultServiceName, Namespace: namespace}, service) + }, "failed to get Service") + + Expect(service.Annotations["prometheus.io/path"]).Should(Equal("/q/metrics")) + Expect(service.Annotations["prometheus.io/scheme"]).Should(Equal("http")) + Expect(service.Annotations["prometheus.io/scrape"]).Should(Equal("true")) + Expect(service.Namespace).Should(Equal(namespace)) + + WaitFor(func() error { + err := reconciler.reconcileOAuthService(ctx, instance, caBundle) + return err + }, "failed to create oauth service") + + desiredOAuthService, err := generateTrustyAIOAuthService(ctx, instance, caBundle) + Expect(err).ToNot(HaveOccurred()) + + oauthService := &corev1.Service{} + WaitFor(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: desiredOAuthService.Name, Namespace: namespace}, oauthService) + }, "failed to get OAuth Service") + + // Check if the OAuth service has the expected labels + Expect(oauthService.Labels["app"]).Should(Equal(instance.Name)) + Expect(oauthService.Labels["app.kubernetes.io/instance"]).Should(Equal(instance.Name)) + Expect(oauthService.Labels["app.kubernetes.io/name"]).Should(Equal(instance.Name)) + Expect(oauthService.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) + Expect(oauthService.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) + Expect(oauthService.Labels["trustyai-service-name"]).Should(Equal(instance.Name)) + +} + +func setupAndTestDeploymentNoCustomCABundle(instance *trustyaiopendatahubiov1alpha1.TrustyAIService, namespace string) { + Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) + + caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) + + Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) + Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) + WaitFor(func() error { + return reconciler.ensureDeployment(ctx, instance, caBundle, false) + }, "failed to create deployment") + + deployment := &appsv1.Deployment{} + namespacedName := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace} + Expect(k8sClient.Get(ctx, namespacedName, deployment)).Should(Succeed()) + + Expect(deployment.Spec.Template.Spec.ServiceAccountName).To(Equal(instance.Name + "-proxy")) + + customCertificatesBundleVolumeName := caBundle + for _, volume := range deployment.Spec.Template.Spec.Volumes { + Expect(volume.Name).ToNot(Equal(customCertificatesBundleVolumeName)) + } + + for _, container := range deployment.Spec.Template.Spec.Containers { + for _, volumeMount := range container.VolumeMounts { + Expect(volumeMount.Name).ToNot(Equal(customCertificatesBundleVolumeName)) + } + for _, arg := range container.Args { + Expect(arg).ToNot(ContainSubstring("--openshift-ca")) + } + } + +} + +func setupAndTestDeploymentCustomCABundle(instance *trustyaiopendatahubiov1alpha1.TrustyAIService, namespace string) { + caBundleConfigMap := createTrustedCABundleConfigMap(namespace) + Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) + Expect(k8sClient.Create(ctx, caBundleConfigMap)).To(Succeed()) + + caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) + + Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) + Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) + WaitFor(func() error { + return reconciler.ensureDeployment(ctx, instance, caBundle, false) + }, "failed to create deployment") + + deployment := &appsv1.Deployment{} + namespacedName := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace} + Expect(k8sClient.Get(ctx, namespacedName, deployment)).Should(Succeed()) + + Expect(deployment.Spec.Template.Spec.ServiceAccountName).To(Equal(instance.Name + "-proxy")) + + foundCustomCertificatesBundleVolumeMount := false + + customCertificatesBundleMountPath := "/etc/ssl/certs/ca-bundle.crt" + for _, container := range deployment.Spec.Template.Spec.Containers { + for _, volumeMount := range container.VolumeMounts { + if volumeMount.Name == caBundleName && volumeMount.MountPath == customCertificatesBundleMountPath { + foundCustomCertificatesBundleVolumeMount = true + } + } + } + Expect(foundCustomCertificatesBundleVolumeMount).To(BeTrue(), caBundleName+" volume mount not found in any container") + + Expect(k8sClient.Delete(ctx, caBundleConfigMap)).To(Succeed(), "failed to delete custom certificates bundle ConfigMap") + +} + +func setupAndTestDeploymentServiceAccount(instance *trustyaiopendatahubiov1alpha1.TrustyAIService, namespace string, mode string) { + Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) + + caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) + + if mode == "PVC" { + Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) + } + Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) + WaitFor(func() error { + return reconciler.ensureDeployment(ctx, instance, caBundle, false) + }, "failed to create deployment") + + deployment := &appsv1.Deployment{} + namespacedName := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace} + Expect(k8sClient.Get(ctx, namespacedName, deployment)).Should(Succeed()) + + Expect(deployment.Spec.Template.Spec.ServiceAccountName).To(Equal(instance.Name + "-proxy")) +} + +func setupAndTestDeploymentInferenceService(instance *trustyaiopendatahubiov1alpha1.TrustyAIService, namespace string, mode string) { + WaitFor(func() error { + return createNamespace(ctx, k8sClient, namespace) + }, "failed to create namespace") + + caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) + + WaitFor(func() error { + return createTestPVC(ctx, k8sClient, instance) + }, "failed to create PVC") + WaitFor(func() error { + return reconciler.ensureDeployment(ctx, instance, caBundle, false) + }, "failed to create deployment") + + // Creating the InferenceService + inferenceService := createInferenceService("my-model", namespace) + WaitFor(func() error { + return k8sClient.Create(ctx, inferenceService) + }, "failed to create deployment") + + Expect(reconciler.patchKServe(ctx, instance, *inferenceService, namespace, instance.Name, false)).ToNot(HaveOccurred()) + + deployment := &appsv1.Deployment{} + WaitFor(func() error { + // Define defaultServiceName for the deployment created by the operator + namespacedNamed := types.NamespacedName{ + Namespace: namespace, + Name: instance.Name, + } + return k8sClient.Get(ctx, namespacedNamed, deployment) + }, "failed to get Deployment") + + Expect(*deployment.Spec.Replicas).Should(Equal(int32(1))) + Expect(deployment.Namespace).Should(Equal(namespace)) + Expect(deployment.Name).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app"]).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app.kubernetes.io/name"]).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app.kubernetes.io/instance"]).Should(Equal(defaultServiceName)) + Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) + Expect(deployment.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) + + WaitFor(func() error { + err := reconciler.reconcileOAuthService(ctx, instance, caBundle) + return err + }, "failed to create oauth service") + + desiredOAuthService, err := generateTrustyAIOAuthService(ctx, instance, caBundle) + Expect(err).ToNot(HaveOccurred()) + + oauthService := &corev1.Service{} + WaitFor(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: desiredOAuthService.Name, Namespace: namespace}, oauthService) + }, "failed to get OAuth Service") + + // Check if the OAuth service has the expected labels + Expect(oauthService.Labels["app"]).Should(Equal(instance.Name)) + Expect(oauthService.Labels["app.kubernetes.io/instance"]).Should(Equal(instance.Name)) + Expect(oauthService.Labels["app.kubernetes.io/name"]).Should(Equal(instance.Name)) + Expect(oauthService.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) + Expect(oauthService.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) + Expect(oauthService.Labels["trustyai-service-name"]).Should(Equal(instance.Name)) + +} + var _ = Describe("TrustyAI operator", func() { BeforeEach(func() { @@ -59,173 +350,153 @@ var _ = Describe("TrustyAI operator", func() { Context("When deploying with default settings without an InferenceService", func() { var instance *trustyaiopendatahubiov1alpha1.TrustyAIService + It("Creates a deployment and a service with the default configuration in PVC-mode", func() { + namespace := "trusty-ns-a-1-pvc" + instance = createDefaultPVCCustomResource(namespace) + setupAndTestDeploymentDefault(instance, namespace) + }) + It("Creates a deployment and a service with the default configuration in DB-mode (mysql)", func() { + namespace := "trusty-ns-a-1-db" + instance = createDefaultDBCustomResource(namespace) + WaitFor(func() error { + secret := createDatabaseConfiguration(namespace, defaultDatabaseConfigurationName, "mysql") + return k8sClient.Create(ctx, secret) + }, "failed to create ConfigMap") + setupAndTestDeploymentDefault(instance, namespace) + }) + It("Creates a deployment and a service with the default configuration in DB-mode (mariadb)", func() { + namespace := "trusty-ns-a-1-db" + instance = createDefaultDBCustomResource(namespace) + WaitFor(func() error { + secret := createDatabaseConfiguration(namespace, defaultDatabaseConfigurationName, "mariadb") + return k8sClient.Create(ctx, secret) + }, "failed to create ConfigMap") + setupAndTestDeploymentDefault(instance, namespace) + }) + + }) - It("Creates a deployment and a service with the default configuration", func() { + Context("When deploying with a ConfigMap and without an InferenceService", func() { + var instance *trustyaiopendatahubiov1alpha1.TrustyAIService + + It("Creates a deployment and a service with the ConfigMap configuration in PVC-mode", func() { + namespace := "trusty-ns-a-1-cm-pvc" + instance = createDefaultPVCCustomResource(namespace) + setupAndTestDeploymentConfigMap(instance, namespace) + }) + It("Creates a deployment and a service with the ConfigMap configuration in DB-mode", func() { + namespace := "trusty-ns-a-1-cm-db" + instance = createDefaultDBCustomResource(namespace) + setupAndTestDeploymentConfigMap(instance, namespace) + }) + + }) - namespace := "trusty-ns-a-1" + Context("When deploying with default settings without an InferenceService", func() { + var instance *trustyaiopendatahubiov1alpha1.TrustyAIService + + It("should set environment variables correctly in PVC mode", func() { + + namespace := "trusty-ns-a-4-pvc" + instance = createDefaultPVCCustomResource(namespace) + //printKubeObject(instance) Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) - instance = createDefaultCR(namespace) caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) - Expect(reconciler.ensureDeployment(ctx, instance, caBundle)).To(Succeed()) + Expect(reconciler.ensureDeployment(ctx, instance, caBundle, false)).To(Succeed()) deployment := &appsv1.Deployment{} - err := k8sClient.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, deployment) - Expect(err).ToNot(HaveOccurred()) - Expect(deployment).ToNot(BeNil()) - - Expect(*deployment.Spec.Replicas).Should(Equal(int32(1))) - Expect(deployment.Namespace).Should(Equal(namespace)) - Expect(deployment.Name).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app"]).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app.kubernetes.io/name"]).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app.kubernetes.io/instance"]).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) - Expect(deployment.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) - - Expect(len(deployment.Spec.Template.Spec.Containers)).Should(Equal(2)) - Expect(deployment.Spec.Template.Spec.Containers[0].Image).Should(Equal("quay.io/trustyai/trustyai-service:latest")) - Expect(deployment.Spec.Template.Spec.Containers[1].Image).Should(Equal("registry.redhat.io/openshift4/ose-oauth-proxy:latest")) - - WaitFor(func() error { - service, _ := reconciler.reconcileService(ctx, instance) - return reconciler.Create(ctx, service) - }, "failed to create service") - - service := &corev1.Service{} - WaitFor(func() error { - return k8sClient.Get(ctx, types.NamespacedName{Name: defaultServiceName, Namespace: namespace}, service) - }, "failed to get Service") - - Expect(service.Annotations["prometheus.io/path"]).Should(Equal("/q/metrics")) - Expect(service.Annotations["prometheus.io/scheme"]).Should(Equal("http")) - Expect(service.Annotations["prometheus.io/scrape"]).Should(Equal("true")) - Expect(service.Namespace).Should(Equal(namespace)) - - WaitFor(func() error { - err := reconciler.reconcileOAuthService(ctx, instance, caBundle) - return err - }, "failed to create oauth service") + namespacedName := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace} + Expect(k8sClient.Get(ctx, namespacedName, deployment)).Should(Succeed()) - desiredOAuthService, err := generateTrustyAIOAuthService(ctx, instance, caBundle) - Expect(err).ToNot(HaveOccurred()) + //printKubeObject(deployment) - oauthService := &corev1.Service{} - WaitFor(func() error { - return k8sClient.Get(ctx, types.NamespacedName{Name: desiredOAuthService.Name, Namespace: namespace}, oauthService) - }, "failed to get OAuth Service") + foundEnvVar := func(envVars []corev1.EnvVar, name string) *corev1.EnvVar { + for _, env := range envVars { + if env.Name == name { + return &env + } + } + return nil + } - // Check if the OAuth service has the expected labels - Expect(oauthService.Labels["app"]).Should(Equal(instance.Name)) - Expect(oauthService.Labels["app.kubernetes.io/instance"]).Should(Equal(instance.Name)) - Expect(oauthService.Labels["app.kubernetes.io/name"]).Should(Equal(instance.Name)) - Expect(oauthService.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) - Expect(oauthService.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) - Expect(oauthService.Labels["trustyai-service-name"]).Should(Equal(instance.Name)) + var trustyaiServiceContainer *corev1.Container + for _, container := range deployment.Spec.Template.Spec.Containers { + if container.Name == "trustyai-service" { + trustyaiServiceContainer = &container + break + } + } - }) - }) + Expect(trustyaiServiceContainer).NotTo(BeNil(), "trustyai-service container not found") - Context("When deploying with a ConfigMap and without an InferenceService", func() { - var instance *trustyaiopendatahubiov1alpha1.TrustyAIService + // Checking the environment variables of the trustyai-service container + var envVar *corev1.EnvVar - It("Creates a deployment and a service with the ConfigMap configuration", func() { + //envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_BATCH_SIZE") + //Expect(envVar).NotTo(BeNil(), "Env var SERVICE_BATCH_SIZE not found") + //Expect(envVar.Value).To(Equal("5000")) - namespace := "trusty-ns-a-1-cm" - serviceImage := "custom-service-image:foo" - oauthImage := "custom-oauth-proxy:bar" - Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) + envVar = foundEnvVar(trustyaiServiceContainer.Env, "STORAGE_DATA_FILENAME") + Expect(envVar).NotTo(BeNil(), "Env var STORAGE_DATA_FILENAME not found") + Expect(envVar.Value).To(Equal("data.csv")) - WaitFor(func() error { - configMap := createConfigMap(operatorNamespace, oauthImage, serviceImage) - return k8sClient.Create(ctx, configMap) - }, "failed to create ConfigMap") + envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_STORAGE_FORMAT") + Expect(envVar).NotTo(BeNil(), "Env var SERVICE_STORAGE_FORMAT not found") + Expect(envVar.Value).To(Equal("PVC")) - instance = createDefaultCR(namespace) + envVar = foundEnvVar(trustyaiServiceContainer.Env, "STORAGE_DATA_FOLDER") + Expect(envVar).NotTo(BeNil(), "Env var STORAGE_DATA_FOLDER not found") + Expect(envVar.Value).To(Equal("/data")) - caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) + envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_DATA_FORMAT") + Expect(envVar).NotTo(BeNil(), "Env var SERVICE_DATA_FORMAT not found") + Expect(envVar.Value).To(Equal("CSV")) - Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) - Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) - WaitFor(func() error { - return reconciler.ensureDeployment(ctx, instance, caBundle) - }, "failed to reconcile deployment") + envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_METRICS_SCHEDULE") + Expect(envVar).NotTo(BeNil(), "Env var SERVICE_METRICS_SCHEDULE not found") + Expect(envVar.Value).To(Equal("5s")) - deployment := &appsv1.Deployment{} - WaitFor(func() error { - return k8sClient.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: namespace}, deployment) - }, "failed to get updated deployment") - Expect(deployment).ToNot(BeNil()) - - Expect(*deployment.Spec.Replicas).Should(Equal(int32(1))) - Expect(deployment.Namespace).Should(Equal(namespace)) - Expect(deployment.Name).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app"]).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app.kubernetes.io/name"]).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app.kubernetes.io/instance"]).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) - Expect(deployment.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) - - Expect(len(deployment.Spec.Template.Spec.Containers)).Should(Equal(2)) - Expect(deployment.Spec.Template.Spec.Containers[0].Image).Should(Equal(serviceImage)) - Expect(deployment.Spec.Template.Spec.Containers[1].Image).Should(Equal(oauthImage)) + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_HIBERNATE_ORM_ACTIVE") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_HIBERNATE_ORM_ACTIVE not found") + Expect(envVar.Value).To(Equal("false")) - WaitFor(func() error { - service, _ := reconciler.reconcileService(ctx, instance) - return reconciler.Create(ctx, service) - }, "failed to create service") + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_DB_KIND") + Expect(envVar).To(BeNil()) - service := &corev1.Service{} - WaitFor(func() error { - return k8sClient.Get(ctx, types.NamespacedName{Name: defaultServiceName, Namespace: namespace}, service) - }, "failed to get Service") + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_JDBC_MAX_SIZE") + Expect(envVar).To(BeNil()) - Expect(service.Annotations["prometheus.io/path"]).Should(Equal("/q/metrics")) - Expect(service.Annotations["prometheus.io/scheme"]).Should(Equal("http")) - Expect(service.Annotations["prometheus.io/scrape"]).Should(Equal("true")) - Expect(service.Namespace).Should(Equal(namespace)) + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_USERNAME") + Expect(envVar).To(BeNil()) - WaitFor(func() error { - err := reconciler.reconcileOAuthService(ctx, instance, caBundle) - return err - }, "failed to create oauth service") + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_PASSWORD") + Expect(envVar).To(BeNil()) - desiredOAuthService, err := generateTrustyAIOAuthService(ctx, instance, caBundle) - Expect(err).ToNot(HaveOccurred()) + envVar = foundEnvVar(trustyaiServiceContainer.Env, "DATABASE_SERVICE") + Expect(envVar).To(BeNil()) - oauthService := &corev1.Service{} - WaitFor(func() error { - return k8sClient.Get(ctx, types.NamespacedName{Name: desiredOAuthService.Name, Namespace: namespace}, oauthService) - }, "failed to get OAuth Service") + envVar = foundEnvVar(trustyaiServiceContainer.Env, "DATABASE_PORT") + Expect(envVar).To(BeNil()) - // Check if the OAuth service has the expected labels - Expect(oauthService.Labels["app"]).Should(Equal(instance.Name)) - Expect(oauthService.Labels["app.kubernetes.io/instance"]).Should(Equal(instance.Name)) - Expect(oauthService.Labels["app.kubernetes.io/name"]).Should(Equal(instance.Name)) - Expect(oauthService.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) - Expect(oauthService.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) - Expect(oauthService.Labels["trustyai-service-name"]).Should(Equal(instance.Name)) + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_JDBC_URL") + Expect(envVar).To(BeNil()) }) - }) - Context("When deploying with default settings without an InferenceService", func() { - var instance *trustyaiopendatahubiov1alpha1.TrustyAIService - - It("should set environment variables correctly", func() { + It("should set environment variables correctly in DB mode", func() { - namespace := "trusty-ns-a-4" - instance = createDefaultCR(namespace) + namespace := "trusty-ns-a-4-db" + instance = createDefaultDBCustomResource(namespace) Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) - caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) - Expect(reconciler.ensureDeployment(ctx, instance, caBundle)).To(Succeed()) + Expect(reconciler.ensureDeployment(ctx, instance, caBundle, false)).To(Succeed()) deployment := &appsv1.Deployment{} namespacedName := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace} @@ -253,135 +524,255 @@ var _ = Describe("TrustyAI operator", func() { // Checking the environment variables of the trustyai-service container var envVar *corev1.EnvVar - envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_BATCH_SIZE") - Expect(envVar).NotTo(BeNil(), "Env var SERVICE_BATCH_SIZE not found") - Expect(envVar.Value).To(Equal("5000")) - envVar = foundEnvVar(trustyaiServiceContainer.Env, "STORAGE_DATA_FILENAME") - Expect(envVar).NotTo(BeNil(), "Env var STORAGE_DATA_FILENAME not found") - Expect(envVar.Value).To(Equal("data.csv")) + Expect(envVar).To(BeNil()) envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_STORAGE_FORMAT") Expect(envVar).NotTo(BeNil(), "Env var SERVICE_STORAGE_FORMAT not found") - Expect(envVar.Value).To(Equal("PVC")) + Expect(envVar.Value).To(Equal(STORAGE_DATABASE)) envVar = foundEnvVar(trustyaiServiceContainer.Env, "STORAGE_DATA_FOLDER") - Expect(envVar).NotTo(BeNil(), "Env var STORAGE_DATA_FOLDER not found") - Expect(envVar.Value).To(Equal("/data")) + Expect(envVar).To(BeNil()) - envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_DATA_FORMAT") - Expect(envVar).NotTo(BeNil(), "Env var SERVICE_DATA_FORMAT not found") - Expect(envVar.Value).To(Equal("CSV")) + //envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_DATA_FORMAT") + //Expect(envVar).To(BeNil()) envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_METRICS_SCHEDULE") Expect(envVar).NotTo(BeNil(), "Env var SERVICE_METRICS_SCHEDULE not found") Expect(envVar.Value).To(Equal("5s")) - }) - }) - Context("When deploying with no custom CA bundle ConfigMap", func() { - var instance *trustyaiopendatahubiov1alpha1.TrustyAIService + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_HIBERNATE_ORM_ACTIVE") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_HIBERNATE_ORM_ACTIVE not found") + Expect(envVar.Value).To(Equal("true")) + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_DB_KIND") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_DB_KIND not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_DB_KIND does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_DB_KIND is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databaseKind"), "Secret key does not match") + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_JDBC_MAX_SIZE") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_JDBC_MAX_SIZE not found") + Expect(envVar.Value).To(Equal("16")) + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_USERNAME") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_USERNAME not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_USERNAME does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_USERNAME is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databaseUsername"), "Secret key does not match") + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_PASSWORD") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_PASSWORD not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_PASSWORD does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_PASSWORD is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databasePassword"), "Secret key does not match") + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "DATABASE_SERVICE") + Expect(envVar).NotTo(BeNil(), "Env var DATABASE_SERVICE not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var DATABASE_SERVICE does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var DATABASE_SERVICE is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databaseService"), "Secret key does not match") + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "DATABASE_PORT") + Expect(envVar).NotTo(BeNil(), "Env var DATABASE_PORT not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var DATABASE_PORT does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var DATABASE_PORT is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databasePort"), "Secret key does not match") + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_JDBC_URL") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_JDBC_URL not found") + Expect(envVar.Value).To(Equal("jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database")) + + }) - It("should use the correct service account and not include CustomCertificatesBundle", func() { + It("should set environment variables correctly in migration mode", func() { - namespace := "trusty-ns-a-7" - instance = createDefaultCR(namespace) + namespace := "trusty-ns-a-4-migration" + instance = createDefaultMigrationCustomResource(namespace) Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) - + //printKubeObject(instance) caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) - WaitFor(func() error { - return reconciler.ensureDeployment(ctx, instance, caBundle) - }, "failed to create deployment") + Expect(reconciler.ensureDeployment(ctx, instance, caBundle, false)).To(Succeed()) deployment := &appsv1.Deployment{} namespacedName := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace} Expect(k8sClient.Get(ctx, namespacedName, deployment)).Should(Succeed()) - Expect(deployment.Spec.Template.Spec.ServiceAccountName).To(Equal(instance.Name + "-proxy")) - - customCertificatesBundleVolumeName := caBundle - for _, volume := range deployment.Spec.Template.Spec.Volumes { - Expect(volume.Name).ToNot(Equal(customCertificatesBundleVolumeName)) + foundEnvVar := func(envVars []corev1.EnvVar, name string) *corev1.EnvVar { + for _, env := range envVars { + if env.Name == name { + return &env + } + } + return nil } + var trustyaiServiceContainer *corev1.Container for _, container := range deployment.Spec.Template.Spec.Containers { - for _, volumeMount := range container.VolumeMounts { - Expect(volumeMount.Name).ToNot(Equal(customCertificatesBundleVolumeName)) - } - for _, arg := range container.Args { - Expect(arg).ToNot(ContainSubstring("--openshift-ca")) + if container.Name == "trustyai-service" { + trustyaiServiceContainer = &container + break } } + + Expect(trustyaiServiceContainer).NotTo(BeNil(), "trustyai-service container not found") + + // Checking the environment variables of the trustyai-service container + var envVar *corev1.EnvVar + + //envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_BATCH_SIZE") + //Expect(envVar).To(BeNil(), "Env var SERVICE_BATCH_SIZE not found") + //Expect(envVar.Value).To(Equal("5000")) + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "STORAGE_DATA_FILENAME") + Expect(envVar).ToNot(BeNil()) + Expect(envVar.Value).To(Equal("data.csv")) + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_STORAGE_FORMAT") + Expect(envVar).NotTo(BeNil(), "Env var SERVICE_STORAGE_FORMAT not found") + Expect(envVar.Value).To(Equal(STORAGE_DATABASE)) + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "STORAGE_DATA_FOLDER") + Expect(envVar).ToNot(BeNil()) + Expect(envVar.Value).To(Equal("/data")) + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_DATA_FORMAT") + Expect(envVar).ToNot(BeNil()) + Expect(envVar.Value).To(Equal("CSV")) + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "SERVICE_METRICS_SCHEDULE") + Expect(envVar).NotTo(BeNil(), "Env var SERVICE_METRICS_SCHEDULE not found") + Expect(envVar.Value).To(Equal("5s")) + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_HIBERNATE_ORM_ACTIVE") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_HIBERNATE_ORM_ACTIVE not found") + Expect(envVar.Value).To(Equal("true")) + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_DB_KIND") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_DB_KIND not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_DB_KIND does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_DB_KIND is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databaseKind"), "Secret key does not match") + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_JDBC_MAX_SIZE") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_JDBC_MAX_SIZE not found") + Expect(envVar.Value).To(Equal("16")) + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_USERNAME") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_USERNAME not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_USERNAME does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_USERNAME is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databaseUsername"), "Secret key does not match") + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_PASSWORD") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_PASSWORD not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_PASSWORD does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_PASSWORD is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databasePassword"), "Secret key does not match") + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "DATABASE_SERVICE") + Expect(envVar).NotTo(BeNil(), "Env var DATABASE_SERVICE not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var DATABASE_SERVICE does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var DATABASE_SERVICE is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databaseService"), "Secret key does not match") + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "DATABASE_PORT") + Expect(envVar).NotTo(BeNil(), "Env var DATABASE_PORT not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var DATABASE_PORT does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var DATABASE_PORT is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databasePort"), "Secret key does not match") + + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_JDBC_URL") + Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_JDBC_URL not found") + Expect(envVar.Value).To(Equal("jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database")) + }) + }) - Context("When deploying with a custom CA bundle ConfigMap", func() { + Context("When deploying with no custom CA bundle ConfigMap", func() { var instance *trustyaiopendatahubiov1alpha1.TrustyAIService - It("should use the correct service account and include CustomCertificatesBundle", func() { + It("should use the correct service account and not include CustomCertificatesBundle in PVC-mode", func() { - namespace := "trusty-ns-a-8" - instance = createDefaultCR(namespace) - caBundleConfigMap := createTrustedCABundleConfigMap(namespace) - Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) - Expect(k8sClient.Create(ctx, caBundleConfigMap)).To(Succeed()) + namespace := "trusty-ns-a-7-pvc" + instance = createDefaultPVCCustomResource(namespace) + setupAndTestDeploymentNoCustomCABundle(instance, namespace) + }) + It("should use the correct service account and not include CustomCertificatesBundle in DB-mode", func() { - caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) + namespace := "trusty-ns-a-7-db" + instance = createDefaultDBCustomResource(namespace) + setupAndTestDeploymentNoCustomCABundle(instance, namespace) + }) + It("should use the correct service account and not include CustomCertificatesBundle in migration-mode", func() { - Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) - Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) - WaitFor(func() error { - return reconciler.ensureDeployment(ctx, instance, caBundle) - }, "failed to create deployment") + namespace := "trusty-ns-a-7-migration" + instance = createDefaultMigrationCustomResource(namespace) + setupAndTestDeploymentNoCustomCABundle(instance, namespace) + }) - deployment := &appsv1.Deployment{} - namespacedName := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace} - Expect(k8sClient.Get(ctx, namespacedName, deployment)).Should(Succeed()) + }) - Expect(deployment.Spec.Template.Spec.ServiceAccountName).To(Equal(instance.Name + "-proxy")) + Context("When deploying with a custom CA bundle ConfigMap", func() { + var instance *trustyaiopendatahubiov1alpha1.TrustyAIService - foundCustomCertificatesBundleVolumeMount := false + It("should use the correct service account and include CustomCertificatesBundle in PVC-mode", func() { - customCertificatesBundleMountPath := "/etc/ssl/certs/ca-bundle.crt" - for _, container := range deployment.Spec.Template.Spec.Containers { - for _, volumeMount := range container.VolumeMounts { - if volumeMount.Name == caBundleName && volumeMount.MountPath == customCertificatesBundleMountPath { - foundCustomCertificatesBundleVolumeMount = true - } - } - } - Expect(foundCustomCertificatesBundleVolumeMount).To(BeTrue(), caBundleName+" volume mount not found in any container") + namespace := "trusty-ns-a-8-pvc" + instance = createDefaultPVCCustomResource(namespace) + setupAndTestDeploymentCustomCABundle(instance, namespace) + }) + It("should use the correct service account and include CustomCertificatesBundle in DB-mode", func() { - Expect(k8sClient.Delete(ctx, caBundleConfigMap)).To(Succeed(), "failed to delete custom certificates bundle ConfigMap") + namespace := "trusty-ns-a-8-db" + instance = createDefaultDBCustomResource(namespace) + setupAndTestDeploymentCustomCABundle(instance, namespace) + }) + It("should use the correct service account and include CustomCertificatesBundle in migration-mode", func() { + namespace := "trusty-ns-a-8-migration" + instance = createDefaultMigrationCustomResource(namespace) + setupAndTestDeploymentCustomCABundle(instance, namespace) }) }) Context("When deploying with default settings without an InferenceService", func() { var instance *trustyaiopendatahubiov1alpha1.TrustyAIService - It("should use the correct service account", func() { + It("should use the correct service account in PVC-mode", func() { - namespace := "trusty-ns-a-6" - instance = createDefaultCR(namespace) - Expect(createNamespace(ctx, k8sClient, namespace)).To(Succeed()) + namespace := "trusty-ns-a-6-pvc" + instance = createDefaultPVCCustomResource(namespace) + setupAndTestDeploymentServiceAccount(instance, namespace, "PVC") - caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) + }) + It("should use the correct service account in DB-mode", func() { - Expect(createTestPVC(ctx, k8sClient, instance)).To(Succeed()) - Expect(reconciler.createServiceAccount(ctx, instance)).To(Succeed()) - WaitFor(func() error { - return reconciler.ensureDeployment(ctx, instance, caBundle) - }, "failed to create deployment") + namespace := "trusty-ns-a-6-db" + instance = createDefaultDBCustomResource(namespace) + setupAndTestDeploymentServiceAccount(instance, namespace, "DATABASE") - deployment := &appsv1.Deployment{} - namespacedName := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace} - Expect(k8sClient.Get(ctx, namespacedName, deployment)).Should(Succeed()) + }) + It("should use the correct service account in migration-mode", func() { + + namespace := "trusty-ns-a-6-migration" + instance = createDefaultMigrationCustomResource(namespace) + setupAndTestDeploymentServiceAccount(instance, namespace, "DATABASE") - Expect(deployment.Spec.Template.Spec.ServiceAccountName).To(Equal(instance.Name + "-proxy")) }) }) @@ -402,72 +793,28 @@ var _ = Describe("TrustyAI operator", func() { Context("When deploying with an associated InferenceService", func() { - It("Sets up the InferenceService and links it to the TrustyAIService deployment", func() { + It("Sets up the InferenceService and links it to the TrustyAIService deployment in PVC-mode", func() { - namespace := "trusty-ns-2" - instance := createDefaultCR(namespace) - WaitFor(func() error { - return createNamespace(ctx, k8sClient, namespace) - }, "failed to create namespace") + namespace := "trusty-ns-2-pvc" + instance := createDefaultPVCCustomResource(namespace) + setupAndTestDeploymentInferenceService(instance, namespace, "PVC") - caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) - - WaitFor(func() error { - return createTestPVC(ctx, k8sClient, instance) - }, "failed to create PVC") - WaitFor(func() error { - return reconciler.ensureDeployment(ctx, instance, caBundle) - }, "failed to create deployment") - - // Creating the InferenceService - inferenceService := createInferenceService("my-model", namespace) - WaitFor(func() error { - return k8sClient.Create(ctx, inferenceService) - }, "failed to create deployment") - - Expect(reconciler.patchKServe(ctx, instance, *inferenceService, namespace, instance.Name, false)).ToNot(HaveOccurred()) - - deployment := &appsv1.Deployment{} - WaitFor(func() error { - // Define defaultServiceName for the deployment created by the operator - namespacedNamed := types.NamespacedName{ - Namespace: namespace, - Name: instance.Name, - } - return k8sClient.Get(ctx, namespacedNamed, deployment) - }, "failed to get Deployment") - - Expect(*deployment.Spec.Replicas).Should(Equal(int32(1))) - Expect(deployment.Namespace).Should(Equal(namespace)) - Expect(deployment.Name).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app"]).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app.kubernetes.io/name"]).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app.kubernetes.io/instance"]).Should(Equal(defaultServiceName)) - Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) - Expect(deployment.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) - - WaitFor(func() error { - err := reconciler.reconcileOAuthService(ctx, instance, caBundle) - return err - }, "failed to create oauth service") + }) + It("Sets up the InferenceService and links it to the TrustyAIService deployment in DB-mode", func() { - desiredOAuthService, err := generateTrustyAIOAuthService(ctx, instance, caBundle) - Expect(err).ToNot(HaveOccurred()) + namespace := "trusty-ns-2-db" + instance := createDefaultDBCustomResource(namespace) + setupAndTestDeploymentInferenceService(instance, namespace, "DATABASE") - oauthService := &corev1.Service{} - WaitFor(func() error { - return k8sClient.Get(ctx, types.NamespacedName{Name: desiredOAuthService.Name, Namespace: namespace}, oauthService) - }, "failed to get OAuth Service") + }) + It("Sets up the InferenceService and links it to the TrustyAIService deployment in migration-mode", func() { - // Check if the OAuth service has the expected labels - Expect(oauthService.Labels["app"]).Should(Equal(instance.Name)) - Expect(oauthService.Labels["app.kubernetes.io/instance"]).Should(Equal(instance.Name)) - Expect(oauthService.Labels["app.kubernetes.io/name"]).Should(Equal(instance.Name)) - Expect(oauthService.Labels["app.kubernetes.io/part-of"]).Should(Equal(componentName)) - Expect(oauthService.Labels["app.kubernetes.io/version"]).Should(Equal(Version)) - Expect(oauthService.Labels["trustyai-service-name"]).Should(Equal(instance.Name)) + namespace := "trusty-ns-2-migration" + instance := createDefaultMigrationCustomResource(namespace) + setupAndTestDeploymentInferenceService(instance, namespace, "DATABASE") }) + }) }) @@ -493,7 +840,7 @@ var _ = Describe("TrustyAI operator", func() { It("Deploys services with defaults in each specified namespace", func() { for i, namespace := range namespaces { - instances[i] = createDefaultCR(namespace) + instances[i] = createDefaultPVCCustomResource(namespace) instances[i].Namespace = namespace WaitFor(func() error { return createNamespace(ctx, k8sClient, namespace) @@ -507,7 +854,7 @@ var _ = Describe("TrustyAI operator", func() { return createTestPVC(ctx, k8sClient, instance) }, "failed to create PVC") WaitFor(func() error { - return reconciler.ensureDeployment(ctx, instance, caBundle) + return reconciler.ensureDeployment(ctx, instance, caBundle, false) }, "failed to create deployment") //Expect(k8sClient.Create(ctx, instance)).Should(Succeed()) deployment := &appsv1.Deployment{} diff --git a/controllers/monitor_test.go b/controllers/monitor_test.go index 41cd8fd..ce9bc9c 100644 --- a/controllers/monitor_test.go +++ b/controllers/monitor_test.go @@ -60,7 +60,7 @@ var _ = Describe("Service Monitor Reconciliation", func() { var instance *trustyaiopendatahubiov1alpha1.TrustyAIService It("Should have correct values", func() { namespace := "sm-test-namespace-1" - instance = createDefaultCR(namespace) + instance = createDefaultPVCCustomResource(namespace) WaitFor(func() error { return createNamespace(ctx, k8sClient, namespace) diff --git a/controllers/route.go b/controllers/route.go index 6d5963e..3016a32 100644 --- a/controllers/route.go +++ b/controllers/route.go @@ -100,6 +100,7 @@ func (r *TrustyAIServiceReconciler) checkRouteReady(ctx context.Context, cr *tru err := r.Client.Get(ctx, types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}, existingRoute) if err != nil { + log.FromContext(ctx).Info("Unable to find the Route") if errors.IsNotFound(err) { return false, nil } diff --git a/controllers/route_test.go b/controllers/route_test.go index b84f237..da35a68 100644 --- a/controllers/route_test.go +++ b/controllers/route_test.go @@ -11,6 +11,42 @@ import ( "k8s.io/client-go/tools/record" ) +func setupAndTestRouteCreation(instance *trustyaiopendatahubiov1alpha1.TrustyAIService, namespace string) { + WaitFor(func() error { + return createNamespace(ctx, k8sClient, namespace) + }, "failed to create namespace") + + err := reconciler.reconcileRouteAuth(instance, ctx, reconciler.createRouteObject) + Expect(err).ToNot(HaveOccurred()) + + route := &routev1.Route{} + err = reconciler.Client.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, route) + Expect(err).ToNot(HaveOccurred()) + Expect(route).ToNot(BeNil()) + Expect(route.Spec.To.Name).To(Equal(instance.Name + "-tls")) + +} + +func setupAndTestSameRouteCreation(instance *trustyaiopendatahubiov1alpha1.TrustyAIService, namespace string) { + WaitFor(func() error { + return createNamespace(ctx, k8sClient, namespace) + }, "failed to create namespace") + + // Create a Route with the expected spec + existingRoute, err := reconciler.createRouteObject(ctx, instance) + Expect(err).ToNot(HaveOccurred()) + Expect(reconciler.Client.Create(ctx, existingRoute)).To(Succeed()) + + err = reconciler.reconcileRouteAuth(instance, ctx, reconciler.createRouteObject) + Expect(err).ToNot(HaveOccurred()) + + // Fetch the Route + route := &routev1.Route{} + err = reconciler.Client.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, route) + Expect(err).ToNot(HaveOccurred()) + Expect(route.Spec).To(Equal(existingRoute.Spec)) +} + var _ = Describe("Route Reconciliation", func() { BeforeEach(func() { @@ -25,47 +61,41 @@ var _ = Describe("Route Reconciliation", func() { Context("When Route does not exist", func() { var instance *trustyaiopendatahubiov1alpha1.TrustyAIService - It("Should create Route successfully", func() { - namespace := "route-test-namespace-1" - instance = createDefaultCR(namespace) - - WaitFor(func() error { - return createNamespace(ctx, k8sClient, namespace) - }, "failed to create namespace") - - err := reconciler.reconcileRouteAuth(instance, ctx, reconciler.createRouteObject) - Expect(err).ToNot(HaveOccurred()) - - route := &routev1.Route{} - err = reconciler.Client.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, route) - Expect(err).ToNot(HaveOccurred()) - Expect(route).ToNot(BeNil()) - Expect(route.Spec.To.Name).To(Equal(instance.Name + "-tls")) + It("Should create Route successfully in PVC-mode", func() { + namespace := "route-test-namespace-1-pvc" + instance = createDefaultPVCCustomResource(namespace) + setupAndTestRouteCreation(instance, namespace) + }) + It("Should create Route successfully in DB-mode", func() { + namespace := "route-test-namespace-1-db" + instance = createDefaultDBCustomResource(namespace) + setupAndTestRouteCreation(instance, namespace) + }) + It("Should create Route successfully in migration-mode", func() { + namespace := "route-test-namespace-1-migration" + instance = createDefaultMigrationCustomResource(namespace) + setupAndTestRouteCreation(instance, namespace) }) + }) Context("When Route exists and is the same", func() { var instance *trustyaiopendatahubiov1alpha1.TrustyAIService - It("Should not update Route", func() { - namespace := "route-test-namespace-2" - instance = createDefaultCR(namespace) - WaitFor(func() error { - return createNamespace(ctx, k8sClient, namespace) - }, "failed to create namespace") - - // Create a Route with the expected spec - existingRoute, err := reconciler.createRouteObject(ctx, instance) - Expect(err).ToNot(HaveOccurred()) - Expect(reconciler.Client.Create(ctx, existingRoute)).To(Succeed()) - - err = reconciler.reconcileRouteAuth(instance, ctx, reconciler.createRouteObject) - Expect(err).ToNot(HaveOccurred()) - - // Fetch the Route - route := &routev1.Route{} - err = reconciler.Client.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, route) - Expect(err).ToNot(HaveOccurred()) - Expect(route.Spec).To(Equal(existingRoute.Spec)) + It("Should not update Route in PVC-mode", func() { + namespace := "route-test-namespace-2-pvc" + instance = createDefaultPVCCustomResource(namespace) + setupAndTestSameRouteCreation(instance, namespace) + }) + It("Should not update Route in DB-mode", func() { + namespace := "route-test-namespace-2-db" + instance = createDefaultDBCustomResource(namespace) + setupAndTestSameRouteCreation(instance, namespace) }) + It("Should not update Route in migration-mode", func() { + namespace := "route-test-namespace-2-migration" + instance = createDefaultMigrationCustomResource(namespace) + setupAndTestSameRouteCreation(instance, namespace) + }) + }) }) diff --git a/controllers/secrets.go b/controllers/secrets.go new file mode 100644 index 0000000..6e2d49b --- /dev/null +++ b/controllers/secrets.go @@ -0,0 +1,60 @@ +package controllers + +import ( + "context" + "fmt" + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// findDatabaseSecret finds the DB configuration secret named (specified or default) in the same namespace as the CR +func (r *TrustyAIServiceReconciler) findDatabaseSecret(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (*corev1.Secret, error) { + + databaseConfigurationsName := instance.Spec.Storage.DatabaseConfigurations + defaultDatabaseConfigurationsName := instance.Name + dbCredentialsSuffix + + secret := &corev1.Secret{} + + if databaseConfigurationsName != "" { + secret := &corev1.Secret{} + err := r.Get(ctx, client.ObjectKey{Name: databaseConfigurationsName, Namespace: instance.Namespace}, secret) + if err == nil { + return secret, nil + } + if !errors.IsNotFound(err) { + return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", databaseConfigurationsName, instance.Namespace, err) + } + } else { + // If specified not found, try the default + + err := r.Get(ctx, client.ObjectKey{Name: defaultDatabaseConfigurationsName, Namespace: instance.Namespace}, secret) + if err == nil { + return secret, nil + } + if !errors.IsNotFound(err) { + return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", defaultDatabaseConfigurationsName, instance.Namespace, err) + } + + } + + return nil, fmt.Errorf("neither secret %s nor %s found in namespace %s", databaseConfigurationsName, defaultDatabaseConfigurationsName, instance.Namespace) +} + +// validateDatabaseSecret validates the DB configuration secret +func (r *TrustyAIServiceReconciler) validateDatabaseSecret(secret *corev1.Secret) error { + + mandatoryKeys := []string{"databaseKind", "databaseUsername", "databasePassword", "databaseService", "databasePort"} + + for _, key := range mandatoryKeys { + value, exists := secret.Data[key] + if !exists { + return fmt.Errorf("mandatory key %s is missing from database configuration", key) + } + if len(value) == 0 { + return fmt.Errorf("mandatory key %s is empty on database configuration", key) + } + } + return nil +} diff --git a/controllers/service_accounts_test.go b/controllers/service_accounts_test.go index 4b64837..17b9d66 100644 --- a/controllers/service_accounts_test.go +++ b/controllers/service_accounts_test.go @@ -26,7 +26,7 @@ var _ = Describe("Service Accounts", func() { It("Should create SAs, CRBs successfully", func() { namespace1 := "sa-test-namespace-1" - instance1 := createDefaultCR(namespace1) + instance1 := createDefaultPVCCustomResource(namespace1) WaitFor(func() error { return createNamespace(ctx, k8sClient, namespace1) @@ -36,7 +36,7 @@ var _ = Describe("Service Accounts", func() { namespace2 := "sa-test-namespace-2" - instance2 := createDefaultCR(namespace2) + instance2 := createDefaultPVCCustomResource(namespace2) WaitFor(func() error { return createNamespace(ctx, k8sClient, namespace2) diff --git a/controllers/statuses.go b/controllers/statuses.go index b54036a..b88ba47 100644 --- a/controllers/statuses.go +++ b/controllers/statuses.go @@ -2,6 +2,7 @@ package controllers import ( "context" + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" v1 "k8s.io/api/core/v1" "k8s.io/client-go/util/retry" @@ -10,12 +11,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" ) -// IsAllReady checks if all the necessary readiness fields are true. -func (rs *AvailabilityStatus) IsAllReady() bool { - return rs.PVCReady && rs.DeploymentReady && rs.RouteReady +// IsAllReady checks if all the necessary readiness fields are true for the specific mode +func (rs *AvailabilityStatus) IsAllReady(mode string) bool { + return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || (rs.DeploymentReady && rs.RouteReady && mode == STORAGE_DATABASE) } -// AvailabilityStatus holds the readiness status of various resources. +// AvailabilityStatus has the readiness status of various resources. type AvailabilityStatus struct { PVCReady bool DeploymentReady bool @@ -44,37 +45,39 @@ func (r *TrustyAIServiceReconciler) updateStatus(ctx context.Context, original * return saved, err } -// reconcileStatuses checks the readiness status of PVC, Deployment, Route and Inference Services +// reconcileStatuses checks the readiness status of required resources func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (ctrl.Result, error) { var err error status := AvailabilityStatus{} - // Check for PVC readiness - status.PVCReady, err = r.checkPVCReady(ctx, instance) - if err != nil || !status.PVCReady { - // PVC not ready, requeue - return Requeue() + if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { + // Check for PVC readiness + status.PVCReady, err = r.checkPVCReady(ctx, instance) + if err != nil || !status.PVCReady { + // PVC not ready, requeue + return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "PVC not ready") + } } // Check for deployment readiness status.DeploymentReady, err = r.checkDeploymentReady(ctx, instance) if err != nil || !status.DeploymentReady { // Deployment not ready, requeue - return Requeue() + return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Deployment not ready") } // Check for route readiness status.RouteReady, err = r.checkRouteReady(ctx, instance) if err != nil || !status.RouteReady { // Route not ready, requeue - return Requeue() + return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Route not ready") } // Check if InferenceServices present status.InferenceServiceReady, err = r.checkInferenceServicesPresent(ctx, instance.Namespace) // All checks passed, resources are ready - if status.IsAllReady() { + if status.IsAllReady(instance.Spec.Storage.Format) { _, updateErr := r.updateStatus(ctx, instance, func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { if status.InferenceServiceReady { @@ -83,7 +86,9 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta UpdateInferenceServiceNotPresent(saved) } - UpdatePVCAvailable(saved) + if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { + UpdatePVCAvailable(saved) + } UpdateRouteAvailable(saved) UpdateTrustyAIServiceAvailable(saved) saved.Status.Phase = "Ready" @@ -101,11 +106,14 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta UpdateInferenceServiceNotPresent(saved) } - if status.PVCReady { - UpdatePVCAvailable(saved) - } else { - UpdatePVCNotAvailable(saved) + if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { + if status.PVCReady { + UpdatePVCAvailable(saved) + } else { + UpdatePVCNotAvailable(saved) + } } + if status.RouteReady { UpdateRouteAvailable(saved) } else { diff --git a/controllers/statuses_test.go b/controllers/statuses_test.go index 6d11081..9e395df 100644 --- a/controllers/statuses_test.go +++ b/controllers/statuses_test.go @@ -3,6 +3,7 @@ package controllers import ( "context" "fmt" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" @@ -26,6 +27,27 @@ func checkCondition(conditions []trustyaiopendatahubiov1alpha1.Condition, condit return nil, false, fmt.Errorf("%s condition not found", conditionType) } +func setupAndTestStatusNoComponent(instance *trustyaiopendatahubiov1alpha1.TrustyAIService, namespace string) { + WaitFor(func() error { + return createNamespace(ctx, k8sClient, namespace) + }, "failed to create namespace") + + // Call the reconcileStatuses function + _, _ = reconciler.reconcileStatuses(ctx, instance) + + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") + if readyCondition != nil { + Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Ready condition should be true") + } + + availableCondition, statusMatch, err := checkCondition(instance.Status.Conditions, StatusTypeAvailable, corev1.ConditionFalse, true) + Expect(err).NotTo(HaveOccurred(), "Error checking Available condition") + if availableCondition != nil { + Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Available condition should be false") + } +} + var _ = Describe("Status and condition tests", func() { BeforeEach(func() { @@ -41,37 +63,163 @@ var _ = Describe("Status and condition tests", func() { Context("When no component exists", func() { var instance *trustyaiopendatahubiov1alpha1.TrustyAIService - It("Should not be available", func() { - namespace := "statuses-test-namespace-1" - instance = createDefaultCR(namespace) + It("Should not be available in PVC-mode", func() { + namespace := "statuses-test-namespace-1-pvc" + instance = createDefaultPVCCustomResource(namespace) + setupAndTestStatusNoComponent(instance, namespace) + }) + It("Should not be available in DB-mode", func() { + namespace := "statuses-test-namespace-1-db" + instance = createDefaultDBCustomResource(namespace) + setupAndTestStatusNoComponent(instance, namespace) + }) + It("Should not be available in migration-mode", func() { + namespace := "statuses-test-namespace-1-migration" + instance = createDefaultMigrationCustomResource(namespace) + setupAndTestStatusNoComponent(instance, namespace) + }) + + }) + Context("When route, deployment and PVC component, but not inference service, exist", func() { + var instance *trustyaiopendatahubiov1alpha1.TrustyAIService + It("Should be available in PVC-mode", func() { + namespace := "statuses-test-namespace-2-pvc" + instance = createDefaultPVCCustomResource(namespace) WaitFor(func() error { return createNamespace(ctx, k8sClient, namespace) }, "failed to create namespace") + caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) + + WaitFor(func() error { + return reconciler.reconcileRouteAuth(instance, ctx, reconciler.createRouteObject) + }, "failed to create route") + WaitFor(func() error { + return makeRouteReady(ctx, k8sClient, instance) + }, "failed to make route ready") + WaitFor(func() error { + return reconciler.ensurePVC(ctx, instance) + }, "failed to create PVC") + WaitFor(func() error { + return makePVCReady(ctx, k8sClient, instance) + }, "failed to bind PVC") + WaitFor(func() error { + return reconciler.ensureDeployment(ctx, instance, caBundle, false) + }, "failed to create deployment") + WaitFor(func() error { + return makeDeploymentReady(ctx, k8sClient, instance) + }, "failed to make deployment ready") + WaitFor(func() error { + return k8sClient.Create(ctx, instance) + }, "failed to create TrustyAIService") // Call the reconcileStatuses function - _, _ = reconciler.reconcileStatuses(ctx, instance) + WaitFor(func() error { + _, err := reconciler.reconcileStatuses(ctx, instance) + return err + }, "failed to update statuses") + + // Fetch the updated instance + WaitFor(func() error { + return k8sClient.Get(ctx, types.NamespacedName{ + Name: instance.Name, + Namespace: instance.Namespace, + }, instance) + }, "failed to get updated instance") readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { - Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Ready condition should be true") + Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") } - availableCondition, statusMatch, err := checkCondition(instance.Status.Conditions, StatusTypeAvailable, corev1.ConditionFalse, true) + availableCondition, statusMatch, err := checkCondition(instance.Status.Conditions, StatusTypeAvailable, corev1.ConditionTrue, false) Expect(err).NotTo(HaveOccurred(), "Error checking Available condition") - if availableCondition != nil { - Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Available condition should be false") - } + Expect(availableCondition).NotTo(BeNil(), "Available condition should not be null") + Expect(statusMatch).To(Equal(true), "Ready condition should be true") + + routeAvailableCondition, statusMatch, err := checkCondition(instance.Status.Conditions, StatusTypeRouteAvailable, corev1.ConditionTrue, false) + Expect(err).NotTo(HaveOccurred(), "Error checking RouteAvailable condition") + Expect(routeAvailableCondition).NotTo(BeNil(), "RouteAvailable condition should not be null") + Expect(statusMatch).To(Equal(true), "RouteAvailable condition should be true") + + pvcAvailableCondition, statusMatch, err := checkCondition(instance.Status.Conditions, StatusTypePVCAvailable, corev1.ConditionTrue, false) + Expect(err).NotTo(HaveOccurred(), "Error checking PVCAvailable condition") + Expect(pvcAvailableCondition).NotTo(BeNil(), "PVCAvailable condition should not be null") + Expect(statusMatch).To(Equal(true), "PVCAvailable condition should be true") + ISPresentCondition, statusMatch, err := checkCondition(instance.Status.Conditions, StatusTypeInferenceServicesPresent, corev1.ConditionFalse, false) + Expect(err).NotTo(HaveOccurred(), "Error checking InferenceServicePresent condition") + Expect(ISPresentCondition).NotTo(BeNil(), "InferenceServicePresent condition should not be null") + Expect(statusMatch).To(Equal(true), "InferenceServicePresent condition should be false") }) - }) + It("Should be available in DB-mode", func() { + namespace := "statuses-test-namespace-2-db" + instance = createDefaultDBCustomResource(namespace) + WaitFor(func() error { + return createNamespace(ctx, k8sClient, namespace) + }, "failed to create namespace") + caBundle := reconciler.GetCustomCertificatesBundle(ctx, instance) - Context("When route, deployment and PVC component, but not inference service, exist", func() { - var instance *trustyaiopendatahubiov1alpha1.TrustyAIService - It("Should be available", func() { - namespace := "statuses-test-namespace-2" - instance = createDefaultCR(namespace) + WaitFor(func() error { + return reconciler.reconcileRouteAuth(instance, ctx, reconciler.createRouteObject) + }, "failed to create route") + WaitFor(func() error { + return makeRouteReady(ctx, k8sClient, instance) + }, "failed to make route ready") + WaitFor(func() error { + return reconciler.ensureDeployment(ctx, instance, caBundle, false) + }, "failed to create deployment") + WaitFor(func() error { + return makeDeploymentReady(ctx, k8sClient, instance) + }, "failed to make deployment ready") + WaitFor(func() error { + return k8sClient.Create(ctx, instance) + }, "failed to create TrustyAIService") + + // Call the reconcileStatuses function + WaitFor(func() error { + _, err := reconciler.reconcileStatuses(ctx, instance) + return err + }, "failed to update statuses") + + // Fetch the updated instance + WaitFor(func() error { + return k8sClient.Get(ctx, types.NamespacedName{ + Name: instance.Name, + Namespace: instance.Namespace, + }, instance) + }, "failed to get updated instance") + + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") + if readyCondition != nil { + Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") + } + + availableCondition, statusMatch, err := checkCondition(instance.Status.Conditions, StatusTypeAvailable, corev1.ConditionTrue, false) + Expect(err).NotTo(HaveOccurred(), "Error checking Available condition") + Expect(availableCondition).NotTo(BeNil(), "Available condition should not be null") + Expect(statusMatch).To(Equal(true), "Ready condition should be true") + + routeAvailableCondition, statusMatch, err := checkCondition(instance.Status.Conditions, StatusTypeRouteAvailable, corev1.ConditionTrue, false) + Expect(err).NotTo(HaveOccurred(), "Error checking RouteAvailable condition") + Expect(routeAvailableCondition).NotTo(BeNil(), "RouteAvailable condition should not be null") + Expect(statusMatch).To(Equal(true), "RouteAvailable condition should be true") + + pvcAvailableCondition, _, err := checkCondition(instance.Status.Conditions, StatusTypePVCAvailable, corev1.ConditionTrue, false) + Expect(err).To(HaveOccurred(), "Error checking PVCAvailable condition") + Expect(pvcAvailableCondition).To(BeNil(), "PVCAvailable condition should be null") + + ISPresentCondition, statusMatch, err := checkCondition(instance.Status.Conditions, StatusTypeInferenceServicesPresent, corev1.ConditionFalse, false) + Expect(err).NotTo(HaveOccurred(), "Error checking InferenceServicePresent condition") + Expect(ISPresentCondition).NotTo(BeNil(), "InferenceServicePresent condition should not be null") + Expect(statusMatch).To(Equal(true), "InferenceServicePresent condition should be false") + + }) + It("Should be available in migration-mode", func() { + namespace := "statuses-test-namespace-2-migration" + instance = createDefaultMigrationCustomResource(namespace) WaitFor(func() error { return createNamespace(ctx, k8sClient, namespace) }, "failed to create namespace") @@ -90,7 +238,7 @@ var _ = Describe("Status and condition tests", func() { return makePVCReady(ctx, k8sClient, instance) }, "failed to bind PVC") WaitFor(func() error { - return reconciler.ensureDeployment(ctx, instance, caBundle) + return reconciler.ensureDeployment(ctx, instance, caBundle, false) }, "failed to create deployment") WaitFor(func() error { return makeDeploymentReady(ctx, k8sClient, instance) @@ -138,6 +286,7 @@ var _ = Describe("Status and condition tests", func() { Expect(err).NotTo(HaveOccurred(), "Error checking InferenceServicePresent condition") Expect(ISPresentCondition).NotTo(BeNil(), "InferenceServicePresent condition should not be null") Expect(statusMatch).To(Equal(true), "InferenceServicePresent condition should be false") + }) }) @@ -145,7 +294,7 @@ var _ = Describe("Status and condition tests", func() { var instance *trustyaiopendatahubiov1alpha1.TrustyAIService It("Should be available", func() { namespace := "statuses-test-namespace-2" - instance = createDefaultCR(namespace) + instance = createDefaultPVCCustomResource(namespace) WaitFor(func() error { return createNamespace(ctx, k8sClient, namespace) }, "failed to create namespace") @@ -164,7 +313,7 @@ var _ = Describe("Status and condition tests", func() { return makePVCReady(ctx, k8sClient, instance) }, "failed to bind PVC") WaitFor(func() error { - return reconciler.ensureDeployment(ctx, instance, caBundle) + return reconciler.ensureDeployment(ctx, instance, caBundle, false) }, "failed to create deployment") WaitFor(func() error { return makeDeploymentReady(ctx, k8sClient, instance) diff --git a/controllers/storage_test.go b/controllers/storage_test.go index f11e639..c579903 100644 --- a/controllers/storage_test.go +++ b/controllers/storage_test.go @@ -31,7 +31,7 @@ var _ = Describe("PVC Reconciliation", func() { var instance *trustyaiopendatahubiov1alpha1.TrustyAIService It("should create a new PVC and emit an event", func() { namespace := "pvc-test-namespace-1" - instance = createDefaultCR(namespace) + instance = createDefaultPVCCustomResource(namespace) WaitFor(func() error { return createNamespace(ctx, k8sClient, namespace) }, "failed to create namespace") @@ -54,7 +54,7 @@ var _ = Describe("PVC Reconciliation", func() { var instance *trustyaiopendatahubiov1alpha1.TrustyAIService It("should not attempt to create the PVC", func() { namespace := "pvc-test-namespace-2" - instance = createDefaultCR(namespace) + instance = createDefaultPVCCustomResource(namespace) WaitFor(func() error { return createNamespace(ctx, k8sClient, namespace) }, "failed to create namespace") @@ -77,4 +77,16 @@ var _ = Describe("PVC Reconciliation", func() { }) }) + Context("when a migration CR is made", func() { + var instance *trustyaiopendatahubiov1alpha1.TrustyAIService + It("Check all fields are correct", func() { + namespace := "pvc-test-namespace-3" + instance = createDefaultMigrationCustomResource(namespace) + + Expect(instance.IsMigration()).To(BeTrue()) + Expect(instance.Spec.Storage.IsStoragePVC()).To(BeFalse()) + Expect(instance.Spec.Storage.IsStorageDatabase()).To(BeTrue()) + }) + }) + }) diff --git a/controllers/suite_test.go b/controllers/suite_test.go index ec35b5f..7ee75a9 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -65,8 +65,9 @@ var ( ) const ( - defaultServiceName = "example-trustyai-service" - operatorNamespace = "system" + defaultServiceName = "example-trustyai-service" + defaultDatabaseConfigurationName = defaultServiceName + "-db-credentials" + operatorNamespace = "system" ) const ( @@ -86,8 +87,8 @@ func WaitFor(operation func() error, errorMsg string) { Eventually(operation, defaultTimeout, defaultPolling).Should(Succeed(), errorMsg) } -// createDefaultCR creates a TrustyAIService instance with default values -func createDefaultCR(namespaceCurrent string) *trustyaiopendatahubiov1alpha1.TrustyAIService { +// createDefaultPVCCustomResource creates a TrustyAIService instance with default values and PVC backend +func createDefaultPVCCustomResource(namespaceCurrent string) *trustyaiopendatahubiov1alpha1.TrustyAIService { service := trustyaiopendatahubiov1alpha1.TrustyAIService{ ObjectMeta: metav1.ObjectMeta{ Name: defaultServiceName, @@ -96,7 +97,7 @@ func createDefaultCR(namespaceCurrent string) *trustyaiopendatahubiov1alpha1.Tru }, Spec: trustyaiopendatahubiov1alpha1.TrustyAIServiceSpec{ Storage: trustyaiopendatahubiov1alpha1.StorageSpec{ - Format: "PVC", + Format: STORAGE_PVC, Folder: "/data", Size: "1Gi", }, @@ -112,6 +113,54 @@ func createDefaultCR(namespaceCurrent string) *trustyaiopendatahubiov1alpha1.Tru return &service } +// createDefaultDBCustomResource creates a TrustyAIService instance with default values and DB backend +func createDefaultDBCustomResource(namespaceCurrent string) *trustyaiopendatahubiov1alpha1.TrustyAIService { + service := trustyaiopendatahubiov1alpha1.TrustyAIService{ + ObjectMeta: metav1.ObjectMeta{ + Name: defaultServiceName, + Namespace: namespaceCurrent, + UID: types.UID(uuid.New().String()), + }, + Spec: trustyaiopendatahubiov1alpha1.TrustyAIServiceSpec{ + Storage: trustyaiopendatahubiov1alpha1.StorageSpec{ + Format: STORAGE_DATABASE, + DatabaseConfigurations: defaultDatabaseConfigurationName, + }, + Metrics: trustyaiopendatahubiov1alpha1.MetricsSpec{ + Schedule: "5s", + }, + }, + } + return &service +} + +// createDefaultMigrationCustomResource creates a TrustyAIService instance with default values and both PVC and DB backend +func createDefaultMigrationCustomResource(namespaceCurrent string) *trustyaiopendatahubiov1alpha1.TrustyAIService { + service := trustyaiopendatahubiov1alpha1.TrustyAIService{ + ObjectMeta: metav1.ObjectMeta{ + Name: defaultServiceName, + Namespace: namespaceCurrent, + UID: types.UID(uuid.New().String()), + }, + Spec: trustyaiopendatahubiov1alpha1.TrustyAIServiceSpec{ + Storage: trustyaiopendatahubiov1alpha1.StorageSpec{ + Format: STORAGE_DATABASE, + DatabaseConfigurations: defaultDatabaseConfigurationName, + Folder: "/data", + Size: "1Gi", + }, + Data: trustyaiopendatahubiov1alpha1.DataSpec{ + Filename: "data.csv", + Format: "CSV", + }, + Metrics: trustyaiopendatahubiov1alpha1.MetricsSpec{ + Schedule: "5s", + }, + }, + } + return &service +} + // createNamespace creates a new namespace func createNamespace(ctx context.Context, k8sClient client.Client, namespace string) error { ns := &corev1.Namespace{ @@ -148,6 +197,34 @@ func createConfigMap(namespace string, oauthImage string, trustyaiServiceImage s } } +// createSecret creates a secret in the specified namespace +func createSecret(namespace string, secretName string, data map[string]string) *corev1.Secret { + // Convert the data map values from string to byte array + byteData := make(map[string][]byte) + for key, value := range data { + byteData[key] = []byte(value) + } + + // Define the Secret with the necessary data + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: secretName, + Namespace: namespace, + }, + Data: byteData, + } +} + +func createDatabaseConfiguration(namespace string, name string, dbKind string) *corev1.Secret { + return createSecret(namespace, name, map[string]string{ + "databaseKind": dbKind, + "databaseUsername": "foo", + "databasePassword": "bar", + "databaseService": "mariadb-service", + "databasePort": "3306", + }) +} + // createTrustedCABundleConfigMap creates a ConfigMap in the specified namespace // with the label to inject the trusted CA bundle by OpenShift func createTrustedCABundleConfigMap(namespace string) *corev1.ConfigMap { diff --git a/controllers/templates/service/deployment.tmpl.yaml b/controllers/templates/service/deployment.tmpl.yaml index b9f2392..840421f 100644 --- a/controllers/templates/service/deployment.tmpl.yaml +++ b/controllers/templates/service/deployment.tmpl.yaml @@ -12,6 +12,11 @@ metadata: app.kubernetes.io/part-of: trustyai app.kubernetes.io/version: {{ .Version }} spec: + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 replicas: 1 selector: matchLabels: @@ -38,25 +43,85 @@ spec: - name: trustyai-service image: {{ .ServiceImage }} env: - - name: STORAGE_DATA_FILENAME - value: {{ .Instance.Spec.Data.Filename }} - name: SERVICE_STORAGE_FORMAT value: {{ .Instance.Spec.Storage.Format }} + {{ if eq .Instance.Spec.Storage.Format "PVC" }} + - name: STORAGE_DATA_FILENAME + value: {{ .Instance.Spec.Data.Filename }} + - name: STORAGE_DATA_FOLDER + value: {{ .Instance.Spec.Storage.Folder }} + - name: SERVICE_DATA_FORMAT + value: {{ .Instance.Spec.Data.Format }} + - name: QUARKUS_HIBERNATE_ORM_ACTIVE + value: false + {{ end }} + {{ if .Instance.IsMigration }} + - name: STORAGE_DATA_FILENAME + value: {{ .Instance.Spec.Data.Filename }} - name: STORAGE_DATA_FOLDER value: {{ .Instance.Spec.Storage.Folder }} - name: SERVICE_DATA_FORMAT value: {{ .Instance.Spec.Data.Format }} + {{ end }} + {{ if eq .Instance.Spec.Storage.Format "DATABASE" }} + - name: QUARKUS_HIBERNATE_ORM_ACTIVE + value: true + - name: QUARKUS_DATASOURCE_DB_KIND + valueFrom: + secretKeyRef: + name: {{ .Instance.Spec.Storage.DatabaseConfigurations }} + key: databaseKind + - name: QUARKUS_DATASOURCE_JDBC_MAX_SIZE + value: 16 + - name: QUARKUS_DATASOURCE_USERNAME + valueFrom: + secretKeyRef: + name: {{ .Instance.Spec.Storage.DatabaseConfigurations }} + key: databaseUsername + - name: QUARKUS_DATASOURCE_PASSWORD + valueFrom: + secretKeyRef: + name: {{ .Instance.Spec.Storage.DatabaseConfigurations }} + key: databasePassword + - name: DATABASE_SERVICE + valueFrom: + secretKeyRef: + name: {{ .Instance.Spec.Storage.DatabaseConfigurations }} + key: databaseService + - name: DATABASE_PORT + valueFrom: + secretKeyRef: + name: {{ .Instance.Spec.Storage.DatabaseConfigurations }} + key: databasePort + - name: QUARKUS_DATASOURCE_JDBC_URL + value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database" + - name: SERVICE_DATA_FORMAT + value: "HIBERNATE" + - name: QUARKUS_DATASOURCE_GENERATION + valueFrom: + secretKeyRef: + name: {{ .Instance.Spec.Storage.DatabaseConfigurations }} + key: databaseGeneration + {{ end }} - name: SERVICE_METRICS_SCHEDULE value: {{ .Instance.Spec.Metrics.Schedule }} - name: SERVICE_BATCH_SIZE - value: {{ .Schedule }} + value: {{ .BatchSize }} + {{ if .Instance.IsMigration }} + - name: STORAGE_MIGRATION_CONFIG_FROM_FOLDER + value: {{ .Instance.Spec.Storage.Folder }} + - name: STORAGE_MIGRATION_CONFIG_FROM_FILENAME + value: {{ .Instance.Spec.Data.Filename }} + {{ end }} volumeMounts: - name: {{ .Instance.Name }}-internal readOnly: false mountPath: /etc/tls/internal + {{ if or (eq .Instance.Spec.Storage.Format "PVC") (.Instance.IsMigration) }} - name: {{ .VolumeMountName }} mountPath: {{ .Instance.Spec.Storage.Folder }} readOnly: false + {{ end }} - resources: limits: cpu: 100m @@ -125,9 +190,11 @@ spec: "pods", "verb": "get"}} serviceAccount: {{ .Instance.Name }}-proxy volumes: + {{ if or (eq .Instance.Spec.Storage.Format "PVC") ( .Instance.IsMigration) }} - name: volume persistentVolumeClaim: claimName: {{ .PVCClaimName }} + {{ end }} {{ if .CustomCertificatesBundle.IsDefined }} - name: {{ .CustomCertificatesBundle.VolumeName}} configMap: diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index 00600da..d37fb26 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -136,25 +136,57 @@ func (r *TrustyAIServiceReconciler) Reconcile(ctx context.Context, req ctrl.Requ return RequeueWithDelayMessage(ctx, time.Minute, "Not all replicas are ready, requeue the reconcile request") } - // Ensure PVC - err = r.ensurePVC(ctx, instance) - if err != nil { - // PVC not found condition - log.FromContext(ctx).Error(err, "Error creating PVC storage.") - _, updateErr := r.updateStatus(ctx, instance, UpdatePVCNotAvailable) - if updateErr != nil { - return RequeueWithErrorMessage(ctx, err, "Failed to update status") - } + if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { + // Ensure PVC + err = r.ensurePVC(ctx, instance) + if err != nil { + // PVC not found condition + log.FromContext(ctx).Error(err, "Error creating PVC storage.") + _, updateErr := r.updateStatus(ctx, instance, UpdatePVCNotAvailable) + if updateErr != nil { + return RequeueWithErrorMessage(ctx, err, "Failed to update status") + } - // If there was an error finding the PV, requeue the request - return RequeueWithErrorMessage(ctx, err, "Could not find requested PersistentVolumeClaim.") + // If there was an error finding the PV, requeue the request + return RequeueWithErrorMessage(ctx, err, "Could not find requested PersistentVolumeClaim.") + } + } + if instance.Spec.Storage.IsStorageDatabase() { + // Get database configuration + secret, err := r.findDatabaseSecret(ctx, instance) + if err != nil { + return RequeueWithErrorMessage(ctx, err, "Service configured to use database storage but no database configuration found.") + } + err = r.validateDatabaseSecret(secret) + if err != nil { + return RequeueWithErrorMessage(ctx, err, "Database configuration contains errors.") + } } - // Ensure Deployment object - err = r.ensureDeployment(ctx, instance, caBundle) - if err != nil { - return RequeueWithError(err) + // Check for migration annotation + if _, ok := instance.Annotations[migrationAnnotationKey]; ok { + log.FromContext(ctx).Info("Found migration annotation. Migrating.") + err = r.ensureDeployment(ctx, instance, caBundle, true) + //err = r.redeployForMigration(ctx, instance) + + if err != nil { + return RequeueWithErrorMessage(ctx, err, "Retrying to restart deployment during migration.") + } + + // Remove the migration annotation after processing to avoid restarts + delete(instance.Annotations, migrationAnnotationKey) + log.FromContext(ctx).Info("Deleting annotation") + if err := r.Update(ctx, instance); err != nil { + return RequeueWithErrorMessage(ctx, err, "Failed to remove migration annotation.") + } + } else { + // Ensure Deployment object + err = r.ensureDeployment(ctx, instance, caBundle, false) + log.FromContext(ctx).Info("No annotation found") + if err != nil { + return RequeueWithError(err) + } } // Fetch the TrustyAIService instance @@ -235,6 +267,7 @@ func (r *TrustyAIServiceReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&trustyaiopendatahubiov1alpha1.TrustyAIService{}). + Owns(&appsv1.Deployment{}). Watches(&source.Kind{Type: &kservev1beta1.InferenceService{}}, &handler.EnqueueRequestForObject{}). Watches(&source.Kind{Type: &kservev1alpha1.ServingRuntime{}}, &handler.EnqueueRequestForObject{}). Complete(r) From e9a0204b5df2c90e49473c58ffae3b2e78a21d61 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Mon, 22 Jul 2024 10:40:35 +0100 Subject: [PATCH 02/24] feat: Add TLS certificate mount on ModelMesh (#255) * feat: Add TLS certificate mount on ModelMesh * Revert from http to https until https://github.com/kserve/modelmesh/pull/147 is merged --- controllers/certificates.go | 31 ++++++++++++++++ controllers/constants.go | 2 + controllers/inference_services.go | 62 +++++++++++++++++++++++++++++-- controllers/utils.go | 5 ++- 4 files changed, 95 insertions(+), 5 deletions(-) diff --git a/controllers/certificates.go b/controllers/certificates.go index 7a3fc1f..16faa1f 100644 --- a/controllers/certificates.go +++ b/controllers/certificates.go @@ -3,10 +3,41 @@ package controllers import ( "context" trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" + corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) +const ( + tlsMountPath = "/etc/trustyai/tls" +) + +// TLSCertVolumes holds the volume and volume mount for the TLS certificates +type TLSCertVolumes struct { + volume corev1.Volume + volumeMount corev1.VolumeMount +} + +// createFor creates the required volumes and volume mount for the TLS certificates for a specific Kubernetes secret +func (cert *TLSCertVolumes) createFor(instance *trustyaiopendatahubiov1alpha1.TrustyAIService) { + volume := corev1.Volume{ + Name: instance.Name + "-internal", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: instance.Name + "-internal", + }, + }, + } + + volumeMount := corev1.VolumeMount{ + Name: instance.Name + "-internal", + MountPath: tlsMountPath, + ReadOnly: true, + } + cert.volume = volume + cert.volumeMount = volumeMount +} + func (r *TrustyAIServiceReconciler) GetCustomCertificatesBundle(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) CustomCertificatesBundle { var customCertificatesBundle CustomCertificatesBundle diff --git a/controllers/constants.go b/controllers/constants.go index f55da3a..1fc4eb9 100644 --- a/controllers/constants.go +++ b/controllers/constants.go @@ -9,6 +9,8 @@ const ( serviceMonitorName = "trustyai-metrics" finalizerName = "trustyai.opendatahub.io/finalizer" payloadProcessorName = "MM_PAYLOAD_PROCESSORS" + tlsKeyCertPathName = "MM_TLS_KEY_CERT_PATH" + mmContainerName = "mm" modelMeshLabelKey = "modelmesh-service" modelMeshLabelValue = "modelmesh-serving" volumeMountName = "volume" diff --git a/controllers/inference_services.go b/controllers/inference_services.go index bde2955..773d492 100644 --- a/controllers/inference_services.go +++ b/controllers/inference_services.go @@ -13,6 +13,10 @@ import ( ) func (r *TrustyAIServiceReconciler) patchEnvVarsForDeployments(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService, deployments []appsv1.Deployment, envVarName string, url string, remove bool) (bool, error) { + // Create volume and volume mount for this intance's TLS secrets + certVolumes := TLSCertVolumes{} + certVolumes.createFor(instance) + // Loop over the Deployments for _, deployment := range deployments { @@ -23,8 +27,31 @@ func (r *TrustyAIServiceReconciler) patchEnvVarsForDeployments(ctx context.Conte return false, nil } + // If the secret volume doesn't exist, add it + volumeExists := false + for _, vol := range deployment.Spec.Template.Spec.Volumes { + if vol.Name == instance.Name+"-internal" { + volumeExists = true + break + } + } + if !volumeExists { + deployment.Spec.Template.Spec.Volumes = append(deployment.Spec.Template.Spec.Volumes, certVolumes.volume) + } + // Loop over all containers in the Deployment's Pod template for i := range deployment.Spec.Template.Spec.Containers { + mountExists := false + for _, mount := range deployment.Spec.Template.Spec.Containers[i].VolumeMounts { + if mount.Name == instance.Name+"-internal" { + mountExists = true + break + } + } + if !mountExists { + deployment.Spec.Template.Spec.Containers[i].VolumeMounts = append(deployment.Spec.Template.Spec.Containers[i].VolumeMounts, certVolumes.volumeMount) + } + // Store the original environment variable list // Get the existing env var var envVar *corev1.EnvVar @@ -50,14 +77,17 @@ func (r *TrustyAIServiceReconciler) patchEnvVarsForDeployments(ctx context.Conte } else if envVar != nil { // If the env var exists and already contains the value, don't do anything existingValues := strings.Split(envVar.Value, " ") + valueExists := false for _, v := range existingValues { if v == url { - continue + valueExists = true + break } } - // Modify the existing env var based on the remove flag and current value - envVar.Value = generateEnvVarValue(envVar.Value, url, remove) + if !valueExists { + envVar.Value = generateEnvVarValue(envVar.Value, url, remove) + } } // Only update the deployment if the var value has to change, or we are removing it @@ -70,6 +100,32 @@ func (r *TrustyAIServiceReconciler) patchEnvVarsForDeployments(ctx context.Conte r.eventModelMeshConfigured(instance) log.FromContext(ctx).Info("Updating Deployment " + deployment.Name + ", container spec " + deployment.Spec.Template.Spec.Containers[i].Name + ", env var " + envVarName + " to " + url) } + + // Check TLS environment variable on ModelMesh + if deployment.Spec.Template.Spec.Containers[i].Name == mmContainerName { + tlsKeyCertPathEnvValue := tlsMountPath + "/tls.crt" + tlsKeyCertPathExists := false + for _, envVar := range deployment.Spec.Template.Spec.Containers[i].Env { + if envVar.Name == tlsKeyCertPathName { + tlsKeyCertPathExists = true + break + } + } + + // Doesn't exist, so we can add + if !tlsKeyCertPathExists { + deployment.Spec.Template.Spec.Containers[i].Env = append(deployment.Spec.Template.Spec.Containers[i].Env, corev1.EnvVar{ + Name: tlsKeyCertPathName, + Value: tlsKeyCertPathEnvValue, + }) + + if err := r.Update(ctx, &deployment); err != nil { + log.FromContext(ctx).Error(err, "Could not update Deployment", "Deployment", deployment.Name) + return false, err + } + log.FromContext(ctx).Info("Added environment variable " + tlsKeyCertPathName + " to deployment " + deployment.Name + " for container " + mmContainerName) + } + } } } diff --git a/controllers/utils.go b/controllers/utils.go index 4a8415e..91ee9bd 100644 --- a/controllers/utils.go +++ b/controllers/utils.go @@ -2,9 +2,10 @@ package controllers import ( "context" + "os" + appsv1 "k8s.io/api/apps/v1" "k8s.io/apimachinery/pkg/labels" - "os" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -63,5 +64,5 @@ func (r *TrustyAIServiceReconciler) GetDeploymentsByLabel(ctx context.Context, n // generateServiceURL generates an internal URL for a TrustyAI service func generateServiceURL(crName string, namespace string) string { - return "http://" + crName + "." + namespace + ".svc.cluster.local" + return "http://" + crName + "." + namespace + ".svc" } From c4ef8021149a6c9facba7e7549d9b9689232e1f0 Mon Sep 17 00:00:00 2001 From: Rob Geada Date: Mon, 29 Jul 2024 10:34:16 +0100 Subject: [PATCH 03/24] Pin oc version, ubi version (#263) --- tests/Dockerfile | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/tests/Dockerfile b/tests/Dockerfile index 9465467..5677c36 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -1,18 +1,17 @@ -FROM registry.access.redhat.com/ubi8:8.10-901.1716497712 +FROM registry.access.redhat.com/ubi8:8.10-1020 ARG ORG=trustyai-explainability ARG BRANCH=main ARG ODS_CI_REPO=https://github.com/red-hat-data-services/ods-ci # This git reference should always reference a stable commit from ods-ci that supports ODH # This hash corresponds to a March 24th, 2023 commit -ARG ODS_CI_GITREF=867a617bc224726cf98fa3354293f8e50b4f5eb5 -ARG OC_CLI_URL=https://mirror.openshift.com/pub/openshift-v4/amd64/clients/ocp/latest/openshift-client-linux.tar.gz +ARG ODS_CI_GITREF=a8cf770b37caa4ef7ce6596acc8bdd6866cc7772 +ARG OC_CLI_URL=https://mirror.openshift.com/pub/openshift-v4/amd64/clients/ocp/4.14.33/openshift-client-linux.tar.gz ENV HOME /root WORKDIR /root -RUN dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm &&\ - dnf install -y jq bc git go-toolset python3.11 python3.11-pip python3.11-devel unzip && \ +RUN dnf install -y jq bc git go-toolset python3.11 python3.11-devel python3.11-pip unzip && \ dnf clean all && \ git clone https://github.com/opendatahub-io/peak $HOME/peak && \ cd $HOME/peak && \ @@ -22,14 +21,6 @@ RUN dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.n RUN curl -L https://github.com/mikefarah/yq/releases/download/v4.25.1/yq_linux_amd64 -o /usr/bin/yq &&\ chmod +x /usr/bin/yq -RUN mkdir -p $HOME/src && \ - cd $HOME/src && \ - git clone --depth=1 --branch ${BRANCH} https://github.com/${ORG}/trustyai-explainability && \ - # Clone ods-ci repo at specified git ref for the ODH Dashboard webUI tests - git clone --depth=1 ${ODS_CI_REPO} ods-ci && cd ods-ci && \ - git fetch origin ${ODS_CI_GITREF} && git checkout FETCH_HEAD && \ - chmod -R 777 $HOME/src - # Use a specific destination file name in case the url download name changes ADD ${OC_CLI_URL} $HOME/peak/oc-cli.tar.gz RUN tar -C /usr/local/bin -xvf $HOME/peak/oc-cli.tar.gz && \ @@ -37,16 +28,6 @@ RUN tar -C /usr/local/bin -xvf $HOME/peak/oc-cli.tar.gz && \ COPY Pipfile Pipfile.lock $HOME/peak/ -RUN pip3 install micropipenv &&\ - ln -s `which pip3` /usr/bin/pip &&\ - cd $HOME/peak &&\ - micropipenv install - -# Install poetry to support the exeuction of ods-ci test framework -RUN curl -sSL https://install.python-poetry.org | python3 - -ENV PATH="${PATH}:$HOME/.local/bin" -RUN cd $HOME/src/ods-ci && poetry install - ## Grab CI scripts from single-source-of-truth RUN mkdir -p $HOME/peak/operator-tests/trustyai-explainability/ &&\ mkdir $HOME/kfdef/ &&\ From 883c3fbb18656943b3a3b35e418b20c75e13296e Mon Sep 17 00:00:00 2001 From: Rob Geada Date: Mon, 29 Jul 2024 11:46:00 +0100 Subject: [PATCH 04/24] Restore checkout of trustyai-exp (#265) --- tests/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/Dockerfile b/tests/Dockerfile index 5677c36..e39ebdf 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -21,6 +21,11 @@ RUN dnf install -y jq bc git go-toolset python3.11 python3.11-devel python3.11-p RUN curl -L https://github.com/mikefarah/yq/releases/download/v4.25.1/yq_linux_amd64 -o /usr/bin/yq &&\ chmod +x /usr/bin/yq +RUN mkdir -p $HOME/src && \ + cd $HOME/src && \ + git clone --depth=1 --branch ${BRANCH} https://github.com/${ORG}/trustyai-explainability && \ + chmod -R 777 $HOME/src + # Use a specific destination file name in case the url download name changes ADD ${OC_CLI_URL} $HOME/peak/oc-cli.tar.gz RUN tar -C /usr/local/bin -xvf $HOME/peak/oc-cli.tar.gz && \ From 4a52d652652501a63c4ed450ce8b13c159510d6a Mon Sep 17 00:00:00 2001 From: Rob Geada Date: Mon, 29 Jul 2024 14:40:58 +0100 Subject: [PATCH 05/24] Add operator installation robustness (#266) --- tests/scripts/install.sh | 63 ++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/tests/scripts/install.sh b/tests/scripts/install.sh index 46eb7d2..cc47a54 100755 --- a/tests/scripts/install.sh +++ b/tests/scripts/install.sh @@ -14,14 +14,44 @@ if ! [ -z "${SKIP_OPERATOR_INSTALL}" ]; then ./setup.sh -t ~/peak/operatorsetup 2>&1 else echo "Installing operator from community marketplace" - while [[ $retry -gt 0 ]]; do - # patch bug in peak setup script - sed -i "s/path=\"{.status.channels.*/ | jq '.status.channels | .[0].currentCSVDesc.installModes | map(select(.type == \"AllNamespaces\")) | .[0].supported')/" setup.sh - sed -i "s/csource=.*/echo \$3; csource=\$3/" setup.sh - sed -i 's/installop \$.*/installop \${vals[0]} \${vals[1]} \${vals[3]}/' setup.sh + start_t=$(date +%s) 2>&1 + ready=false 2>&1 + while ! $ready; do + CATALOG_SOURCES=$(oc get catalogsources -n openshift-marketplace 2> /dev/null | grep 'community-operators') + if [ ! -z "${CATALOG_SOURCES}" ]; then + echo $CATALOG_SOURCES + ready=true 2>&1 + else + sleep 10 + fi + if [ $(($(date +%s)-start_t)) -gt 300 ]; then + echo "Marketplace pods never started" + exit 1 + fi + done + + start_t=$(date +%s) 2>&1 + ready=false 2>&1 + while ! $ready; do + MANIFESTS=$(oc get packagemanifests -n openshift-marketplace 2> /dev/null | grep 'opendatahub') + echo $MANIFESTS + if [ ! -z "${MANIFESTS}" ]; then + echo $MANIFESTS + ready=true 2>&1 + else + sleep 10 + fi + if [ $(($(date +%s)-start_t)) -gt 900 ]; then + echo "Package manifests never downloaded" + exit 1 + fi + done + + while [[ $retry -gt 0 ]]; do + ./setup.sh -o ~/peak/operatorsetup\ - ./setup.sh -o ~/peak/operatorsetup + # approve installplans if [ $? -eq 0 ]; then retry=-1 else @@ -31,11 +61,16 @@ else fi retry=$(( retry - 1)) + sleep 30 + echo "Approving Install Plans, if needed" + oc patch installplan $(oc get installplan -n openshift-operators | grep $ODH_VERSION | awk '{print $1}') -n openshift-operators --type merge --patch '{"spec":{"approved":true}}' || true + oc patch installplan $(oc get installplan -n openshift-operators | grep authorino | awk '{print $1}') -n openshift-operators --type merge --patch '{"spec":{"approved":true}}' || true + finished=false 2>&1 start_t=$(date +%s) 2>&1 echo "Verifying installation of ODH operator" while ! $finished; do - if [ ! -z "$(oc get pods -n openshift-operators | grep 'opendatahub-operator-controller-manager' | grep '1/1')" ]; then + if [ ! -z "$(oc get pods -n openshift-operators | grep 'opendatahub-operator-controller-manager' | grep '1/1')" ]; then finished=true 2>&1 else sleep 10 @@ -50,20 +85,6 @@ else done fi -#popd -### Grabbing and applying the patch in the PR we are testing -#pushd ~/src/${REPO_NAME} -#if [ -z "$PULL_NUMBER" ]; then -# echo "No pull number, assuming nightly run" -#else -# if [ $REPO_OWNER == "trustyai-explainability" ]; then -# curl -O -L https://github.com/${REPO_OWNER}/${REPO_NAME}/pull/${PULL_NUMBER}.patch -# echo "Applying followng patch:" -# cat ${PULL_NUMBER}.patch > ${ARTIFACT_DIR}/github-pr-${PULL_NUMBER}.patch -# git apply ${PULL_NUMBER}.patch -# fi -#fi - popd ## Point manifests repo uri in the KFDEF to the manifests in the PR pushd ~/kfdef From ec4462ed07b7b6f0d86e985bb83db015f175ea29 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Mon, 29 Jul 2024 21:07:58 +0100 Subject: [PATCH 06/24] fix: Skip InferenceService patching for KServe RawDeployment (#262) --- controllers/inference_services.go | 36 ++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/controllers/inference_services.go b/controllers/inference_services.go index 773d492..5f22e47 100644 --- a/controllers/inference_services.go +++ b/controllers/inference_services.go @@ -12,6 +12,12 @@ import ( "strings" ) +const ( + DEPLOYMENT_MODE_MODELMESH = "ModelMesh" + DEPLOYMENT_MODE_RAW = "RawDeployment" + DEPLOYMENT_MODE_SERVERLESS = "Serverless" +) + func (r *TrustyAIServiceReconciler) patchEnvVarsForDeployments(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService, deployments []appsv1.Deployment, envVarName string, url string, remove bool) (bool, error) { // Create volume and volume mount for this intance's TLS secrets certVolumes := TLSCertVolumes{} @@ -199,20 +205,26 @@ func (r *TrustyAIServiceReconciler) handleInferenceServices(ctx context.Context, for _, infService := range inferenceServices.Items { annotations := infService.GetAnnotations() - // Check the annotation "serving.kserve.io/deploymentMode: ModelMesh" - if val, ok := annotations["serving.kserve.io/deploymentMode"]; ok && val == "ModelMesh" { - shouldContinue, err := r.patchEnvVarsByLabelForDeployments(ctx, instance, namespace, labelKey, labelValue, envVarName, crName, remove) - if err != nil { - log.FromContext(ctx).Error(err, "Could not patch environment variables for ModelMesh deployments.") - return shouldContinue, err - } - } else { - err := r.patchKServe(ctx, instance, infService, namespace, crName, remove) - if err != nil { - log.FromContext(ctx).Error(err, "Could not path InferenceLogger for KServe deployment.") - return false, err + + // Check the annotation "serving.kserve.io/deploymentMode" + if val, ok := annotations["serving.kserve.io/deploymentMode"]; ok { + if val == DEPLOYMENT_MODE_RAW { + log.FromContext(ctx).Info("RawDeployment mode not supported by TrustyAI") + continue + } else if val == DEPLOYMENT_MODE_MODELMESH { + shouldContinue, err := r.patchEnvVarsByLabelForDeployments(ctx, instance, namespace, labelKey, labelValue, envVarName, crName, remove) + if err != nil { + log.FromContext(ctx).Error(err, "could not patch environment variables for ModelMesh deployments") + return shouldContinue, err + } + continue } } + err := r.patchKServe(ctx, instance, infService, namespace, crName, remove) + if err != nil { + log.FromContext(ctx).Error(err, "could not patch InferenceLogger for KServe deployment") + return false, err + } } return true, nil } From c537e14a45b1b9d537e764fc1e6d6a0ff76a56c7 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Wed, 31 Jul 2024 12:56:21 +0100 Subject: [PATCH 07/24] feat: ConfigMap key to disable KServe Serverless configuration (#267) --- config/base/kustomization.yaml | 7 ++++++ config/base/params.env | 1 + controllers/config_maps.go | 38 +++++++++++++++++++++++++++++++ controllers/constants.go | 7 +++--- controllers/inference_services.go | 16 +++++++++---- 5 files changed, 62 insertions(+), 7 deletions(-) diff --git a/config/base/kustomization.yaml b/config/base/kustomization.yaml index ab0b973..0247b9b 100644 --- a/config/base/kustomization.yaml +++ b/config/base/kustomization.yaml @@ -41,3 +41,10 @@ vars: apiVersion: v1 fieldref: fieldpath: data.oauthProxyImage + - name: kServeServerless + objref: + kind: ConfigMap + name: config + apiVersion: v1 + fieldref: + fieldpath: data.kServeServerless \ No newline at end of file diff --git a/config/base/params.env b/config/base/params.env index 68d67aa..a0f5419 100644 --- a/config/base/params.env +++ b/config/base/params.env @@ -1,3 +1,4 @@ trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 +kServeServerless=disabled \ No newline at end of file diff --git a/controllers/config_maps.go b/controllers/config_maps.go index 6eceb84..a4885ab 100644 --- a/controllers/config_maps.go +++ b/controllers/config_maps.go @@ -46,6 +46,44 @@ func (r *TrustyAIServiceReconciler) getImageFromConfigMap(ctx context.Context, k } } +// getKServeServerlessConfig checks the kServeServerless value in a ConfigMap in the operator's namespace +func (r *TrustyAIServiceReconciler) getKServeServerlessConfig(ctx context.Context) (bool, error) { + + if r.Namespace != "" { + // Define the key for the ConfigMap + configMapKey := types.NamespacedName{ + Namespace: r.Namespace, + Name: imageConfigMap, + } + + // Create an empty ConfigMap object + var cm corev1.ConfigMap + + // Try to get the ConfigMap + if err := r.Get(ctx, configMapKey, &cm); err != nil { + if errors.IsNotFound(err) { + // ConfigMap not found, return false as the default behavior + return false, nil + } + // Other error occurred when trying to fetch the ConfigMap + return false, fmt.Errorf("error reading configmap %s", configMapKey) + } + + // ConfigMap is found, extract the kServeServerless value + kServeServerless, ok := cm.Data[configMapkServeServerlessKey] + + if !ok || kServeServerless != "enabled" { + // Key is missing or its value is not "enabled", return false + return false, nil + } + + // kServeServerless is "enabled" + return true, nil + } else { + return false, nil + } +} + // getConfigMapNamesWithLabel retrieves the names of ConfigMaps that have the specified label func (r *TrustyAIServiceReconciler) getConfigMapNamesWithLabel(ctx context.Context, namespace string, labelSelector client.MatchingLabels) ([]string, error) { configMapList := &corev1.ConfigMapList{} diff --git a/controllers/constants.go b/controllers/constants.go index 1fc4eb9..2c7081b 100644 --- a/controllers/constants.go +++ b/controllers/constants.go @@ -26,9 +26,10 @@ const ( // Configuration constants const ( - imageConfigMap = "trustyai-service-operator-config" - configMapOAuthProxyImageKey = "oauthProxyImage" - configMapServiceImageKey = "trustyaiServiceImage" + imageConfigMap = "trustyai-service-operator-config" + configMapOAuthProxyImageKey = "oauthProxyImage" + configMapServiceImageKey = "trustyaiServiceImage" + configMapkServeServerlessKey = "kServeServerless" ) // OAuth constants diff --git a/controllers/inference_services.go b/controllers/inference_services.go index 5f22e47..db426d4 100644 --- a/controllers/inference_services.go +++ b/controllers/inference_services.go @@ -199,6 +199,12 @@ func (r *TrustyAIServiceReconciler) handleInferenceServices(ctx context.Context, return false, err } + kServeServerlessEnabled, err := r.getKServeServerlessConfig(ctx) + if err != nil { + log.FromContext(ctx).Error(err, "Could not read KServeServerless configuration. Defaulting to disabled") + kServeServerlessEnabled = false + } + if len(inferenceServices.Items) == 0 { return true, nil } @@ -220,10 +226,12 @@ func (r *TrustyAIServiceReconciler) handleInferenceServices(ctx context.Context, continue } } - err := r.patchKServe(ctx, instance, infService, namespace, crName, remove) - if err != nil { - log.FromContext(ctx).Error(err, "could not patch InferenceLogger for KServe deployment") - return false, err + if kServeServerlessEnabled { + err := r.patchKServe(ctx, instance, infService, namespace, crName, remove) + if err != nil { + log.FromContext(ctx).Error(err, "could not patch InferenceLogger for KServe deployment") + return false, err + } } } return true, nil From 1515872f008fd8c646e726a25f0881025dea1ee1 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Thu, 1 Aug 2024 18:54:49 +0100 Subject: [PATCH 08/24] feat: Add support for custom certificates in database connection (#259) --- controllers/deployment.go | 15 ++++++++ controllers/secrets.go | 38 +++++++++++-------- .../templates/service/deployment.tmpl.yaml | 17 ++++++++- 3 files changed, 54 insertions(+), 16 deletions(-) diff --git a/controllers/deployment.go b/controllers/deployment.go index 816c9c1..81be18a 100644 --- a/controllers/deployment.go +++ b/controllers/deployment.go @@ -38,6 +38,7 @@ type DeploymentConfig struct { CustomCertificatesBundle CustomCertificatesBundle Version string BatchSize int + UseDBTLSCerts bool } // createDeploymentObject returns a Deployment for the TrustyAI Service instance @@ -70,6 +71,20 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context, BatchSize: batchSize, } + if instance.Spec.Storage.IsStorageDatabase() { + _, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace) + if err != nil { + deploymentConfig.UseDBTLSCerts = false + log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found") + } else { + deploymentConfig.UseDBTLSCerts = true + log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls") + } + } else { + deploymentConfig.UseDBTLSCerts = false + log.FromContext(ctx).Info("No need to check database secrets. Using PVC-mode.") + } + var deployment *appsv1.Deployment deployment, err = templateParser.ParseResource[appsv1.Deployment](deploymentTemplatePath, deploymentConfig, reflect.TypeOf(&appsv1.Deployment{})) if err != nil { diff --git a/controllers/secrets.go b/controllers/secrets.go index 6e2d49b..090ab58 100644 --- a/controllers/secrets.go +++ b/controllers/secrets.go @@ -9,34 +9,42 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +// getSecret retrieves a secret if it exists, returns an error if not +func (r *TrustyAIServiceReconciler) getSecret(ctx context.Context, name, namespace string) (*corev1.Secret, error) { + secret := &corev1.Secret{} + err := r.Get(ctx, client.ObjectKey{Name: name, Namespace: namespace}, secret) + if err != nil { + if errors.IsNotFound(err) { + return nil, fmt.Errorf("secret %s not found in namespace %s: %w", name, namespace, err) + } + return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", name, namespace, err) + } + return secret, nil +} + // findDatabaseSecret finds the DB configuration secret named (specified or default) in the same namespace as the CR func (r *TrustyAIServiceReconciler) findDatabaseSecret(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (*corev1.Secret, error) { databaseConfigurationsName := instance.Spec.Storage.DatabaseConfigurations defaultDatabaseConfigurationsName := instance.Name + dbCredentialsSuffix - secret := &corev1.Secret{} - if databaseConfigurationsName != "" { - secret := &corev1.Secret{} - err := r.Get(ctx, client.ObjectKey{Name: databaseConfigurationsName, Namespace: instance.Namespace}, secret) - if err == nil { - return secret, nil + secret, err := r.getSecret(ctx, databaseConfigurationsName, instance.Namespace) + if err != nil { + return nil, err } - if !errors.IsNotFound(err) { - return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", databaseConfigurationsName, instance.Namespace, err) + if secret != nil { + return secret, nil } } else { // If specified not found, try the default - - err := r.Get(ctx, client.ObjectKey{Name: defaultDatabaseConfigurationsName, Namespace: instance.Namespace}, secret) - if err == nil { - return secret, nil + secret, err := r.getSecret(ctx, defaultDatabaseConfigurationsName, instance.Namespace) + if err != nil { + return nil, err } - if !errors.IsNotFound(err) { - return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", defaultDatabaseConfigurationsName, instance.Namespace, err) + if secret != nil { + return secret, nil } - } return nil, fmt.Errorf("neither secret %s nor %s found in namespace %s", databaseConfigurationsName, defaultDatabaseConfigurationsName, instance.Namespace) diff --git a/controllers/templates/service/deployment.tmpl.yaml b/controllers/templates/service/deployment.tmpl.yaml index 840421f..fa8a9c7 100644 --- a/controllers/templates/service/deployment.tmpl.yaml +++ b/controllers/templates/service/deployment.tmpl.yaml @@ -94,7 +94,11 @@ spec: name: {{ .Instance.Spec.Storage.DatabaseConfigurations }} key: databasePort - name: QUARKUS_DATASOURCE_JDBC_URL + {{ if .UseDBTLSCerts }} + value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database?sslMode=verify-ca&serverSslCert=/etc/tls/db/tls.crt" + {{ else }} value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database" + {{ end }} - name: SERVICE_DATA_FORMAT value: "HIBERNATE" - name: QUARKUS_DATASOURCE_GENERATION @@ -121,7 +125,12 @@ spec: - name: {{ .VolumeMountName }} mountPath: {{ .Instance.Spec.Storage.Folder }} readOnly: false - {{ end }} + {{ end }} + {{ if .UseDBTLSCerts }} + - name: db-tls-certs + mountPath: /etc/tls/db + readOnly: true + {{ end }} - resources: limits: cpu: 100m @@ -209,3 +218,9 @@ spec: secret: secretName: {{ .Instance.Name }}-internal defaultMode: 420 + {{ if .UseDBTLSCerts }} + - name: db-tls-certs + secret: + secretName: {{ .Instance.Name }}-db-tls + defaultMode: 420 + {{ end }} From 91ba2c56fb0ae4aeaa17c610d637e3df394bb10a Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Thu, 1 Aug 2024 18:55:17 +0100 Subject: [PATCH 09/24] Add TLS endpoint for ModelMesh payload processors. (#268) Keep non-TLS endpoint for KServe Serverless (disabled by default) --- controllers/inference_services.go | 4 ++-- controllers/utils.go | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/controllers/inference_services.go b/controllers/inference_services.go index db426d4..b49a526 100644 --- a/controllers/inference_services.go +++ b/controllers/inference_services.go @@ -147,7 +147,7 @@ func (r *TrustyAIServiceReconciler) patchEnvVarsByLabelForDeployments(ctx contex } // Build the payload processor endpoint - url := generateServiceURL(crName, namespace) + "/consumer/kserve/v2" + url := generateTLSServiceURL(crName, namespace) + "/consumer/kserve/v2" // Patch environment variables for the Deployments if shouldContinue, err := r.patchEnvVarsForDeployments(ctx, instance, deployments, envVarName, url, remove); err != nil { @@ -240,7 +240,7 @@ func (r *TrustyAIServiceReconciler) handleInferenceServices(ctx context.Context, // patchKServe adds a TrustyAI service as an InferenceLogger to a KServe InferenceService func (r *TrustyAIServiceReconciler) patchKServe(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService, infService kservev1beta1.InferenceService, namespace string, crName string, remove bool) error { - url := generateServiceURL(crName, namespace) + url := generateNonTLSServiceURL(crName, namespace) if remove { if infService.Spec.Predictor.Logger == nil || *infService.Spec.Predictor.Logger.URL != url { diff --git a/controllers/utils.go b/controllers/utils.go index 91ee9bd..e96fc4e 100644 --- a/controllers/utils.go +++ b/controllers/utils.go @@ -62,7 +62,12 @@ func (r *TrustyAIServiceReconciler) GetDeploymentsByLabel(ctx context.Context, n return deployments.Items, nil } -// generateServiceURL generates an internal URL for a TrustyAI service -func generateServiceURL(crName string, namespace string) string { +// generateTLSServiceURL generates an internal URL for a TLS-enabled TrustyAI service +func generateTLSServiceURL(crName string, namespace string) string { + return "https://" + crName + "." + namespace + ".svc" +} + +// generateNonTLSServiceURL generates an internal URL for a TrustyAI service +func generateNonTLSServiceURL(crName string, namespace string) string { return "http://" + crName + "." + namespace + ".svc" } From d552762ee3ca7173e71932d91cad61afe2bdf80a Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Mon, 5 Aug 2024 13:37:38 +0100 Subject: [PATCH 10/24] fix: Correct maxSurge and maxUnavailable (#275) --- controllers/templates/service/deployment.tmpl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/controllers/templates/service/deployment.tmpl.yaml b/controllers/templates/service/deployment.tmpl.yaml index fa8a9c7..da69b24 100644 --- a/controllers/templates/service/deployment.tmpl.yaml +++ b/controllers/templates/service/deployment.tmpl.yaml @@ -15,8 +15,8 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxUnavailable: 0 - maxSurge: 1 + maxUnavailable: 1 + maxSurge: 0 replicas: 1 selector: matchLabels: From 4458d0d726118cd25ec97671f63f7d750ac110d4 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Wed, 7 Aug 2024 19:26:30 +0100 Subject: [PATCH 11/24] feat: Add support for custom DB names (#257) * feat: Add support for custom DB names * fix: Correct custom DB name --- controllers/deployment_test.go | 22 +++++++++++++++---- controllers/secrets.go | 9 +++++++- controllers/suite_test.go | 3 ++- .../templates/service/deployment.tmpl.yaml | 9 ++++++-- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/controllers/deployment_test.go b/controllers/deployment_test.go index de80a1c..0aa0f92 100644 --- a/controllers/deployment_test.go +++ b/controllers/deployment_test.go @@ -359,7 +359,7 @@ var _ = Describe("TrustyAI operator", func() { namespace := "trusty-ns-a-1-db" instance = createDefaultDBCustomResource(namespace) WaitFor(func() error { - secret := createDatabaseConfiguration(namespace, defaultDatabaseConfigurationName, "mysql") + secret := createDatabaseConfiguration(namespace, defaultDatabaseConfigurationName, "mysql", "trustyai_service") return k8sClient.Create(ctx, secret) }, "failed to create ConfigMap") setupAndTestDeploymentDefault(instance, namespace) @@ -368,7 +368,7 @@ var _ = Describe("TrustyAI operator", func() { namespace := "trusty-ns-a-1-db" instance = createDefaultDBCustomResource(namespace) WaitFor(func() error { - secret := createDatabaseConfiguration(namespace, defaultDatabaseConfigurationName, "mariadb") + secret := createDatabaseConfiguration(namespace, defaultDatabaseConfigurationName, "mariadb", "trustyai_service") return k8sClient.Create(ctx, secret) }, "failed to create ConfigMap") setupAndTestDeploymentDefault(instance, namespace) @@ -584,9 +584,16 @@ var _ = Describe("TrustyAI operator", func() { Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databasePort"), "Secret key does not match") + envVar = foundEnvVar(trustyaiServiceContainer.Env, "DATABASE_NAME") + Expect(envVar).NotTo(BeNil(), "Env var DATABASE_NAME not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var DATABASE_NAME does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var DATABASE_NAME is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databaseName"), "Secret key does not match") + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_JDBC_URL") Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_JDBC_URL not found") - Expect(envVar.Value).To(Equal("jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database")) + Expect(envVar.Value).To(Equal("jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/${DATABASE_NAME}")) }) @@ -695,9 +702,16 @@ var _ = Describe("TrustyAI operator", func() { Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databasePort"), "Secret key does not match") + envVar = foundEnvVar(trustyaiServiceContainer.Env, "DATABASE_NAME") + Expect(envVar).NotTo(BeNil(), "Env var DATABASE_NAME not found") + Expect(envVar.ValueFrom).NotTo(BeNil(), "Env var DATABASE_NAME does not have ValueFrom set") + Expect(envVar.ValueFrom.SecretKeyRef).NotTo(BeNil(), "Env var DATABASE_NAME is not using SecretKeyRef") + Expect(envVar.ValueFrom.SecretKeyRef.Name).To(Equal(defaultDatabaseConfigurationName), "Secret name does not match") + Expect(envVar.ValueFrom.SecretKeyRef.Key).To(Equal("databaseName"), "Secret key does not match") + envVar = foundEnvVar(trustyaiServiceContainer.Env, "QUARKUS_DATASOURCE_JDBC_URL") Expect(envVar).NotTo(BeNil(), "Env var QUARKUS_DATASOURCE_JDBC_URL not found") - Expect(envVar.Value).To(Equal("jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database")) + Expect(envVar.Value).To(Equal("jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/${DATABASE_NAME}")) }) diff --git a/controllers/secrets.go b/controllers/secrets.go index 090ab58..499d768 100644 --- a/controllers/secrets.go +++ b/controllers/secrets.go @@ -53,7 +53,14 @@ func (r *TrustyAIServiceReconciler) findDatabaseSecret(ctx context.Context, inst // validateDatabaseSecret validates the DB configuration secret func (r *TrustyAIServiceReconciler) validateDatabaseSecret(secret *corev1.Secret) error { - mandatoryKeys := []string{"databaseKind", "databaseUsername", "databasePassword", "databaseService", "databasePort"} + mandatoryKeys := []string{ + "databaseKind", + "databaseUsername", + "databasePassword", + "databaseService", + "databasePort", + "databaseName", + } for _, key := range mandatoryKeys { value, exists := secret.Data[key] diff --git a/controllers/suite_test.go b/controllers/suite_test.go index 7ee75a9..e61f4ac 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -215,13 +215,14 @@ func createSecret(namespace string, secretName string, data map[string]string) * } } -func createDatabaseConfiguration(namespace string, name string, dbKind string) *corev1.Secret { +func createDatabaseConfiguration(namespace string, name string, dbKind string, databaseName string) *corev1.Secret { return createSecret(namespace, name, map[string]string{ "databaseKind": dbKind, "databaseUsername": "foo", "databasePassword": "bar", "databaseService": "mariadb-service", "databasePort": "3306", + "databaseName": databaseName, }) } diff --git a/controllers/templates/service/deployment.tmpl.yaml b/controllers/templates/service/deployment.tmpl.yaml index da69b24..f1e429f 100644 --- a/controllers/templates/service/deployment.tmpl.yaml +++ b/controllers/templates/service/deployment.tmpl.yaml @@ -93,11 +93,16 @@ spec: secretKeyRef: name: {{ .Instance.Spec.Storage.DatabaseConfigurations }} key: databasePort + - name: DATABASE_NAME + valueFrom: + secretKeyRef: + name: {{ .Instance.Spec.Storage.DatabaseConfigurations }} + key: databaseName - name: QUARKUS_DATASOURCE_JDBC_URL {{ if .UseDBTLSCerts }} - value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database?sslMode=verify-ca&serverSslCert=/etc/tls/db/tls.crt" + value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/${DATABASE_NAME}?sslMode=verify-ca&serverSslCert=/etc/tls/db/tls.crt" {{ else }} - value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database" + value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/${DATABASE_NAME}" {{ end }} - name: SERVICE_DATA_FORMAT value: "HIBERNATE" From 1c92fc6421547aada556d66bd1c5af3b75976ae3 Mon Sep 17 00:00:00 2001 From: Adolfo Aguirrezabal Date: Thu, 22 Aug 2024 14:11:54 +0200 Subject: [PATCH 12/24] [CI] Run tests from trustyai-tests (#279) * Change Dockerfile to clone trustyai-tests * Add PYTEST_MARKERS env and remove TESTS_REGEX --- tests/Dockerfile | 19 ++++++++----------- tests/Makefile | 10 ++++++---- tests/README.md | 6 +----- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/tests/Dockerfile b/tests/Dockerfile index e39ebdf..833e698 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -1,11 +1,7 @@ FROM registry.access.redhat.com/ubi8:8.10-1020 - ARG ORG=trustyai-explainability ARG BRANCH=main ARG ODS_CI_REPO=https://github.com/red-hat-data-services/ods-ci -# This git reference should always reference a stable commit from ods-ci that supports ODH -# This hash corresponds to a March 24th, 2023 commit -ARG ODS_CI_GITREF=a8cf770b37caa4ef7ce6596acc8bdd6866cc7772 ARG OC_CLI_URL=https://mirror.openshift.com/pub/openshift-v4/amd64/clients/ocp/4.14.33/openshift-client-linux.tar.gz ENV HOME /root @@ -36,18 +32,19 @@ COPY Pipfile Pipfile.lock $HOME/peak/ ## Grab CI scripts from single-source-of-truth RUN mkdir -p $HOME/peak/operator-tests/trustyai-explainability/ &&\ mkdir $HOME/kfdef/ &&\ - cp -r $HOME/src/trustyai-explainability/tests/resources/ $HOME/peak/operator-tests/trustyai-explainability/resources &&\ - cp $HOME/src/trustyai-explainability/tests/util $HOME/peak/operator-tests/trustyai-explainability &&\ - cp -r $HOME/src/trustyai-explainability/tests/basictests $HOME/peak/operator-tests/trustyai-explainability/basictests &&\ cp -r $HOME/src/trustyai-explainability/tests/setup/odh-*.yaml $HOME/kfdef/ &&\ cp -r $HOME/src/trustyai-explainability/tests/setup/*setup $HOME/peak/ &&\ cp -r $HOME/src/trustyai-explainability/tests/scripts/installandtest.sh $HOME/peak/ +# Install poetry to support the exeuction of trustyai-tests +RUN curl -sSL https://install.python-poetry.org | python3 - +ENV PATH="${PATH}:$HOME/.local/bin" +RUN cd $HOME/peak && \ + git clone https://github.com/trustyai-explainability/trustyai-tests.git && \ + cd trustyai-tests && \ + poetry install + COPY scripts/install.sh $HOME/peak/ -#COPY resources $HOME/peak/operator-tests/trustyai-explainability/resources -#COPY util $HOME/peak/operator-tests/trustyai-explainability -#COPY setup/odh-core.yaml $HOME/kfdef/ -#COPY basictests $HOME/peak/operator-tests/trustyai-explainability/basictests RUN chmod -R 777 $HOME/kfdef && \ mkdir -p $HOME/.kube && \ diff --git a/tests/Makefile b/tests/Makefile index 8fa5698..a82e260 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -15,8 +15,8 @@ OPENSHIFT_TESTUSER_PASS= OPENSHIFT_TESTUSER_LOGIN_PROVIDER= # Setting SKIP_INSTALL will let you run the tests against an ODH instance that is already setup SKIP_INSTALL= -# Setting TESTS_REGEX will allow you to change which tests are going to be run -TESTS_REGEX= +# Pytest markers to select the tests that will be executed +PYTEST_MARKERS= # Location inside the container where CI system will retrieve files after a test run ARTIFACT_DIR=/tmp/artifacts LOCAL_ARTIFACT_DIR="${PWD}/artifacts" @@ -25,17 +25,19 @@ BUILD_TOOL?=podman NO_CACHE?=false LOCAL?=false TEARDOWN?=false +PLATFORM?=linux/amd64 + all: test test: build run clean build: - ${BUILD_TOOL} build -t $(IMAGE) --build-arg ORG=$(GIT_ORG) --build-arg BRANCH=$(GIT_BRANCH) --build-arg ODS_CI_REPO=$(ODS_CI_REPO) --build-arg ODS_CI_GITREF=$(ODS_CI_GITREF) --build-arg OC_CLI_URL=$(OC_CLI_URL) . + ${BUILD_TOOL} build -t $(IMAGE) --build-arg ORG=$(GIT_ORG) --build-arg BRANCH=$(GIT_BRANCH) --build-arg ODS_CI_REPO=$(ODS_CI_REPO) --build-arg ODS_CI_GITREF=$(ODS_CI_GITREF) --build-arg OC_CLI_URL=$(OC_CLI_URL) --platform=$(PLATFORM) . --progress=plain run: # Confirm that we have a directory for storing any screenshots from selenium tests mkdir -p ${LOCAL_ARTIFACT_DIR}/screenshots oc config view --flatten --minify > /tmp/tests-kubeconfig - ${BUILD_TOOL} run -e SKIP_INSTALL=$(SKIP_INSTALL) -e TESTS_REGEX=$(TESTS_REGEX) -e SKIP_OPERATOR_INSTALL=$(SKIP_OPERATOR_INSTALL) \ + ${BUILD_TOOL} run -e SKIP_INSTALL=$(SKIP_INSTALL) -e PYTEST_MARKERS=$(PYTEST_MARKERS) -e SKIP_OPERATOR_INSTALL=$(SKIP_OPERATOR_INSTALL) \ -e SKIP_KFDEF_INSTALL=$(SKIP_KFDEF_INSTALL) -e ODHPROJECT=$(ODHPROJECT) \ -e OPENSHIFT_TESTUSER_NAME="$(OPENSHIFT_TESTUSER_NAME)" -e OPENSHIFT_TESTUSER_PASS="$(OPENSHIFT_TESTUSER_PASS)" -e OPENSHIFT_TESTUSER_LOGIN_PROVIDER=$(OPENSHIFT_TESTUSER_LOGIN_PROVIDER) -e ARTIFACT_DIR=$(ARTIFACT_DIR) \ -e LOCAL=$(LOCAL) -e TEARDOWN=$(TEARDOWN)\ diff --git a/tests/README.md b/tests/README.md index 3dcbd3f..4864e3a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -24,7 +24,7 @@ make clean # remove the artifacts of the test from the cluster (operator, ODH, p * `BUILD_TOOL=docker/podman`: set the tool used to build and run the testing container * `SKIP_INSTALL=true/false`: skip the install of the ODH operator, if you've already installed it manually or via a previous test * `SKIP_KFDEF_INSTALL=true/false`: skip the install of ODH via KFdef, if you've already installed it manually or via a previous test -* `TESTS_REGEX=${REGEX}`: only run tests whose names match the regex +* `PYTEST_MARKERS`: Used to select the tests that will be executed. [Available markers](https://github.com/trustyai-explainability/trustyai-tests/blob/main/pyproject.toml). * `LOCAL=true/false`: This flag makes the test suite stop and wait for user input between the end of a test script and cluster teardown. This prevents automatic teardown, which is useful for manual inspection of the cluster before teardown when running the tests locally. * `TEARDOWN=true/false`: This flag will just run the corresponding `teardown` functions within the various tests, useful for cleaning up stranded components from failed tests, without deleting the operator and ODH install. It's recommended to use this with a `make run`, as using `make test` will trigger a `make clean` that fully wipes the cluster. @@ -47,10 +47,6 @@ If you'd like to run the tests against an instance that already has a KfDef crea you set `SKIP_KFDEF_INSTALL=true` and that will cause the test run to skip the step of creating the default KfDef. example: `make run SKIP_KFDEF_INSTALL=true` -If you'd like to run a single test instead of all tests, you can -set the TESTS_REGEX variable `TESTS_REGEX=`. That will -only run the test that you specify instead of all of the tests. example: `make run TESTS_REGEX=grafana` - If you have a local instance already running the operator and you'd like to skip that part of the install process, you can set `SKIP_OPERATOR_INSTALL=true` and that will bypass installation of the operator, but will still install the authentication for any user tests. From 96a523d70b6928615913849401ca2ad638d05176 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Wed, 4 Sep 2024 11:40:53 +0100 Subject: [PATCH 13/24] RHOAIENG-12274: Update operator's overlays (#287) * Update operator's overlays * Update kustomization.yaml --- config/overlays/odh/params.env | 5 +++-- config/overlays/rhoai/kustomization.yaml | 4 ++++ config/overlays/rhoai/params.env | 4 ++++ 3 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 config/overlays/rhoai/params.env diff --git a/config/overlays/odh/params.env b/config/overlays/odh/params.env index 72d2cec..908f07c 100644 --- a/config/overlays/odh/params.env +++ b/config/overlays/odh/params.env @@ -1,3 +1,4 @@ -trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.13.0 -trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.19.0 +trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.19.0 +trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.25.0 oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 +kServeServerless=enabled \ No newline at end of file diff --git a/config/overlays/rhoai/kustomization.yaml b/config/overlays/rhoai/kustomization.yaml index e6e984b..d4b4a2a 100644 --- a/config/overlays/rhoai/kustomization.yaml +++ b/config/overlays/rhoai/kustomization.yaml @@ -3,3 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - ../../base +configMapGenerator: + - env: params.env + behavior: merge + name: config diff --git a/config/overlays/rhoai/params.env b/config/overlays/rhoai/params.env new file mode 100644 index 0000000..2d60b1f --- /dev/null +++ b/config/overlays/rhoai/params.env @@ -0,0 +1,4 @@ +trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest +trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest +oauthProxyImage=registry.redhat.io/openshift4/ose-oauth-proxy@sha256:ab112105ac37352a2a4916a39d6736f5db6ab4c29bad4467de8d613e80e9bb33 +kServeServerless=disabled \ No newline at end of file From df96ffd84fbd9c2b767ade44b944be8b3ded036f Mon Sep 17 00:00:00 2001 From: Rob Geada Date: Fri, 6 Sep 2024 12:07:56 +0100 Subject: [PATCH 14/24] Add devflag printout to GH Action comment (#289) --- .github/workflows/build-and-push.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/build-and-push.yaml b/.github/workflows/build-and-push.yaml index 835fa14..85158a2 100644 --- a/.github/workflows/build-and-push.yaml +++ b/.github/workflows/build-and-push.yaml @@ -126,4 +126,12 @@ jobs: 📦 [PR image](https://quay.io/trustyai/trustyai-service-operator-ci:${{ github.event.pull_request.head.sha }}): `quay.io/trustyai/trustyai-service-operator-ci:${{ github.event.pull_request.head.sha }}` 🗂️ [CI manifests](https://github.com/trustyai-explainability/trustyai-service-operator-ci/tree/operator-${{ env.TAG }}) + + ``` + devFlags: + manifests: + - contextDir: config + sourcePath: '' + uri: https://api.github.com/repos/trustyai-explainability/trustyai-service-operator-ci/tarball/operator-${{ env.TAG }} + ``` From f366b6eabd56b5d635f3aaa157c1c4ca4d0bd4a8 Mon Sep 17 00:00:00 2001 From: Rob Geada Date: Mon, 30 Sep 2024 12:31:15 +0100 Subject: [PATCH 15/24] Add timeout loop to DSC install (#305) --- tests/scripts/install.sh | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/scripts/install.sh b/tests/scripts/install.sh index cc47a54..06eb6cc 100755 --- a/tests/scripts/install.sh +++ b/tests/scripts/install.sh @@ -128,13 +128,23 @@ else echo "Creating the following DSC" cat ./${DSC_FILENAME} > ${ARTIFACT_DIR}/${DSC_FILENAME} - oc apply -f ./odh-core-dsci.yaml - oc apply -f ./${DSC_FILENAME} - kfctl_result=$? - if [ "$kfctl_result" -ne 0 ]; then + start_t=$(date +%s) 2>&1 + ready=1 2>&1 + while [ "$ready" -ne 0 ]; do + oc apply -f ./odh-core-dsci.yaml + oc apply -f ./${DSC_FILENAME} + ready=$? + if [ $(($(date +%s)-start_t)) -gt 300 ]; then + echo "ODH DSC Installation timeout" + exit 1 + fi + sleep 10 + done + + if [ "$ready" -ne 0 ]; then echo "The installation failed" - exit $kfctl_result + exit $ready fi fi set +x From c072d519197b8a950f8897ed234384a1fc2227aa Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Mon, 30 Sep 2024 20:05:14 +0100 Subject: [PATCH 16/24] RHOAIENG-13625: Add DBAvailable status to CR (#304) * Add DBAvailable status to CR * Remove probes --- README.md | 10 ++++ config/base/params.env | 2 +- config/overlays/odh/params.env | 2 +- controllers/constants.go | 15 +++++ controllers/database.go | 63 ++++++++++++++++++++ controllers/deployment.go | 27 ++++++++- controllers/statuses.go | 59 ++++++++++++++----- controllers/statuses_test.go | 11 ++-- controllers/suite_test.go | 70 +++++++++++++++++++++-- controllers/trustyaiservice_controller.go | 19 ++++++ 10 files changed, 249 insertions(+), 29 deletions(-) create mode 100644 controllers/database.go diff --git a/README.md b/README.md index aa49524..14f1a38 100644 --- a/README.md +++ b/README.md @@ -151,10 +151,20 @@ through its `status` field. Below are the status types and reasons that are avai | `PVCAvailable` | `PVCNotFound` | `PersistentVolumeClaim` not found. | | `PVCAvailable` | `PVCFound` | `PersistentVolumeClaim` found. | +#### Database Status + +| Status Type | Status Reason | Description | +|---------------|-------------------------|---------------------------------------------------| +| `DBAvailable` | `DBCredentialsNotFound` | Database credentials secret not found | +| `DBAvailable` | `DBCredentialsError` | Database credentials malformed (e.g. missing key) | +| `DBAvailable` | `DBConnectionError` | Service error connecting to the database | +| `DBAvailable` | `DBAvailable` | Successfully connected to the database | + #### Status Behavior - If a PVC is not available, the `Ready` status of `TrustyAIService` will be set to `False`. +- If on database mode, any `DBAvailable` reason other than `DBAvailable` will set the `TrustyAIService` to `Not Ready` - However, if `InferenceServices` are not found, the `Ready` status of `TrustyAIService` will not be affected, _i.e._, it is `Ready` by all other conditions, it will remain so. ## Contributing diff --git a/config/base/params.env b/config/base/params.env index a0f5419..d745ad2 100644 --- a/config/base/params.env +++ b/config/base/params.env @@ -1,4 +1,4 @@ trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 -kServeServerless=disabled \ No newline at end of file +kServeServerless=disabled diff --git a/config/overlays/odh/params.env b/config/overlays/odh/params.env index 908f07c..c67b2b5 100644 --- a/config/overlays/odh/params.env +++ b/config/overlays/odh/params.env @@ -1,4 +1,4 @@ trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.19.0 trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.25.0 oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 -kServeServerless=enabled \ No newline at end of file +kServeServerless=enabled diff --git a/controllers/constants.go b/controllers/constants.go index 2c7081b..18fdb4c 100644 --- a/controllers/constants.go +++ b/controllers/constants.go @@ -46,6 +46,7 @@ const ( StatusTypePVCAvailable = "PVCAvailable" StatusTypeRouteAvailable = "RouteAvailable" StatusTypeAvailable = "Available" + StatusTypeDBAvailable = "DBAvailable" ) // Status reasons @@ -58,6 +59,10 @@ const ( StatusReasonRouteFound = "RouteFound" StatusAvailable = "AllComponentsReady" StatusNotAvailable = "NotAllComponentsReady" + StatusDBCredentialsNotFound = "DBCredentialsNotFound" + StatusDBCredentialsError = "DBCredentialsError" + StatusDBConnectionError = "DBConnectionError" + StatusDBAvailable = "DBAvailable" ) // Event reasons @@ -67,4 +72,14 @@ const ( EventReasonServiceMonitorCreated = "ServiceMonitorCreated" ) +const ( + StateReasonCrashLoopBackOff = "CrashLoopBackOff" +) + +// Phases +const ( + PhaseReady = "Ready" + PhaseNotReady = "Not Ready" +) + const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration" diff --git a/controllers/database.go b/controllers/database.go new file mode 100644 index 0000000..96679d0 --- /dev/null +++ b/controllers/database.go @@ -0,0 +1,63 @@ +package controllers + +import ( + "context" + "strings" + + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// checkDatabaseAccessible checks if the TrustyAI service pod failed with database issues. +func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) { + deployment := &appsv1.Deployment{} + err := r.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, deployment) + if err != nil { + if errors.IsNotFound(err) { + return false, nil + } + return false, err + } + + for _, cond := range deployment.Status.Conditions { + if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue { + podList := &corev1.PodList{} + listOpts := []client.ListOption{ + client.InNamespace(instance.Namespace), + client.MatchingLabels(deployment.Spec.Selector.MatchLabels), + } + if err := r.List(ctx, podList, listOpts...); err != nil { + return false, err + } + + for _, pod := range podList.Items { + for _, cs := range pod.Status.ContainerStatuses { + if cs.Name == "trustyai-service" { + if cs.State.Running != nil { + return true, nil + } + + if cs.LastTerminationState.Terminated != nil { + termination := cs.LastTerminationState.Terminated + if termination.Reason == "Error" && termination.Message != "" { + if strings.Contains(termination.Message, "Socket fail to connect to host:address") { + return false, nil + } + } + } + + if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff { + return false, nil + } + } + } + } + } + } + + return false, nil +} diff --git a/controllers/deployment.go b/controllers/deployment.go index 81be18a..3b8d8f5 100644 --- a/controllers/deployment.go +++ b/controllers/deployment.go @@ -2,16 +2,18 @@ package controllers import ( "context" - templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" "reflect" "strconv" + templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -75,7 +77,7 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context, _, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace) if err != nil { deploymentConfig.UseDBTLSCerts = false - log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found") + log.FromContext(ctx).Info("Using insecure database connection. Certificates " + instance.Name + "-db-tls not found") } else { deploymentConfig.UseDBTLSCerts = true log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls") @@ -201,6 +203,7 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan return nil } +// checkDeploymentReady verifies that a TrustyAI service deployment is ready func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) { deployment := &appsv1.Deployment{} @@ -215,6 +218,26 @@ func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, in for _, cond := range deployment.Status.Conditions { if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue { if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas { + podList := &corev1.PodList{} + listOpts := []client.ListOption{ + client.InNamespace(instance.Namespace), + client.MatchingLabels(deployment.Spec.Selector.MatchLabels), + } + if err := r.List(ctx, podList, listOpts...); err != nil { + return false, err + } + + for _, pod := range podList.Items { + for _, cs := range pod.Status.ContainerStatuses { + if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff { + return false, nil + } + if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 { + return false, nil + } + } + } + return true, nil } } diff --git a/controllers/statuses.go b/controllers/statuses.go index b88ba47..9236e81 100644 --- a/controllers/statuses.go +++ b/controllers/statuses.go @@ -13,7 +13,8 @@ import ( // IsAllReady checks if all the necessary readiness fields are true for the specific mode func (rs *AvailabilityStatus) IsAllReady(mode string) bool { - return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || (rs.DeploymentReady && rs.RouteReady && mode == STORAGE_DATABASE) + return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || + (rs.DeploymentReady && rs.RouteReady && rs.DBReady && mode == STORAGE_DATABASE) } // AvailabilityStatus has the readiness status of various resources. @@ -22,6 +23,7 @@ type AvailabilityStatus struct { DeploymentReady bool RouteReady bool InferenceServiceReady bool + DBReady bool } func (r *TrustyAIServiceReconciler) updateStatus(ctx context.Context, original *trustyaiopendatahubiov1alpha1.TrustyAIService, update func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService), @@ -53,25 +55,17 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { // Check for PVC readiness status.PVCReady, err = r.checkPVCReady(ctx, instance) - if err != nil || !status.PVCReady { - // PVC not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "PVC not ready") - } } // Check for deployment readiness status.DeploymentReady, err = r.checkDeploymentReady(ctx, instance) - if err != nil || !status.DeploymentReady { - // Deployment not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Deployment not ready") + + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + status.DBReady, _ = r.checkDatabaseAccessible(ctx, instance) } // Check for route readiness status.RouteReady, err = r.checkRouteReady(ctx, instance) - if err != nil || !status.RouteReady { - // Route not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Route not ready") - } // Check if InferenceServices present status.InferenceServiceReady, err = r.checkInferenceServicesPresent(ctx, instance.Namespace) @@ -89,9 +83,15 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { UpdatePVCAvailable(saved) } + UpdateRouteAvailable(saved) + + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + UpdateDBAvailable(saved) + } + UpdateTrustyAIServiceAvailable(saved) - saved.Status.Phase = "Ready" + saved.Status.Phase = PhaseReady saved.Status.Ready = v1.ConditionTrue }) if updateErr != nil { @@ -114,13 +114,18 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta } } + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + UpdateDBConnectionError(saved) + } + if status.RouteReady { UpdateRouteAvailable(saved) } else { UpdateRouteNotAvailable(saved) } + UpdateTrustyAIServiceNotAvailable(saved) - saved.Status.Phase = "Ready" + saved.Status.Phase = PhaseNotReady saved.Status.Ready = v1.ConditionFalse }) if updateErr != nil { @@ -143,7 +148,7 @@ func UpdateInferenceServicePresent(saved *trustyaiopendatahubiov1alpha1.TrustyAI func UpdatePVCNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { saved.SetStatus(StatusTypePVCAvailable, StatusReasonPVCNotFound, "PersistentVolumeClaim not found", v1.ConditionFalse) - saved.Status.Phase = "Not Ready" + saved.Status.Phase = PhaseNotReady saved.Status.Ready = v1.ConditionFalse } @@ -165,4 +170,28 @@ func UpdateTrustyAIServiceAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyA func UpdateTrustyAIServiceNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { saved.SetStatus(StatusTypeAvailable, StatusNotAvailable, "Not all components available", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBCredentialsNotFound(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsNotFound, "Database credentials not found", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBCredentialsError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsError, "Error with database credentials", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBConnectionError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBConnectionError, "Error connecting to database", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBAvailable, "Database available", v1.ConditionTrue) } diff --git a/controllers/statuses_test.go b/controllers/statuses_test.go index 9e395df..f6ad7fe 100644 --- a/controllers/statuses_test.go +++ b/controllers/statuses_test.go @@ -35,7 +35,7 @@ func setupAndTestStatusNoComponent(instance *trustyaiopendatahubiov1alpha1.Trust // Call the reconcileStatuses function _, _ = reconciler.reconcileStatuses(ctx, instance) - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Ready condition should be true") @@ -127,7 +127,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -191,7 +191,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -260,8 +260,7 @@ var _ = Describe("Status and condition tests", func() { Namespace: instance.Namespace, }, instance) }, "failed to get updated instance") - - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -344,7 +343,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") diff --git a/controllers/suite_test.go b/controllers/suite_test.go index e61f4ac..a2f16ba 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -20,11 +20,12 @@ import ( "context" "encoding/json" "fmt" - rbacv1 "k8s.io/api/rbac/v1" "path/filepath" "testing" "time" + rbacv1 "k8s.io/api/rbac/v1" + "github.com/google/uuid" kservev1alpha1 "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1" @@ -358,14 +359,75 @@ func makeDeploymentReady(ctx context.Context, k8sClient client.Client, instance Reason: "DeploymentReady", Message: "The deployment is ready", }, + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "NewReplicaSetAvailable", + Message: "ReplicaSet is progressing", + }, } if deployment.Spec.Replicas != nil { - deployment.Status.ReadyReplicas = 1 - deployment.Status.Replicas = 1 + deployment.Status.ReadyReplicas = *deployment.Spec.Replicas + deployment.Status.Replicas = *deployment.Spec.Replicas + deployment.Status.AvailableReplicas = *deployment.Spec.Replicas } - return k8sClient.Update(ctx, deployment) + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name + "-pod", + Namespace: instance.Namespace, + Labels: deployment.Spec.Selector.MatchLabels, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "trustyai-service", + Image: "quay.io/trustyai/trustyai-service:latest", + Ports: []corev1.ContainerPort{ + { + ContainerPort: 8080, + }, + }, + }, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }, + { + Type: corev1.ContainersReady, + Status: corev1.ConditionTrue, + }, + }, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "trustyai-service", + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{ + StartedAt: metav1.Now(), + }, + }, + Ready: true, + RestartCount: 0, + }, + }, + }, + } + + if err := k8sClient.Create(ctx, pod); err != nil { + return err + } + + if err := k8sClient.Status().Update(ctx, deployment); err != nil { + return err + } + + return nil } func makeRouteReady(ctx context.Context, k8sClient client.Client, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index d37fb26..edb60f2 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -25,6 +25,7 @@ import ( kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1" trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -156,10 +157,28 @@ func (r *TrustyAIServiceReconciler) Reconcile(ctx context.Context, req ctrl.Requ // Get database configuration secret, err := r.findDatabaseSecret(ctx, instance) if err != nil { + _, updateErr := r.updateStatus(ctx, instance, func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + UpdateDBCredentialsNotFound(saved) + UpdateTrustyAIServiceNotAvailable(saved) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse + }) + if updateErr != nil { + return RequeueWithErrorMessage(ctx, err, "Failed to update status") + } return RequeueWithErrorMessage(ctx, err, "Service configured to use database storage but no database configuration found.") } err = r.validateDatabaseSecret(secret) if err != nil { + _, updateErr := r.updateStatus(ctx, instance, func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + UpdateDBCredentialsError(saved) + UpdateTrustyAIServiceNotAvailable(saved) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse + }) + if updateErr != nil { + return RequeueWithErrorMessage(ctx, err, "Failed to update status") + } return RequeueWithErrorMessage(ctx, err, "Database configuration contains errors.") } } From 97a5a5d30d6bf98ed1f8505c1027beff1634b059 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Thu, 10 Oct 2024 14:04:29 +0100 Subject: [PATCH 17/24] Add KServe destination rule for Inference Services in the ServiceMesh (#315) * Add DestinationRule creation for KServe serverless * Add permissions for destination rules * Add role for destination rules * Add missing role for creating destination rules * Fix spacing in DestinationRule template --- config/overlays/odh/params.env | 4 +- config/rbac/role.yaml | 12 ++++ controllers/destination_rule.go | 69 +++++++++++++++++++ controllers/inference_services.go | 12 +++- .../service/destination-rule.tmpl.yaml | 13 ++++ controllers/trustyaiservice_controller.go | 1 + 6 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 controllers/destination_rule.go create mode 100644 controllers/templates/service/destination-rule.tmpl.yaml diff --git a/config/overlays/odh/params.env b/config/overlays/odh/params.env index c67b2b5..4a9c77e 100644 --- a/config/overlays/odh/params.env +++ b/config/overlays/odh/params.env @@ -1,4 +1,4 @@ -trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.19.0 -trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.25.0 +trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest +trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 kServeServerless=enabled diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index b6b21e4..2528617 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -134,6 +134,18 @@ rules: - create - list - watch +- apiGroups: + - networking.istio.io + resources: + - destinationrules + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - rbac.authorization.k8s.io resources: diff --git a/controllers/destination_rule.go b/controllers/destination_rule.go new file mode 100644 index 0000000..f683373 --- /dev/null +++ b/controllers/destination_rule.go @@ -0,0 +1,69 @@ +package controllers + +import ( + "context" + "fmt" + "reflect" + + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" + templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +const ( + destinationRuleTemplatePath = "service/destination-rule.tmpl.yaml" +) + +// DestinationRuleConfig has the variables for the DestinationRule template +type DestinationRuleConfig struct { + Name string + Namespace string + DestinationRuleName string +} + +func (r *TrustyAIServiceReconciler) ensureDestinationRule(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { + destinationRuleName := instance.Name + "-internal" + + existingDestinationRule := &unstructured.Unstructured{} + existingDestinationRule.SetKind("DestinationRule") + existingDestinationRule.SetAPIVersion("networking.istio.io/v1beta1") + + // Check if the DestinationRule already exists + err := r.Get(ctx, types.NamespacedName{Name: destinationRuleName, Namespace: instance.Namespace}, existingDestinationRule) + if err == nil { + // DestinationRule exists + return nil + } + + if !errors.IsNotFound(err) { + return fmt.Errorf("failed to check for existing DestinationRule: %v", err) + } + + destinationRuleConfig := DestinationRuleConfig{ + Name: instance.Name, + Namespace: instance.Namespace, + DestinationRuleName: destinationRuleName, + } + + var destinationRule *unstructured.Unstructured + destinationRule, err = templateParser.ParseResource[unstructured.Unstructured](destinationRuleTemplatePath, destinationRuleConfig, reflect.TypeOf(&unstructured.Unstructured{})) + if err != nil { + log.FromContext(ctx).Error(err, "could not parse the DestinationRule template") + return err + } + + if err := ctrl.SetControllerReference(instance, destinationRule, r.Scheme); err != nil { + return err + } + + err = r.Create(ctx, destinationRule) + if err != nil { + return fmt.Errorf("failed to create DestinationRule: %v", err) + } + + return nil +} diff --git a/controllers/inference_services.go b/controllers/inference_services.go index b49a526..a4cad7b 100644 --- a/controllers/inference_services.go +++ b/controllers/inference_services.go @@ -3,13 +3,14 @@ package controllers import ( "context" "fmt" + "strings" + kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1" trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - "strings" ) const ( @@ -271,6 +272,15 @@ func (r *TrustyAIServiceReconciler) patchKServe(ctx context.Context, instance *t infService.Spec.Predictor.Logger = &logger } + // Only if the Istio sidecar annotation is set + annotations := infService.GetAnnotations() + if inject, exists := annotations["sidecar.istio.io/inject"]; exists && inject == "true" { + err := r.ensureDestinationRule(ctx, instance) + if err != nil { + return fmt.Errorf("failed to ensure DestinationRule: %v", err) + } + } + // Update the InferenceService err := r.Update(ctx, &infService) if err == nil { diff --git a/controllers/templates/service/destination-rule.tmpl.yaml b/controllers/templates/service/destination-rule.tmpl.yaml new file mode 100644 index 0000000..f62e548 --- /dev/null +++ b/controllers/templates/service/destination-rule.tmpl.yaml @@ -0,0 +1,13 @@ +apiVersion: networking.istio.io/v1beta1 +kind: DestinationRule +metadata: + name: {{ .DestinationRuleName }} + namespace: {{ .Namespace }} +spec: + host: {{ .Name }}.{{ .Namespace }}.svc.cluster.local + trafficPolicy: + portLevelSettings: + - port: + number: 443 + tls: + mode: SIMPLE diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index edb60f2..e920853 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -69,6 +69,7 @@ type TrustyAIServiceReconciler struct { //+kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;create;update +//+kubebuilder:rbac:groups=networking.istio.io,resources=destinationrules,verbs=create;list;watch;get;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. From d11408cf5a8f717e4d63112c21f2a7addfea62a0 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Thu, 10 Oct 2024 17:43:43 +0100 Subject: [PATCH 18/24] Add check if DestinationRule CRD is present before creating it (#316) * Add check for DestinationRule CRD * Add API extensions to operator's scheme * Add permission for CRD resource --- config/rbac/role.yaml | 8 ++++++++ controllers/destination_rule.go | 20 ++++++++++++++++++++ controllers/inference_services.go | 20 ++++++++++++++++++-- controllers/trustyaiservice_controller.go | 1 + go.mod | 2 +- main.go | 5 ++++- 6 files changed, 52 insertions(+), 4 deletions(-) diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 2528617..91be669 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -60,6 +60,14 @@ rules: - list - update - watch +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch - apiGroups: - apps resources: diff --git a/controllers/destination_rule.go b/controllers/destination_rule.go index f683373..ce17552 100644 --- a/controllers/destination_rule.go +++ b/controllers/destination_rule.go @@ -7,6 +7,7 @@ import ( trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/types" @@ -16,6 +17,7 @@ import ( const ( destinationRuleTemplatePath = "service/destination-rule.tmpl.yaml" + destinationRuleCDRName = "destinationrules.networking.istio.io" ) // DestinationRuleConfig has the variables for the DestinationRule template @@ -25,7 +27,25 @@ type DestinationRuleConfig struct { DestinationRuleName string } +// isDestinationRuleCRDPresent returns true if the DestinationRule CRD is present, false otherwise +func (r *TrustyAIServiceReconciler) isDestinationRuleCRDPresent(ctx context.Context) (bool, error) { + crd := &apiextensionsv1.CustomResourceDefinition{} + + err := r.Get(ctx, types.NamespacedName{Name: destinationRuleCDRName}, crd) + if err != nil { + if !errors.IsNotFound(err) { + return false, fmt.Errorf("error getting "+destinationRuleCDRName+" CRD: %v", err) + } + // Not found + return false, nil + } + + // Found + return true, nil +} + func (r *TrustyAIServiceReconciler) ensureDestinationRule(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { + destinationRuleName := instance.Name + "-internal" existingDestinationRule := &unstructured.Unstructured{} diff --git a/controllers/inference_services.go b/controllers/inference_services.go index a4cad7b..ee8745b 100644 --- a/controllers/inference_services.go +++ b/controllers/inference_services.go @@ -275,10 +275,26 @@ func (r *TrustyAIServiceReconciler) patchKServe(ctx context.Context, instance *t // Only if the Istio sidecar annotation is set annotations := infService.GetAnnotations() if inject, exists := annotations["sidecar.istio.io/inject"]; exists && inject == "true" { - err := r.ensureDestinationRule(ctx, instance) + + // Check if DestinationRule CRD is present. If there's an error, don't proceed and return the error + exists, err := r.isDestinationRuleCRDPresent(ctx) if err != nil { - return fmt.Errorf("failed to ensure DestinationRule: %v", err) + log.FromContext(ctx).Error(err, "Error verifying DestinationRule CRD is present") + return err + } + + // Try to create the DestinationRule, since CRD exists + if exists { + err := r.ensureDestinationRule(ctx, instance) + if err != nil { + return fmt.Errorf("failed to ensure DestinationRule: %v", err) + } + } else { + // DestinationRule CRD does not exist. Do not attempt to create it and log error + err := fmt.Errorf("the DestinationRule CRD is not present in this cluster") + log.FromContext(ctx).Error(err, "InferenceService has service mesh annotation but DestinationRule CRD not found") } + } // Update the InferenceService diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index e920853..1176d98 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -70,6 +70,7 @@ type TrustyAIServiceReconciler struct { //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;create;update //+kubebuilder:rbac:groups=networking.istio.io,resources=destinationrules,verbs=create;list;watch;get;update;patch;delete +//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=list;watch;get // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. diff --git a/go.mod b/go.mod index 6e15a91..fcb6b76 100644 --- a/go.mod +++ b/go.mod @@ -88,7 +88,7 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.26.4 // indirect + k8s.io/apiextensions-apiserver v0.26.4 k8s.io/component-base v0.26.4 // indirect k8s.io/klog/v2 v2.100.1 // indirect k8s.io/kube-openapi v0.0.0-20230515203736-54b630e78af5 // indirect diff --git a/main.go b/main.go index cc7fc1e..ce5ee19 100644 --- a/main.go +++ b/main.go @@ -18,11 +18,13 @@ package main import ( "flag" + "os" + kservev1alpha1 "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1" routev1 "github.com/openshift/api/route/v1" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "os" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. @@ -52,6 +54,7 @@ func init() { utilruntime.Must(kservev1alpha1.AddToScheme(scheme)) utilruntime.Must(kservev1beta1.AddToScheme(scheme)) utilruntime.Must(routev1.AddToScheme(scheme)) + utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } From 91237c57cbebc2545850eb3d8cf17d16ea3d8a3f Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Mon, 14 Oct 2024 11:13:27 +0100 Subject: [PATCH 19/24] Fix operator metrics service target port (#320) --- config/rbac/auth_proxy_service.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/rbac/auth_proxy_service.yaml b/config/rbac/auth_proxy_service.yaml index cce9af6..8581450 100644 --- a/config/rbac/auth_proxy_service.yaml +++ b/config/rbac/auth_proxy_service.yaml @@ -16,6 +16,6 @@ spec: - name: https port: 8443 protocol: TCP - targetPort: 8080 + targetPort: 8081 selector: control-plane: controller-manager From 79797678120b356c9bed994fbbb021c60aa9b06d Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Mon, 14 Oct 2024 15:38:57 +0100 Subject: [PATCH 20/24] Add readiness probes (#312) --- controllers/database.go | 1 - controllers/statuses.go | 7 ++++++- controllers/templates/service/deployment.tmpl.yaml | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/controllers/database.go b/controllers/database.go index 96679d0..1c5d26a 100644 --- a/controllers/database.go +++ b/controllers/database.go @@ -12,7 +12,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -// checkDatabaseAccessible checks if the TrustyAI service pod failed with database issues. func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) { deployment := &appsv1.Deployment{} err := r.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, deployment) diff --git a/controllers/statuses.go b/controllers/statuses.go index 9236e81..b43bbbd 100644 --- a/controllers/statuses.go +++ b/controllers/statuses.go @@ -87,7 +87,12 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta UpdateRouteAvailable(saved) if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { - UpdateDBAvailable(saved) + if status.DBReady { + UpdateDBAvailable(saved) + } else { + UpdateDBConnectionError(saved) + return + } } UpdateTrustyAIServiceAvailable(saved) diff --git a/controllers/templates/service/deployment.tmpl.yaml b/controllers/templates/service/deployment.tmpl.yaml index f1e429f..e87091c 100644 --- a/controllers/templates/service/deployment.tmpl.yaml +++ b/controllers/templates/service/deployment.tmpl.yaml @@ -122,6 +122,20 @@ spec: - name: STORAGE_MIGRATION_CONFIG_FROM_FILENAME value: {{ .Instance.Spec.Data.Filename }} {{ end }} + readinessProbe: + httpGet: + path: /q/health/ready + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 + livenessProbe: + httpGet: + path: /q/health/live + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 2 volumeMounts: - name: {{ .Instance.Name }}-internal readOnly: false From 4893d79ab3971cd88ed994e62eca92462dc18a08 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Tue, 15 Oct 2024 15:10:06 +0100 Subject: [PATCH 21/24] Enable KServe serverless in the rhoai overlay (#321) --- config/base/params.env | 2 +- config/overlays/rhoai/params.env | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/base/params.env b/config/base/params.env index d745ad2..4a9c77e 100644 --- a/config/base/params.env +++ b/config/base/params.env @@ -1,4 +1,4 @@ trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 -kServeServerless=disabled +kServeServerless=enabled diff --git a/config/overlays/rhoai/params.env b/config/overlays/rhoai/params.env index 2d60b1f..c9cb9c3 100644 --- a/config/overlays/rhoai/params.env +++ b/config/overlays/rhoai/params.env @@ -1,4 +1,4 @@ trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest oauthProxyImage=registry.redhat.io/openshift4/ose-oauth-proxy@sha256:ab112105ac37352a2a4916a39d6736f5db6ab4c29bad4467de8d613e80e9bb33 -kServeServerless=disabled \ No newline at end of file +kServeServerless=enabled \ No newline at end of file From 4e3091fd1fbb7cd058940e6619066771bad81c15 Mon Sep 17 00:00:00 2001 From: Rob Geada Date: Thu, 17 Oct 2024 15:31:31 +0100 Subject: [PATCH 22/24] Update overlay images (#331) --- .github/workflows/build-and-push.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-and-push.yaml b/.github/workflows/build-and-push.yaml index 85158a2..98bf455 100644 --- a/.github/workflows/build-and-push.yaml +++ b/.github/workflows/build-and-push.yaml @@ -91,6 +91,8 @@ jobs: if: env.BUILD_CONTEXT == 'ci' run: | sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/base/params.env + sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/overlays/odh/params.env + sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/overlays/rhoai/params.env rm -Rf $(ls . | grep -v config) rm -Rf .gitignore .dockerignore .github .git .yamllint.yaml # pysh to ci-manifest repo From fb54647fca2c52d070fdaf335c0a478afaf28086 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Thu, 17 Oct 2024 18:46:55 +0100 Subject: [PATCH 23/24] Add correct CA cert to JDBC (#324) * Add correct CA cert to JDBC * Add require SSL --- controllers/deployment.go | 6 +++--- controllers/templates/service/deployment.tmpl.yaml | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/controllers/deployment.go b/controllers/deployment.go index 3b8d8f5..4696b31 100644 --- a/controllers/deployment.go +++ b/controllers/deployment.go @@ -74,13 +74,13 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context, } if instance.Spec.Storage.IsStorageDatabase() { - _, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace) + _, err := r.getSecret(ctx, instance.Name+"-db-ca", instance.Namespace) if err != nil { deploymentConfig.UseDBTLSCerts = false - log.FromContext(ctx).Info("Using insecure database connection. Certificates " + instance.Name + "-db-tls not found") + log.FromContext(ctx).Info("Using insecure database connection. Certificates " + instance.Name + "-db-ca not found") } else { deploymentConfig.UseDBTLSCerts = true - log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls") + log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-ca") } } else { deploymentConfig.UseDBTLSCerts = false diff --git a/controllers/templates/service/deployment.tmpl.yaml b/controllers/templates/service/deployment.tmpl.yaml index e87091c..9257b69 100644 --- a/controllers/templates/service/deployment.tmpl.yaml +++ b/controllers/templates/service/deployment.tmpl.yaml @@ -100,7 +100,7 @@ spec: key: databaseName - name: QUARKUS_DATASOURCE_JDBC_URL {{ if .UseDBTLSCerts }} - value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/${DATABASE_NAME}?sslMode=verify-ca&serverSslCert=/etc/tls/db/tls.crt" + value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/${DATABASE_NAME}?requireSSL=true&sslMode=verify-ca&serverSslCert=/etc/tls/db/ca.crt" {{ else }} value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/${DATABASE_NAME}" {{ end }} @@ -146,7 +146,7 @@ spec: readOnly: false {{ end }} {{ if .UseDBTLSCerts }} - - name: db-tls-certs + - name: db-ca-cert mountPath: /etc/tls/db readOnly: true {{ end }} @@ -238,8 +238,8 @@ spec: secretName: {{ .Instance.Name }}-internal defaultMode: 420 {{ if .UseDBTLSCerts }} - - name: db-tls-certs + - name: db-ca-cert secret: - secretName: {{ .Instance.Name }}-db-tls + secretName: {{ .Instance.Name }}-db-ca defaultMode: 420 {{ end }} From ddd093a20acb147e534eb72e47567e6b9e46a579 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Fri, 18 Oct 2024 13:58:15 +0100 Subject: [PATCH 24/24] Support for VirtualServices for InferenceLogger traffic (#332) * Generate KServe Inference Logger in conformance with DestinationRule and VirtualService * Add VirtualService creation for models in the mesh * Add permissions for VirtualServices * Update manifests for VirtualServices * Fix VirtualServiceName variable --- config/rbac/role.yaml | 12 +++ controllers/inference_services.go | 21 ++++- .../service/virtual-service.tmpl.yaml | 16 ++++ controllers/trustyaiservice_controller.go | 1 + controllers/utils.go | 5 ++ controllers/virtual_service.go | 89 +++++++++++++++++++ 6 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 controllers/templates/service/virtual-service.tmpl.yaml create mode 100644 controllers/virtual_service.go diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 91be669..be51504 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -154,6 +154,18 @@ rules: - patch - update - watch +- apiGroups: + - networking.istio.io + resources: + - virtualservices + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - rbac.authorization.k8s.io resources: diff --git a/controllers/inference_services.go b/controllers/inference_services.go index ee8745b..827ed4f 100644 --- a/controllers/inference_services.go +++ b/controllers/inference_services.go @@ -241,7 +241,7 @@ func (r *TrustyAIServiceReconciler) handleInferenceServices(ctx context.Context, // patchKServe adds a TrustyAI service as an InferenceLogger to a KServe InferenceService func (r *TrustyAIServiceReconciler) patchKServe(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService, infService kservev1beta1.InferenceService, namespace string, crName string, remove bool) error { - url := generateNonTLSServiceURL(crName, namespace) + url := generateKServeLoggerURL(crName, namespace) if remove { if infService.Spec.Predictor.Logger == nil || *infService.Spec.Predictor.Logger.URL != url { @@ -295,6 +295,25 @@ func (r *TrustyAIServiceReconciler) patchKServe(ctx context.Context, instance *t log.FromContext(ctx).Error(err, "InferenceService has service mesh annotation but DestinationRule CRD not found") } + // Check if VirtualService CRD is present. If there's an error, don't proceed and return the error + exists, err = r.isVirtualServiceCRDPresent(ctx) + if err != nil { + log.FromContext(ctx).Error(err, "Error verifying VirtualService CRD is present") + return err + } + + // Try to create the VirtualService, since CRD exists + if exists { + err := r.ensureVirtualService(ctx, instance) + if err != nil { + return fmt.Errorf("failed to ensure VirtualService: %v", err) + } + } else { + // VirtualService CRD does not exist. Do not attempt to create it and log error + err := fmt.Errorf("the VirtualService CRD is not present in this cluster") + log.FromContext(ctx).Error(err, "InferenceService has service mesh annotation but VirtualService CRD not found") + } + } // Update the InferenceService diff --git a/controllers/templates/service/virtual-service.tmpl.yaml b/controllers/templates/service/virtual-service.tmpl.yaml new file mode 100644 index 0000000..8356be9 --- /dev/null +++ b/controllers/templates/service/virtual-service.tmpl.yaml @@ -0,0 +1,16 @@ +apiVersion: networking.istio.io/v1beta1 +kind: VirtualService +metadata: + name: {{ .VirtualServiceName }} + namespace: {{ .Namespace }} +spec: + hosts: + - {{ .Name }}.{{ .Namespace }}.svc.cluster.local + http: + - match: + - port: 80 + route: + - destination: + host: {{ .Name }}.{{ .Namespace }}.svc.cluster.local + port: + number: 443 diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index 1176d98..a6ae377 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -70,6 +70,7 @@ type TrustyAIServiceReconciler struct { //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;create;update //+kubebuilder:rbac:groups=networking.istio.io,resources=destinationrules,verbs=create;list;watch;get;update;patch;delete +//+kubebuilder:rbac:groups=networking.istio.io,resources=virtualservices,verbs=create;list;watch;get;update;patch;delete //+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=list;watch;get // Reconcile is part of the main kubernetes reconciliation loop which aims to diff --git a/controllers/utils.go b/controllers/utils.go index e96fc4e..759b388 100644 --- a/controllers/utils.go +++ b/controllers/utils.go @@ -71,3 +71,8 @@ func generateTLSServiceURL(crName string, namespace string) string { func generateNonTLSServiceURL(crName string, namespace string) string { return "http://" + crName + "." + namespace + ".svc" } + +// generateKServeLoggerURL generates an logger url for KServe Inference Loggers +func generateKServeLoggerURL(crName string, namespace string) string { + return "http://" + crName + "." + namespace + ".svc.cluster.local" +} diff --git a/controllers/virtual_service.go b/controllers/virtual_service.go new file mode 100644 index 0000000..f5223c8 --- /dev/null +++ b/controllers/virtual_service.go @@ -0,0 +1,89 @@ +package controllers + +import ( + "context" + "fmt" + "reflect" + + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" + templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +const ( + virtualServiceTemplatePath = "service/virtual-service.tmpl.yaml" + virtualServiceCDRName = "destinationrules.networking.istio.io" +) + +// DestinationRuleConfig has the variables for the DestinationRule template +type VirtualServiceConfig struct { + Name string + Namespace string + VirtualServiceName string +} + +// isVirtualServiceCRDPresent returns true if the DestinationRule CRD is present, false otherwise +func (r *TrustyAIServiceReconciler) isVirtualServiceCRDPresent(ctx context.Context) (bool, error) { + crd := &apiextensionsv1.CustomResourceDefinition{} + + err := r.Get(ctx, types.NamespacedName{Name: virtualServiceCDRName}, crd) + if err != nil { + if !errors.IsNotFound(err) { + return false, fmt.Errorf("error getting "+virtualServiceCDRName+" CRD: %v", err) + } + // Not found + return false, nil + } + + // Found + return true, nil +} + +func (r *TrustyAIServiceReconciler) ensureVirtualService(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { + + virtualServiceName := instance.Name + "-redirect" + + existingVirtualService := &unstructured.Unstructured{} + existingVirtualService.SetKind("VirtualService") + existingVirtualService.SetAPIVersion("networking.istio.io/v1beta1") + + // Check if the DestinationRule already exists + err := r.Get(ctx, types.NamespacedName{Name: virtualServiceName, Namespace: instance.Namespace}, existingVirtualService) + if err == nil { + // DestinationRule exists + return nil + } + + if !errors.IsNotFound(err) { + return fmt.Errorf("failed to check for existing VirtualService: %v", err) + } + + virtualServiceConfig := VirtualServiceConfig{ + Name: instance.Name, + Namespace: instance.Namespace, + VirtualServiceName: virtualServiceName, + } + + var virtualService *unstructured.Unstructured + virtualService, err = templateParser.ParseResource[unstructured.Unstructured](virtualServiceTemplatePath, virtualServiceConfig, reflect.TypeOf(&unstructured.Unstructured{})) + if err != nil { + log.FromContext(ctx).Error(err, "could not parse the VirtualService template") + return err + } + + if err := ctrl.SetControllerReference(instance, virtualService, r.Scheme); err != nil { + return err + } + + err = r.Create(ctx, virtualService) + if err != nil { + return fmt.Errorf("failed to create VirtualService: %v", err) + } + + return nil +}