From 427d1021eaf4b205591d3fa5331b2ec1f7f5331c Mon Sep 17 00:00:00 2001 From: Yihong Wang Date: Mon, 5 Aug 2024 11:06:15 -0700 Subject: [PATCH] sync: sync dev/lm-eval with main branch (#271) * feat: Initial database support (#246) * Initial database support - Add status checking - Add better storage flags - Add spec.storage.format validation - Add DDL -Add HIBERNATE format to DB (test) - Update service image - Revert identifier to DATABASE - Update CR options (remove mandatory data) * Remove default DDL generation env var * Update service image to latest tag * Add migration awareness * Add updating pods for migration * Change JDBC url from mysql to mariadb * Fix TLS mount * Revert images * Remove redundant logic * Fix comments * feat: Add TLS certificate mount on ModelMesh (#255) * feat: Add TLS certificate mount on ModelMesh * Revert from http to https until https://github.com/kserve/modelmesh/pull/147 is merged * Pin oc version, ubi version (#263) * Restore checkout of trustyai-exp (#265) * Add operator installation robustness (#266) * fix: Skip InferenceService patching for KServe RawDeployment (#262) * feat: ConfigMap key to disable KServe Serverless configuration (#267) * feat: Add support for custom certificates in database connection (#259) * Add TLS endpoint for ModelMesh payload processors. (#268) Keep non-TLS endpoint for KServe Serverless (disabled by default) --------- Signed-off-by: Yihong Wang Co-authored-by: Rui Vieira Co-authored-by: Rob Geada --- config/base/kustomization.yaml | 7 +++ config/base/params.env | 1 + controllers/tas/config_maps.go | 38 +++++++++++ controllers/tas/constants.go | 5 +- controllers/tas/deployment.go | 16 ++++- controllers/tas/inference_services.go | 36 ++++++++--- controllers/tas/secrets.go | 38 ++++++----- .../templates/service/deployment.tmpl.yaml | 17 ++++- tests/Dockerfile | 22 ++----- tests/scripts/install.sh | 63 ++++++++++++------- 10 files changed, 177 insertions(+), 66 deletions(-) diff --git a/config/base/kustomization.yaml b/config/base/kustomization.yaml index ab0b973..f50d41d 100644 --- a/config/base/kustomization.yaml +++ b/config/base/kustomization.yaml @@ -41,3 +41,10 @@ vars: apiVersion: v1 fieldref: fieldpath: data.oauthProxyImage + - name: kServeServerless + objref: + kind: ConfigMap + name: config + apiVersion: v1 + fieldref: + fieldpath: data.kServeServerless diff --git a/config/base/params.env b/config/base/params.env index 68d67aa..a0f5419 100644 --- a/config/base/params.env +++ b/config/base/params.env @@ -1,3 +1,4 @@ trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 +kServeServerless=disabled \ No newline at end of file diff --git a/controllers/tas/config_maps.go b/controllers/tas/config_maps.go index 5e656ca..e313352 100644 --- a/controllers/tas/config_maps.go +++ b/controllers/tas/config_maps.go @@ -48,6 +48,44 @@ func (r *TrustyAIServiceReconciler) getImageFromConfigMap(ctx context.Context, k } } +// getKServeServerlessConfig checks the kServeServerless value in a ConfigMap in the operator's namespace +func (r *TrustyAIServiceReconciler) getKServeServerlessConfig(ctx context.Context) (bool, error) { + + if r.Namespace != "" { + // Define the key for the ConfigMap + configMapKey := types.NamespacedName{ + Namespace: r.Namespace, + Name: constants.ConfigMap, + } + + // Create an empty ConfigMap object + var cm corev1.ConfigMap + + // Try to get the ConfigMap + if err := r.Get(ctx, configMapKey, &cm); err != nil { + if errors.IsNotFound(err) { + // ConfigMap not found, return false as the default behavior + return false, nil + } + // Other error occurred when trying to fetch the ConfigMap + return false, fmt.Errorf("error reading configmap %s", configMapKey) + } + + // ConfigMap is found, extract the kServeServerless value + kServeServerless, ok := cm.Data[configMapkServeServerlessKey] + + if !ok || kServeServerless != "enabled" { + // Key is missing or its value is not "enabled", return false + return false, nil + } + + // kServeServerless is "enabled" + return true, nil + } else { + return false, nil + } +} + // getConfigMapNamesWithLabel retrieves the names of ConfigMaps that have the specified label func (r *TrustyAIServiceReconciler) getConfigMapNamesWithLabel(ctx context.Context, namespace string, labelSelector client.MatchingLabels) ([]string, error) { configMapList := &corev1.ConfigMapList{} diff --git a/controllers/tas/constants.go b/controllers/tas/constants.go index deafafe..64f3c5b 100644 --- a/controllers/tas/constants.go +++ b/controllers/tas/constants.go @@ -27,8 +27,9 @@ const ( // Configuration constants const ( - configMapOAuthProxyImageKey = "oauthProxyImage" - configMapServiceImageKey = "trustyaiServiceImage" + configMapOAuthProxyImageKey = "oauthProxyImage" + configMapServiceImageKey = "trustyaiServiceImage" + configMapkServeServerlessKey = "kServeServerless" ) // OAuth constants diff --git a/controllers/tas/deployment.go b/controllers/tas/deployment.go index 7119a4c..87629c4 100644 --- a/controllers/tas/deployment.go +++ b/controllers/tas/deployment.go @@ -40,6 +40,7 @@ type DeploymentConfig struct { CustomCertificatesBundle CustomCertificatesBundle Version string BatchSize int + UseDBTLSCerts bool } // createDeploymentObject returns a Deployment for the TrustyAI Service instance @@ -69,7 +70,20 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context, PVCClaimName: pvcName, CustomCertificatesBundle: caBunble, Version: constants.Version, - BatchSize: batchSize, + } + + if instance.Spec.Storage.IsStorageDatabase() { + _, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace) + if err != nil { + deploymentConfig.UseDBTLSCerts = false + log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found") + } else { + deploymentConfig.UseDBTLSCerts = true + log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls") + } + } else { + deploymentConfig.UseDBTLSCerts = false + log.FromContext(ctx).Info("No need to check database secrets. Using PVC-mode.") } var deployment *appsv1.Deployment diff --git a/controllers/tas/inference_services.go b/controllers/tas/inference_services.go index 854bdbb..bd5f012 100644 --- a/controllers/tas/inference_services.go +++ b/controllers/tas/inference_services.go @@ -15,6 +15,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" ) +const ( + DEPLOYMENT_MODE_MODELMESH = "ModelMesh" + DEPLOYMENT_MODE_RAW = "RawDeployment" + DEPLOYMENT_MODE_SERVERLESS = "Serverless" +) + // GetDeploymentsByLabel returns a list of Deployments that match a label key-value pair func (r *TrustyAIServiceReconciler) GetDeploymentsByLabel(ctx context.Context, namespace string, labelKey string, labelValue string) ([]appsv1.Deployment, error) { // Prepare a DeploymentList object @@ -213,23 +219,37 @@ func (r *TrustyAIServiceReconciler) handleInferenceServices(ctx context.Context, return false, err } + kServeServerlessEnabled, err := r.getKServeServerlessConfig(ctx) + if err != nil { + log.FromContext(ctx).Error(err, "Could not read KServeServerless configuration. Defaulting to disabled") + kServeServerlessEnabled = false + } + if len(inferenceServices.Items) == 0 { return true, nil } for _, infService := range inferenceServices.Items { annotations := infService.GetAnnotations() - // Check the annotation "serving.kserve.io/deploymentMode: ModelMesh" - if val, ok := annotations["serving.kserve.io/deploymentMode"]; ok && val == "ModelMesh" { - shouldContinue, err := r.patchEnvVarsByLabelForDeployments(ctx, instance, namespace, labelKey, labelValue, envVarName, crName, remove) - if err != nil { - log.FromContext(ctx).Error(err, "Could not patch environment variables for ModelMesh deployments.") - return shouldContinue, err + + // Check the annotation "serving.kserve.io/deploymentMode" + if val, ok := annotations["serving.kserve.io/deploymentMode"]; ok { + if val == DEPLOYMENT_MODE_RAW { + log.FromContext(ctx).Info("RawDeployment mode not supported by TrustyAI") + continue + } else if val == DEPLOYMENT_MODE_MODELMESH { + shouldContinue, err := r.patchEnvVarsByLabelForDeployments(ctx, instance, namespace, labelKey, labelValue, envVarName, crName, remove) + if err != nil { + log.FromContext(ctx).Error(err, "could not patch environment variables for ModelMesh deployments") + return shouldContinue, err + } + continue } - } else { + } + if kServeServerlessEnabled { err := r.patchKServe(ctx, instance, infService, namespace, crName, remove) if err != nil { - log.FromContext(ctx).Error(err, "Could not path InferenceLogger for KServe deployment.") + log.FromContext(ctx).Error(err, "could not patch InferenceLogger for KServe deployment") return false, err } } diff --git a/controllers/tas/secrets.go b/controllers/tas/secrets.go index 5a3ad92..a74b884 100644 --- a/controllers/tas/secrets.go +++ b/controllers/tas/secrets.go @@ -10,34 +10,42 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +// getSecret retrieves a secret if it exists, returns an error if not +func (r *TrustyAIServiceReconciler) getSecret(ctx context.Context, name, namespace string) (*corev1.Secret, error) { + secret := &corev1.Secret{} + err := r.Get(ctx, client.ObjectKey{Name: name, Namespace: namespace}, secret) + if err != nil { + if errors.IsNotFound(err) { + return nil, fmt.Errorf("secret %s not found in namespace %s: %w", name, namespace, err) + } + return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", name, namespace, err) + } + return secret, nil +} + // findDatabaseSecret finds the DB configuration secret named (specified or default) in the same namespace as the CR func (r *TrustyAIServiceReconciler) findDatabaseSecret(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (*corev1.Secret, error) { databaseConfigurationsName := instance.Spec.Storage.DatabaseConfigurations defaultDatabaseConfigurationsName := instance.Name + dbCredentialsSuffix - secret := &corev1.Secret{} - if databaseConfigurationsName != "" { - secret := &corev1.Secret{} - err := r.Get(ctx, client.ObjectKey{Name: databaseConfigurationsName, Namespace: instance.Namespace}, secret) - if err == nil { - return secret, nil + secret, err := r.getSecret(ctx, databaseConfigurationsName, instance.Namespace) + if err != nil { + return nil, err } - if !errors.IsNotFound(err) { - return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", databaseConfigurationsName, instance.Namespace, err) + if secret != nil { + return secret, nil } } else { // If specified not found, try the default - - err := r.Get(ctx, client.ObjectKey{Name: defaultDatabaseConfigurationsName, Namespace: instance.Namespace}, secret) - if err == nil { - return secret, nil + secret, err := r.getSecret(ctx, defaultDatabaseConfigurationsName, instance.Namespace) + if err != nil { + return nil, err } - if !errors.IsNotFound(err) { - return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", defaultDatabaseConfigurationsName, instance.Namespace, err) + if secret != nil { + return secret, nil } - } return nil, fmt.Errorf("neither secret %s nor %s found in namespace %s", databaseConfigurationsName, defaultDatabaseConfigurationsName, instance.Namespace) diff --git a/controllers/tas/templates/service/deployment.tmpl.yaml b/controllers/tas/templates/service/deployment.tmpl.yaml index 840421f..fa8a9c7 100644 --- a/controllers/tas/templates/service/deployment.tmpl.yaml +++ b/controllers/tas/templates/service/deployment.tmpl.yaml @@ -94,7 +94,11 @@ spec: name: {{ .Instance.Spec.Storage.DatabaseConfigurations }} key: databasePort - name: QUARKUS_DATASOURCE_JDBC_URL + {{ if .UseDBTLSCerts }} + value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database?sslMode=verify-ca&serverSslCert=/etc/tls/db/tls.crt" + {{ else }} value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database" + {{ end }} - name: SERVICE_DATA_FORMAT value: "HIBERNATE" - name: QUARKUS_DATASOURCE_GENERATION @@ -121,7 +125,12 @@ spec: - name: {{ .VolumeMountName }} mountPath: {{ .Instance.Spec.Storage.Folder }} readOnly: false - {{ end }} + {{ end }} + {{ if .UseDBTLSCerts }} + - name: db-tls-certs + mountPath: /etc/tls/db + readOnly: true + {{ end }} - resources: limits: cpu: 100m @@ -209,3 +218,9 @@ spec: secret: secretName: {{ .Instance.Name }}-internal defaultMode: 420 + {{ if .UseDBTLSCerts }} + - name: db-tls-certs + secret: + secretName: {{ .Instance.Name }}-db-tls + defaultMode: 420 + {{ end }} diff --git a/tests/Dockerfile b/tests/Dockerfile index 9465467..e39ebdf 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -1,18 +1,17 @@ -FROM registry.access.redhat.com/ubi8:8.10-901.1716497712 +FROM registry.access.redhat.com/ubi8:8.10-1020 ARG ORG=trustyai-explainability ARG BRANCH=main ARG ODS_CI_REPO=https://github.com/red-hat-data-services/ods-ci # This git reference should always reference a stable commit from ods-ci that supports ODH # This hash corresponds to a March 24th, 2023 commit -ARG ODS_CI_GITREF=867a617bc224726cf98fa3354293f8e50b4f5eb5 -ARG OC_CLI_URL=https://mirror.openshift.com/pub/openshift-v4/amd64/clients/ocp/latest/openshift-client-linux.tar.gz +ARG ODS_CI_GITREF=a8cf770b37caa4ef7ce6596acc8bdd6866cc7772 +ARG OC_CLI_URL=https://mirror.openshift.com/pub/openshift-v4/amd64/clients/ocp/4.14.33/openshift-client-linux.tar.gz ENV HOME /root WORKDIR /root -RUN dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm &&\ - dnf install -y jq bc git go-toolset python3.11 python3.11-pip python3.11-devel unzip && \ +RUN dnf install -y jq bc git go-toolset python3.11 python3.11-devel python3.11-pip unzip && \ dnf clean all && \ git clone https://github.com/opendatahub-io/peak $HOME/peak && \ cd $HOME/peak && \ @@ -25,9 +24,6 @@ RUN curl -L https://github.com/mikefarah/yq/releases/download/v4.25.1/yq_linux_a RUN mkdir -p $HOME/src && \ cd $HOME/src && \ git clone --depth=1 --branch ${BRANCH} https://github.com/${ORG}/trustyai-explainability && \ - # Clone ods-ci repo at specified git ref for the ODH Dashboard webUI tests - git clone --depth=1 ${ODS_CI_REPO} ods-ci && cd ods-ci && \ - git fetch origin ${ODS_CI_GITREF} && git checkout FETCH_HEAD && \ chmod -R 777 $HOME/src # Use a specific destination file name in case the url download name changes @@ -37,16 +33,6 @@ RUN tar -C /usr/local/bin -xvf $HOME/peak/oc-cli.tar.gz && \ COPY Pipfile Pipfile.lock $HOME/peak/ -RUN pip3 install micropipenv &&\ - ln -s `which pip3` /usr/bin/pip &&\ - cd $HOME/peak &&\ - micropipenv install - -# Install poetry to support the exeuction of ods-ci test framework -RUN curl -sSL https://install.python-poetry.org | python3 - -ENV PATH="${PATH}:$HOME/.local/bin" -RUN cd $HOME/src/ods-ci && poetry install - ## Grab CI scripts from single-source-of-truth RUN mkdir -p $HOME/peak/operator-tests/trustyai-explainability/ &&\ mkdir $HOME/kfdef/ &&\ diff --git a/tests/scripts/install.sh b/tests/scripts/install.sh index 46eb7d2..cc47a54 100755 --- a/tests/scripts/install.sh +++ b/tests/scripts/install.sh @@ -14,14 +14,44 @@ if ! [ -z "${SKIP_OPERATOR_INSTALL}" ]; then ./setup.sh -t ~/peak/operatorsetup 2>&1 else echo "Installing operator from community marketplace" - while [[ $retry -gt 0 ]]; do - # patch bug in peak setup script - sed -i "s/path=\"{.status.channels.*/ | jq '.status.channels | .[0].currentCSVDesc.installModes | map(select(.type == \"AllNamespaces\")) | .[0].supported')/" setup.sh - sed -i "s/csource=.*/echo \$3; csource=\$3/" setup.sh - sed -i 's/installop \$.*/installop \${vals[0]} \${vals[1]} \${vals[3]}/' setup.sh + start_t=$(date +%s) 2>&1 + ready=false 2>&1 + while ! $ready; do + CATALOG_SOURCES=$(oc get catalogsources -n openshift-marketplace 2> /dev/null | grep 'community-operators') + if [ ! -z "${CATALOG_SOURCES}" ]; then + echo $CATALOG_SOURCES + ready=true 2>&1 + else + sleep 10 + fi + if [ $(($(date +%s)-start_t)) -gt 300 ]; then + echo "Marketplace pods never started" + exit 1 + fi + done + + start_t=$(date +%s) 2>&1 + ready=false 2>&1 + while ! $ready; do + MANIFESTS=$(oc get packagemanifests -n openshift-marketplace 2> /dev/null | grep 'opendatahub') + echo $MANIFESTS + if [ ! -z "${MANIFESTS}" ]; then + echo $MANIFESTS + ready=true 2>&1 + else + sleep 10 + fi + if [ $(($(date +%s)-start_t)) -gt 900 ]; then + echo "Package manifests never downloaded" + exit 1 + fi + done + + while [[ $retry -gt 0 ]]; do + ./setup.sh -o ~/peak/operatorsetup\ - ./setup.sh -o ~/peak/operatorsetup + # approve installplans if [ $? -eq 0 ]; then retry=-1 else @@ -31,11 +61,16 @@ else fi retry=$(( retry - 1)) + sleep 30 + echo "Approving Install Plans, if needed" + oc patch installplan $(oc get installplan -n openshift-operators | grep $ODH_VERSION | awk '{print $1}') -n openshift-operators --type merge --patch '{"spec":{"approved":true}}' || true + oc patch installplan $(oc get installplan -n openshift-operators | grep authorino | awk '{print $1}') -n openshift-operators --type merge --patch '{"spec":{"approved":true}}' || true + finished=false 2>&1 start_t=$(date +%s) 2>&1 echo "Verifying installation of ODH operator" while ! $finished; do - if [ ! -z "$(oc get pods -n openshift-operators | grep 'opendatahub-operator-controller-manager' | grep '1/1')" ]; then + if [ ! -z "$(oc get pods -n openshift-operators | grep 'opendatahub-operator-controller-manager' | grep '1/1')" ]; then finished=true 2>&1 else sleep 10 @@ -50,20 +85,6 @@ else done fi -#popd -### Grabbing and applying the patch in the PR we are testing -#pushd ~/src/${REPO_NAME} -#if [ -z "$PULL_NUMBER" ]; then -# echo "No pull number, assuming nightly run" -#else -# if [ $REPO_OWNER == "trustyai-explainability" ]; then -# curl -O -L https://github.com/${REPO_OWNER}/${REPO_NAME}/pull/${PULL_NUMBER}.patch -# echo "Applying followng patch:" -# cat ${PULL_NUMBER}.patch > ${ARTIFACT_DIR}/github-pr-${PULL_NUMBER}.patch -# git apply ${PULL_NUMBER}.patch -# fi -#fi - popd ## Point manifests repo uri in the KFDEF to the manifests in the PR pushd ~/kfdef