From aeda7455cd7e5cf509a0b0678419bd62d8ebd452 Mon Sep 17 00:00:00 2001
From: rcarrata
Date: Fri, 27 Sep 2024 21:51:10 +0200
Subject: [PATCH] add inferenceservice, job and sr

---
 .../inference-service-granite-modelcar.yaml   | 38 +++++++++++
 .../ic-shared-llm/job-enable-modelcar.yaml    | 49 +++++++++++++
 .../rbac-job-enable-modelcar.yaml             | 68 +++++++++++++++++++
 ...service-runtime-vllm-granite-modelcar.yaml | 50 ++++++++++++++
 4 files changed, 205 insertions(+)
 create mode 100644 bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml
 create mode 100644 bootstrap/ic-shared-llm/job-enable-modelcar.yaml
 create mode 100644 bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml
 create mode 100644 bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml

diff --git a/bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml b/bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml
new file mode 100644
index 00000000..b4817bc0
--- /dev/null
+++ b/bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml
@@ -0,0 +1,38 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  annotations:
+    openshift.io/display-name: granite-7b-instruct
+    serving.knative.openshift.io/enablePassthrough: 'true'
+    sidecar.istio.io/inject: 'true'
+    sidecar.istio.io/rewriteAppHTTPProbers: 'true'
+    argocd.argoproj.io/sync-wave: "2"
+    argocd.argoproj.io/compare-options: IgnoreExtraneous
+    argocd.argoproj.io/sync-options: Prune=false
+  name: granite-7b-instruct
+  namespace: ic-shared-llm
+  labels:
+    opendatahub.io/dashboard: 'true'
+spec:
+  predictor:
+    maxReplicas: 1
+    minReplicas: 1
+    model:
+      modelFormat:
+        name: vLLM
+      name: ''
+      resources:
+        limits:
+          cpu: '6'
+          memory: 24Gi
+          nvidia.com/gpu: '1'
+        requests:
+          cpu: '1'
+          memory: 8Gi
+          nvidia.com/gpu: '1'
+      runtime: vllm
+      storageUri: oci://quay.io/rh-aiservices-bu/granite-7b-instruct-modelcar:0.1
+    tolerations:
+    - effect: NoSchedule
+      key: nvidia.com/gpu
+      operator: Exists
diff --git a/bootstrap/ic-shared-llm/job-enable-modelcar.yaml b/bootstrap/ic-shared-llm/job-enable-modelcar.yaml
new file mode 100644
index 00000000..7a90611c
--- /dev/null
+++ b/bootstrap/ic-shared-llm/job-enable-modelcar.yaml
@@ -0,0 +1,49 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: patch-inferenceservice-config
+  namespace: ic-shared-llm
+  annotations:
+    argocd.argoproj.io/sync-wave: "1"
+    argocd.argoproj.io/hook: Sync
+    argocd.argoproj.io/hook-delete-policy: HookSucceeded
+spec:
+  backoffLimit: 4
+  template:
+    spec:
+      serviceAccount: modelcar-enable-sa
+      serviceAccountName: modelcar-enable-sa
+      containers:
+      - name: patch-configmap
+        image: registry.redhat.io/openshift4/ose-cli:v4.15.0
+        command: ["/bin/sh", "-c"]
+        args:
+        - |
+          # Wait for the operator to be in "Ready" state
+          echo "Waiting for the operator to be Ready..."
+          until [ "$(oc get dsci -n redhat-ods-applications default-dsci -o jsonpath='{.status.phase}')" = "Ready" ]; do
+            echo "Operator not ready, retrying in 10s..."
+            sleep 10
+          done
+          echo "Operator is Ready!"
+
+          # Fetch current storageInitializer config
+          config=$(oc get configmap inferenceservice-config -n redhat-ods-applications -o jsonpath='{.data.storageInitializer}')
+
+          # Check if "enableModelcar" is already enabled
+          if echo "$config" | grep '"enableModelcar": false'; then
+            echo "Patching configmap to enable modelcar..."
+
+            # Modify the config to enable modelcar using sed
+            newValue=$(echo "$config" | sed 's/"enableModelcar": false/"enableModelcar": true/')
+            newValueEscaped=$(echo "$newValue" | sed 's/\"/\\\"/g')
+
+            # Patch the configmap with the new value
+            oc patch configmap inferenceservice-config -n redhat-ods-applications --type='json' -p "[{\"op\": \"replace\", \"path\": \"/data/storageInitializer\", \"value\": \"$newValueEscaped\"}]"
+          else
+            echo "Modelcar is already enabled, no patching needed."
+          fi
+
+          # Restart the KServe controller to apply changes
+          oc delete pod -n redhat-ods-applications -l control-plane=kserve-controller-manager
+      restartPolicy: OnFailure
diff --git a/bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml b/bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml
new file mode 100644
index 00000000..33c1e457
--- /dev/null
+++ b/bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml
@@ -0,0 +1,68 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: modelcar-enable-sa
+  namespace: ic-shared-llm
+  annotations:
+    argocd.argoproj.io/sync-wave: "0"
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: modelcar-enable-patch-role
+  namespace: redhat-ods-applications
+  annotations:
+    argocd.argoproj.io/sync-wave: "0"
+rules:
+- apiGroups: ["dscinitialization.opendatahub.io"]
+  resources: ["dscinitializations"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources: ["configmaps"]
+  verbs: ["get", "patch"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "list", "delete"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: modelcar-enable-patch-rolebinding
+  namespace: redhat-ods-applications
+  annotations:
+    argocd.argoproj.io/sync-wave: "0"
+subjects:
+- kind: ServiceAccount
+  name: modelcar-enable-sa
+  namespace: ic-shared-llm
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: modelcar-enable-patch-role # Fixed to bind the correct Role
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: modelcar-dsc-read
+  annotations:
+    argocd.argoproj.io/sync-wave: "0"
+rules:
+- apiGroups: ["dscinitialization.opendatahub.io"]
+  resources: ["dscinitializations"]
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: modelcar-dsc-read-binding
+  annotations:
+    argocd.argoproj.io/sync-wave: "0"
+subjects:
+- kind: ServiceAccount
+  name: modelcar-enable-sa
+  namespace: ic-shared-llm
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: modelcar-dsc-read
diff --git a/bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml b/bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml
new file mode 100644
index 00000000..26c01bf1
--- /dev/null
+++ b/bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml
@@ -0,0 +1,50 @@
+---
+apiVersion: serving.kserve.io/v1alpha1
+kind: ServingRuntime
+metadata:
+  annotations:
+    opendatahub.io/accelerator-name: migrated-gpu
+    opendatahub.io/apiProtocol: REST
+    opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
+    opendatahub.io/template-display-name: vLLM ServingRuntime for KServe
+    opendatahub.io/template-name: vllm-runtime
+    openshift.io/display-name: vllm
+    argocd.argoproj.io/sync-wave: "2"
+  name: vllm
+  namespace: ic-shared-llm
+  labels:
+    opendatahub.io/dashboard: 'true'
+spec:
+  annotations:
+    prometheus.io/path: /metrics
+    prometheus.io/port: '8080'
+  containers:
+  - args:
+    - '--port=8080'
+    - '--model=/mnt/models'
+    - '--served-model-name={{.Name}}'
+    - '--distributed-executor-backend=mp'
+    command:
+    - python
+    - '-m'
+    - vllm.entrypoints.openai.api_server
+    env:
+    - name: HF_HOME
+      value: /tmp/hf_home
+    image: 'quay.io/modh/vllm@sha256:b51fde66f162f1a78e8c027320dddf214732d5345953b1599a84fe0f0168c619'
+    name: kserve-container
+    ports:
+    - containerPort: 8080
+      protocol: TCP
+    volumeMounts:
+    - mountPath: /dev/shm
+      name: shm
+  multiModel: false
+  supportedModelFormats:
+  - autoSelect: true
+    name: vLLM
+  volumes:
+  - emptyDir:
+      medium: Memory
+      sizeLimit: 2Gi
+    name: shm