From aeda7455cd7e5cf509a0b0678419bd62d8ebd452 Mon Sep 17 00:00:00 2001
From: rcarrata
Date: Fri, 27 Sep 2024 21:51:10 +0200
Subject: [PATCH] add inferenceservice, job and sr

---
 .../inference-service-granite-modelcar.yaml   | 38 +++++++++++
 .../ic-shared-llm/job-enable-modelcar.yaml    | 49 +++++++++++++
 .../rbac-job-enable-modelcar.yaml             | 68 +++++++++++++++++++
 ...service-runtime-vllm-granite-modelcar.yaml | 50 ++++++++++++++
 4 files changed, 205 insertions(+)
 create mode 100644 bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml
 create mode 100644 bootstrap/ic-shared-llm/job-enable-modelcar.yaml
 create mode 100644 bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml
 create mode 100644 bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml

diff --git a/bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml b/bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml
new file mode 100644
index 00000000..b4817bc0
--- /dev/null
+++ b/bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml
@@ -0,0 +1,38 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  annotations:
+    openshift.io/display-name: granite-7b-instruct
+    serving.knative.openshift.io/enablePassthrough: 'true'
+    sidecar.istio.io/inject: 'true'
+    sidecar.istio.io/rewriteAppHTTPProbers: 'true'
+    argocd.argoproj.io/sync-wave: "2"
+    argocd.argoproj.io/compare-options: IgnoreExtraneous
+    argocd.argoproj.io/sync-options: Prune=false
+  name: granite-7b-instruct
+  namespace: ic-shared-llm
+  labels:
+    opendatahub.io/dashboard: 'true'
+spec:
+  predictor:
+    maxReplicas: 1
+    minReplicas: 1
+    model:
+      modelFormat:
+        name: vLLM
+      name: ''
+      resources:
+        limits:
+          cpu: '6'
+          memory: 24Gi
+          nvidia.com/gpu: '1'
+        requests:
+          cpu: '1'
+          memory: 8Gi
+          nvidia.com/gpu: '1'
+      runtime: vllm
+      storageUri: oci://quay.io/rh-aiservices-bu/granite-7b-instruct-modelcar:0.1
+    tolerations:
+    - effect: NoSchedule
+      key: nvidia.com/gpu
+      operator: Exists
diff --git a/bootstrap/ic-shared-llm/job-enable-modelcar.yaml b/bootstrap/ic-shared-llm/job-enable-modelcar.yaml
new file mode 100644
index 00000000..7a90611c
--- /dev/null
+++ b/bootstrap/ic-shared-llm/job-enable-modelcar.yaml
@@ -0,0 +1,49 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: patch-inferenceservice-config
+  namespace: ic-shared-llm
+  annotations:
+    argocd.argoproj.io/sync-wave: "1"
+    argocd.argoproj.io/hook: Sync
+    argocd.argoproj.io/hook-delete-policy: HookSucceeded
+spec:
+  backoffLimit: 4
+  template:
+    spec:
+      serviceAccount: modelcar-enable-sa
+      serviceAccountName: modelcar-enable-sa
+      containers:
+      - name: patch-configmap
+        image: registry.redhat.io/openshift4/ose-cli:v4.15.0
+        command: ["/bin/sh", "-c"]
+        args:
+        - |
+          # Wait for the operator to be in "Ready" state
+          echo "Waiting for the operator to be Ready..."
+          until [ "$(oc get dsci -n redhat-ods-applications default-dsci -o jsonpath='{.status.phase}')" = "Ready" ]; do
+            echo "Operator not ready, retrying in 10s..."
+            sleep 10
+          done
+          echo "Operator is Ready!"
+
+          # Fetch current storageInitializer config
+          config=$(oc get configmap inferenceservice-config -n redhat-ods-applications -o jsonpath='{.data.storageInitializer}')
+
+          # Check if "enableModelcar" is already enabled
+          if echo "$config" | grep '"enableModelcar": false'; then
+            echo "Patching configmap to enable modelcar..."
+
+            # Modify the config to enable modelcar using sed
+            newValue=$(echo "$config" | sed 's/"enableModelcar": false/"enableModelcar": true/')
+            newValueEscaped=$(echo "$newValue" | sed 's/\"/\\\"/g')
+
+            # Patch the configmap with the new value
+            oc patch configmap inferenceservice-config -n redhat-ods-applications --type='json' -p "[{\"op\": \"replace\", \"path\": \"/data/storageInitializer\", \"value\": \"$newValueEscaped\"}]"
+          else
+            echo "Modelcar is already enabled, no patching needed."
+          fi
+
+          # Restart the KServe controller to apply changes
+          oc delete pod -n redhat-ods-applications -l control-plane=kserve-controller-manager
+      restartPolicy: OnFailure
diff --git a/bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml b/bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml
new file mode 100644
index 00000000..33c1e457
--- /dev/null
+++ b/bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml
@@ -0,0 +1,68 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: modelcar-enable-sa
+  namespace: ic-shared-llm
+  annotations:
+    argocd.argoproj.io/sync-wave: "0"
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: modelcar-enable-patch-role
+  namespace: redhat-ods-applications
+  annotations:
+    argocd.argoproj.io/sync-wave: "0"
+rules:
+- apiGroups: ["dscinitialization.opendatahub.io"]
+  resources: ["dscinitializations"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources: ["configmaps"]
+  verbs: ["get", "patch"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "list", "delete"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: modelcar-enable-patch-rolebinding
+  namespace: redhat-ods-applications
+  annotations:
+    argocd.argoproj.io/sync-wave: "0"
+subjects:
+- kind: ServiceAccount
+  name: modelcar-enable-sa
+  namespace: ic-shared-llm
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: modelcar-enable-patch-role # Fixed to bind the correct Role
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: modelcar-dsc-read
+  annotations:
+    argocd.argoproj.io/sync-wave: "0"
+rules:
+- apiGroups: ["dscinitialization.opendatahub.io"]
+  resources: ["dscinitializations"]
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: modelcar-dsc-read-binding
+  annotations:
+    argocd.argoproj.io/sync-wave: "0"
+subjects:
+- kind: ServiceAccount
+  name: modelcar-enable-sa
+  namespace: ic-shared-llm
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: modelcar-dsc-read
diff --git a/bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml b/bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml
new file mode 100644
index 00000000..26c01bf1
--- /dev/null
+++ b/bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml
@@ -0,0 +1,50 @@
+---
+apiVersion: serving.kserve.io/v1alpha1
+kind: ServingRuntime
+metadata:
+  annotations:
+    opendatahub.io/accelerator-name: migrated-gpu
+    opendatahub.io/apiProtocol: REST
+    opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
+    opendatahub.io/template-display-name: vLLM ServingRuntime for KServe
+    opendatahub.io/template-name: vllm-runtime
+    openshift.io/display-name: vllm
+    argocd.argoproj.io/sync-wave: "2"
+  name: vllm
+  namespace: ic-shared-llm
+  labels:
+    opendatahub.io/dashboard: 'true'
+spec:
+  annotations:
+    prometheus.io/path: /metrics
+    prometheus.io/port: '8080'
+  containers:
+  - args:
+    - '--port=8080'
+    - '--model=/mnt/models'
+    - '--served-model-name={{.Name}}'
+    - '--distributed-executor-backend=mp'
+    command:
+    - python
+    - '-m'
+    - vllm.entrypoints.openai.api_server
+    env:
+    - name: HF_HOME
+      value: /tmp/hf_home
+    image: 'quay.io/modh/vllm@sha256:b51fde66f162f1a78e8c027320dddf214732d5345953b1599a84fe0f0168c619'
+    name: kserve-container
+    ports:
+    - containerPort: 8080
+      protocol: TCP
+    volumeMounts:
+    - mountPath: /dev/shm
+      name: shm
+  multiModel: false
+  supportedModelFormats:
+  - autoSelect: true
+    name: vLLM
+  volumes:
+  - emptyDir:
+      medium: Memory
+      sizeLimit: 2Gi
+    name: shm