From 78d2f94261a6f45f39d694061c4fb2658374d994 Mon Sep 17 00:00:00 2001
From: vincent
Date: Fri, 29 Dec 2023 12:47:06 +0800
Subject: [PATCH] Add k8s with llama server example

Signed-off-by: vincent
---
 .../kubernetes-containerd-llama-server.yml | 85 +++++++++++++++++++
 README.md                                  |  1 +
 k8s_containerd_llama/README.md             | 41 +++++++++
 .../llama_server_application.sh            | 79 +++++++++++++++++
 4 files changed, 206 insertions(+)
 create mode 100644 .github/workflows/kubernetes-containerd-llama-server.yml
 create mode 100644 k8s_containerd_llama/README.md
 create mode 100755 k8s_containerd_llama/llama_server_application.sh

diff --git a/.github/workflows/kubernetes-containerd-llama-server.yml b/.github/workflows/kubernetes-containerd-llama-server.yml
new file mode 100644
index 0000000..0ef75b4
--- /dev/null
+++ b/.github/workflows/kubernetes-containerd-llama-server.yml
@@ -0,0 +1,85 @@
+name: k8s containerd LLAMA service test
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
+  cancel-in-progress: true
+
+on:
+  workflow_dispatch:
+    inputs:
+      logLevel:
+        description: 'Log level'
+        required: true
+        default: 'info'
+  push:
+    branches: [ main ]
+    paths-ignore:
+      - '**/README.md'
+  pull_request:
+    branches: [ main ]
+    paths-ignore:
+      - '**/README.md'
+  schedule:
+    - cron: "0 0 */1 * *"
+
+jobs:
+  run:
+    runs-on: ubuntu-20.04
+    name: Run ggml plugin example
+
+    steps:
+    - uses: actions/checkout@v3
+      with:
+        fetch-depth: 0
+
+    - name: Install apt-get packages
+      run: |
+        sudo ACCEPT_EULA=Y apt-get update
+        sudo ACCEPT_EULA=Y apt-get upgrade
+        sudo ACCEPT_EULA=Y apt-get install git wget jq
+
+    - name: Install containerd, WasmEdge, and crun with support for plugins and nn-preload
+      run: |
+        sed 's|https://github.com/containers/crun|-b enable-wasmedge-plugin https://github.com/second-state/crun|g' containerd/install.sh | bash
+
+    - name: Install and start k8s
+      run: |
+        bash kubernetes_containerd/install.sh > k8s.log 2>&1
+
+    - name: Install the wasi_nn-ggml plugin and copy its system dependencies into the same path for the container environment
+      run: |
+        curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- --plugins wasi_nn-ggml
+        wget -qO- https://raw.githubusercontent.com/second-state/runwasi/main/release/utils/copy_sys_dependencies.sh | bash -s $HOME/.wasmedge/plugin/libwasmedgePluginWasiNN.so $HOME/.wasmedge/plugin/
+
+    - name: Download the LLM model
+      run: |
+        curl -LO https://huggingface.co/second-state/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf
+
+    - name: Sleep for 1200s
+      run: sleep 1200s
+      shell: bash
+
+    - name: Dump the k8s setup log
+      run: |
+        cat k8s.log
+
+    - name: Run the LLM API service in k8s
+      continue-on-error: true
+      run: |
+        bash k8s_containerd_llama/llama_server_application.sh >> dump.log 2>&1
+
+    - name: Test the API server pod created by the kubectl run command
+      continue-on-error: true
+      run: |
+        curl -X POST http://localhost:8080/v1/models -H 'accept:application/json'
+        curl -X POST http://localhost:8080/v1/chat/completions -H 'accept:application/json' -H 'Content-Type: application/json' -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "Who is Robert Oppenheimer?"}], "model":"llama-2-chat"}' | jq .
+        curl -X POST http://localhost:8080/v1/chat/completions -H 'accept:application/json' -H 'Content-Type: application/json' -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "What new discoveries from the James Webb Space Telescope can I tell my nine-year-old about?"}], "model":"llama-2-chat"}' | jq .
+
+    - name: Display crun and WasmEdge versions
+      run: |
+        crun --version
+        wasmedge --version
+
+    - name: Dump the execution log
+      run: |
+        cat dump.log
\ No newline at end of file
diff --git a/README.md b/README.md
index 3fd352f..483850c 100644
--- a/README.md
+++ b/README.md
@@ -20,3 +20,4 @@ how WasmEdge applications work side by side with Linux containers.
 * Containerd [Quick start](containerd/http_server/README.md) | [Github Action](.github/workflows/containerd-server.yml) | [Successful run](https://github.com/second-state/wasmedge-containers-examples/runs/4328916842?check_suite_focus=true#step:4:86)
 * Kubernetes + CRI-O [Quick start](kubernetes_crio/http_server/README.md) | [Github Action](.github/workflows/kubernetes-crio-server.yml) | [Successful run](https://github.com/second-state/wasmedge-containers-examples/runs/4577323886?check_suite_focus=true#step:6:3041)
 * Kubernetes + containerd [Quick start](kubernetes_containerd/http_server/README.md) | [Github Action](.github/workflows/kubernetes-containerd-server.yml) | [Successful run](https://github.com/second-state/wasmedge-containers-examples/runs/4577323891?check_suite_focus=true#step:6:3013)
+* Kubernetes + containerd + llama [Quick start](k8s_containerd_llama/README.md) | [Github Action](.github/workflows/kubernetes-containerd-llama-server.yml) | [Successful run](https://github.com/second-state/wasmedge-containers-examples/actions)
\ No newline at end of file
diff --git a/k8s_containerd_llama/README.md b/k8s_containerd_llama/README.md
new file mode 100644
index 0000000..eeb2b8f
--- /dev/null
+++ b/k8s_containerd_llama/README.md
@@ -0,0 +1,41 @@
+# Run a WasmEdge LLAMA chat server app with Containerd over Kubernetes
+
+## Environment
+
+We use `Ubuntu 20.04 x86_64` in the following example.
+
+## Install containerd, customized crun, and WasmEdge
+
+Reuse the install script from the containerd example, but point it at the experimental `enable-wasmedge-plugin` branch of crun, which builds crun with WasmEdge plugin support.
+
+```bash
+sed 's|https://github.com/containers/crun|-b enable-wasmedge-plugin https://github.com/second-state/crun|g' containerd/install.sh | bash
+```
+
+## Install k8s
+
+Reuse the install script from the Kubernetes + containerd example.
+
+```bash
+bash kubernetes_containerd/install.sh
+```
+
+## Run the LLAMA chat server app
+
+The [llama_server_application.sh](./llama_server_application.sh) script shows how to pull a Wasm container image that uses the WASI-NN GGML plugin from the GitHub Container Registry (ghcr.io), and then run it as a containerized application in Kubernetes. The pod mounts the current working directory as `/resource`, so download the `llama-2-7b-chat.Q5_K_M.gguf` model into it first (see the "Download the LLM model" step in the GitHub Actions workflow).
+
+```bash
+bash k8s_containerd_llama/llama_server_application.sh
+```
+
+Test the API service from another session:
+
+```bash
+curl -X POST http://localhost:8080/v1/chat/completions -H 'accept:application/json' -H 'Content-Type: application/json' -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "Who is Robert Oppenheimer?"}], "model":"llama-2-chat"}' | jq .
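+
+# Optionally, list the models the API server exposes. This mirrors the request
+# the CI workflow sends to the same endpoint.
+curl -X POST http://localhost:8080/v1/models -H 'accept:application/json'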
+```
+
+Check that the API server pod is up and running:
+
+```bash
+sudo ./kubernetes/cluster/kubectl.sh get pod --all-namespaces -o wide
+```
+
+[Learn more](https://wasmedge.org/book/en/kubernetes/kubernetes/kubernetes-containerd.html)
diff --git a/k8s_containerd_llama/llama_server_application.sh b/k8s_containerd_llama/llama_server_application.sh
new file mode 100755
index 0000000..a6846ac
--- /dev/null
+++ b/k8s_containerd_llama/llama_server_application.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+set -x # Enable verbose output for debugging
+export KUBERNETES_PROVIDER=local
+export WASM_IMAGE=ghcr.io/second-state/runwasi-demo
+export WASM_IMAGE_TAG=llama-simple
+export VARIANT=compat-smart
+export CLUS_NAME=local
+export CRED_NAME=myself
+export SERVER=https://localhost:6443
+export CERT_AUTH=/var/run/kubernetes/server-ca.crt
+export CLIENT_KEY=/var/run/kubernetes/client-admin.key
+export CLIENT_CERT=/var/run/kubernetes/client-admin.crt
+
+
+sudo ./kubernetes/cluster/kubectl.sh config set-cluster "$CLUS_NAME" --server="$SERVER" --certificate-authority="$CERT_AUTH"
+sudo ./kubernetes/cluster/kubectl.sh config set-credentials $CRED_NAME --client-key="$CLIENT_KEY" --client-certificate="$CLIENT_CERT"
+sudo ./kubernetes/cluster/kubectl.sh config set-context "$CLUS_NAME" --cluster="$CLUS_NAME" --user="$CRED_NAME"
+sudo ./kubernetes/cluster/kubectl.sh config use-context "$CLUS_NAME"
+sudo ./kubernetes/cluster/kubectl.sh cluster-info
+
+sudo ./kubernetes/cluster/kubectl.sh run -i --restart=Never testggml --image=ghcr.io/captainvincent/runwasi-demo:llama-api-server --annotations="module.wasm.image/variant=compat-smart" --overrides='
+{
+  "apiVersion": "v1",
+  "kind": "Pod",
+  "metadata": {
+    "name": "testggml"
+  },
+  "spec": {
+    "hostNetwork": true,
+    "containers": [
+      {
+        "name": "simple",
+        "image": "ghcr.io/captainvincent/runwasi-demo:llama-api-server",
+        "command": ["/app.wasm", "-p", "llama-2-chat"],
+        "stdin": true,
+        "tty": true,
+        "env": [
+          {
+            "name": "WASMEDGE_PLUGIN_PATH",
+            "value": "/opt/containerd/lib"
+          },
+          {
+            "name": "WASMEDGE_WASINN_PRELOAD",
+            "value": "default:GGML:CPU:/resource/llama-2-7b-chat.Q5_K_M.gguf"
+          }
+        ],
+        "volumeMounts": [
+          {
+            "name": "plugins",
+            "mountPath": "/opt/containerd/lib"
+          },
+          {
+            "name": "model",
+            "mountPath": "/resource"
+          }
+        ]
+      }
+    ],
+    "volumes": [
+      {
+        "name": "plugins",
+        "hostPath": {
+          "path": "'"$HOME"'/.wasmedge/plugin/"
+        }
+      },
+      {
+        "name": "model",
+        "hostPath": {
+          "path": "'"$PWD"'"
+        }
+      }
+    ]
+  }
+}'
+
+echo -e "Wait 60s"
+sleep 60
+
+sudo ./kubernetes/cluster/kubectl.sh get pod --all-namespaces -o wide
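+
+# Optional extra check (not part of the original flow): once the pod is Running,
+# inspect the API server's startup log. The pod name "testggml" matches the
+# kubectl run command above.
+# sudo ./kubernetes/cluster/kubectl.sh logs testggml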