From 78d2f94261a6f45f39d694061c4fb2658374d994 Mon Sep 17 00:00:00 2001
From: vincent
Date: Fri, 29 Dec 2023 12:47:06 +0800
Subject: [PATCH] Add k8s with llama server example

Signed-off-by: vincent
---
 .../kubernetes-containerd-llama-server.yml | 85 +++++++++++++++++++
 README.md                                  |  1 +
 k8s_containerd_llama/README.md             | 41 +++++++++
 .../llama_server_application.sh            | 79 +++++++++++++++++
 4 files changed, 206 insertions(+)
 create mode 100644 .github/workflows/kubernetes-containerd-llama-server.yml
 create mode 100644 k8s_containerd_llama/README.md
 create mode 100755 k8s_containerd_llama/llama_server_application.sh

diff --git a/.github/workflows/kubernetes-containerd-llama-server.yml b/.github/workflows/kubernetes-containerd-llama-server.yml
new file mode 100644
index 0000000..0ef75b4
--- /dev/null
+++ b/.github/workflows/kubernetes-containerd-llama-server.yml
@@ -0,0 +1,85 @@
+name: k8s containerd LLAMA service test
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
+  cancel-in-progress: true
+
+on:
+  workflow_dispatch:
+    inputs:
+      logLevel:
+        description: 'Log level'
+        required: true
+        default: 'info'
+  push:
+    branches: [ main ]
+    paths-ignore:
+      - '**/README.md'
+  pull_request:
+    branches: [ main ]
+    paths-ignore:
+      - '**/README.md'
+  schedule:
+    - cron: "0 0 */1 * *"
+
+jobs:
+  run:
+    runs-on: ubuntu-20.04
+    name: Run ggml plugin example
+
+    steps:
+    - uses: actions/checkout@v3
+      with:
+        fetch-depth: 0
+
+    - name: Install apt-get packages
+      run: |
+        sudo ACCEPT_EULA=Y apt-get update
+        sudo ACCEPT_EULA=Y apt-get upgrade
+        sudo ACCEPT_EULA=Y apt-get install git wget jq
+
+    - name: Install containerd, WasmEdge, and crun with support for plugins and nn-preload
+      run: |
+        sed 's|https://github.com/containers/crun|-b enable-wasmedge-plugin https://github.com/second-state/crun|g' containerd/install.sh | bash
+
+    - name: Install and start k8s
+      run: |
+        bash kubernetes_containerd/install.sh > k8s.log 2>&1
+
+    - name: Install the wasi_nn-ggml plugin and copy its system dependencies into the same path for the container environment
+      run: |
+        curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- --plugins wasi_nn-ggml
+        wget -qO- https://raw.githubusercontent.com/second-state/runwasi/main/release/utils/copy_sys_dependencies.sh | bash -s $HOME/.wasmedge/plugin/libwasmedgePluginWasiNN.so $HOME/.wasmedge/plugin/
+
+    - name: Download the LLM model
+      run: |
+        curl -LO https://huggingface.co/second-state/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf
+
+    - name: Sleep for 1200s
+      run: sleep 1200s
+      shell: bash
+
+    - name: Dump the k8s setup log
+      run: |
+        cat k8s.log
+
+    - name: Run the LLM API service in k8s
+      continue-on-error: true
+      run: |
+        bash k8s_containerd_llama/llama_server_application.sh >> dump.log 2>&1
+
+    - name: Test the API server pod created by the kubectl run command
+      continue-on-error: true
+      run: |
+        curl -X POST http://localhost:8080/v1/models -H 'accept:application/json'
+        curl -X POST http://localhost:8080/v1/chat/completions -H 'accept:application/json' -H 'Content-Type: application/json' -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "Who is Robert Oppenheimer?"}], "model":"llama-2-chat"}' | jq .
+        curl -X POST http://localhost:8080/v1/chat/completions -H 'accept:application/json' -H 'Content-Type: application/json' -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "What new discoveries from the James Webb Space Telescope can I tell my nine-year-old about?"}], "model":"llama-2-chat"}' | jq .
+
+    - name: Display crun and WasmEdge versions
+      run: |
+        crun --version
+        wasmedge --version
+
+    - name: Dump the execution log
+      run: |
+        cat dump.log
\ No newline at end of file
diff --git a/README.md b/README.md
index 3fd352f..483850c 100644
--- a/README.md
+++ b/README.md
@@ -20,3 +20,4 @@ how WasmEdge applications work side by side with Linux containers.
 * Containerd [Quick start](containerd/http_server/README.md) | [Github Action](.github/workflows/containerd-server.yml) | [Successful run](https://github.com/second-state/wasmedge-containers-examples/runs/4328916842?check_suite_focus=true#step:4:86)
 * Kubernetes + CRI-O [Quick start](kubernetes_crio/http_server/README.md) | [Github Action](.github/workflows/kubernetes-crio-server.yml) | [Successful run](https://github.com/second-state/wasmedge-containers-examples/runs/4577323886?check_suite_focus=true#step:6:3041)
 * Kubernetes + containerd [Quick start](kubernetes_containerd/http_server/README.md) | [Github Action](.github/workflows/kubernetes-containerd-server.yml) | [Successful run](https://github.com/second-state/wasmedge-containers-examples/runs/4577323891?check_suite_focus=true#step:6:3013)
+* Kubernetes + containerd + llama [Quick start](k8s_containerd_llama/README.md) | [Github Action](.github/workflows/kubernetes-containerd-llama-server.yml) | [Successful run](https://github.com/second-state/wasmedge-containers-examples/actions)
\ No newline at end of file
diff --git a/k8s_containerd_llama/README.md b/k8s_containerd_llama/README.md
new file mode 100644
index 0000000..eeb2b8f
--- /dev/null
+++ b/k8s_containerd_llama/README.md
@@ -0,0 +1,41 @@
+# Run a WasmEdge LLAMA chat server app with Containerd over Kubernetes
+
+## Environment
+
+We use `Ubuntu 20.04 x86_64` in the following example.
+
+## Install containerd, customized crun, and WasmEdge
+
+Reuse the install script from the containerd example, but point it at the experimental `enable-wasmedge-plugin` branch of crun, which builds crun with WasmEdge plugin support.
+
+```bash
+sed 's|https://github.com/containers/crun|-b enable-wasmedge-plugin https://github.com/second-state/crun|g' containerd/install.sh | bash
+```
+
+## Install k8s
+
+Reuse the install script from the Kubernetes + containerd example.
+
+```bash
+bash kubernetes_containerd/install.sh
+```
+
+## Run the LLAMA chat server app
+
+The [llama_server_application.sh](./llama_server_application.sh) script shows how to pull a Wasm container image that uses the WASI-NN GGML plugin from the GitHub Container Registry (ghcr.io), and then run it as a containerized application in Kubernetes. The pod mounts the current working directory as `/resource`, so download the `llama-2-7b-chat.Q5_K_M.gguf` model into it first (see the "Download the LLM model" step in the GitHub Actions workflow).
+
+```bash
+bash k8s_containerd_llama/llama_server_application.sh
+```
+
+Test the API service from another session:
+
+```bash
+curl -X POST http://localhost:8080/v1/chat/completions -H 'accept:application/json' -H 'Content-Type: application/json' -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "Who is Robert Oppenheimer?"}], "model":"llama-2-chat"}' | jq .
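+
+# Optionally, list the models the API server exposes. This mirrors the request
+# the CI workflow sends to the same endpoint.
+curl -X POST http://localhost:8080/v1/models -H 'accept:application/json'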
+```
+
+Check that the API server pod is up and running:
+
+```bash
+sudo ./kubernetes/cluster/kubectl.sh get pod --all-namespaces -o wide
+```
+
+[Learn more](https://wasmedge.org/book/en/kubernetes/kubernetes/kubernetes-containerd.html)
diff --git a/k8s_containerd_llama/llama_server_application.sh b/k8s_containerd_llama/llama_server_application.sh
new file mode 100755
index 0000000..a6846ac
--- /dev/null
+++ b/k8s_containerd_llama/llama_server_application.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+set -x # Enable verbose output for debugging
+export KUBERNETES_PROVIDER=local
+export WASM_IMAGE=ghcr.io/second-state/runwasi-demo
+export WASM_IMAGE_TAG=llama-simple
+export VARIANT=compat-smart
+export CLUS_NAME=local
+export CRED_NAME=myself
+export SERVER=https://localhost:6443
+export CERT_AUTH=/var/run/kubernetes/server-ca.crt
+export CLIENT_KEY=/var/run/kubernetes/client-admin.key
+export CLIENT_CERT=/var/run/kubernetes/client-admin.crt
+
+
+sudo ./kubernetes/cluster/kubectl.sh config set-cluster "$CLUS_NAME" --server="$SERVER" --certificate-authority="$CERT_AUTH"
+sudo ./kubernetes/cluster/kubectl.sh config set-credentials $CRED_NAME --client-key="$CLIENT_KEY" --client-certificate="$CLIENT_CERT"
+sudo ./kubernetes/cluster/kubectl.sh config set-context "$CLUS_NAME" --cluster="$CLUS_NAME" --user="$CRED_NAME"
+sudo ./kubernetes/cluster/kubectl.sh config use-context "$CLUS_NAME"
+sudo ./kubernetes/cluster/kubectl.sh cluster-info
+
+sudo ./kubernetes/cluster/kubectl.sh run -i --restart=Never testggml --image=ghcr.io/captainvincent/runwasi-demo:llama-api-server --annotations="module.wasm.image/variant=compat-smart" --overrides='
+{
+  "apiVersion": "v1",
+  "kind": "Pod",
+  "metadata": {
+    "name": "testggml"
+  },
+  "spec": {
+    "hostNetwork": true,
+    "containers": [
+      {
+        "name": "simple",
+        "image": "ghcr.io/captainvincent/runwasi-demo:llama-api-server",
+        "command": ["/app.wasm", "-p", "llama-2-chat"],
+        "stdin": true,
+        "tty": true,
+        "env": [
+          {
+            "name": "WASMEDGE_PLUGIN_PATH",
+            "value": "/opt/containerd/lib"
+          },
+          {
+            "name": "WASMEDGE_WASINN_PRELOAD",
+            "value": "default:GGML:CPU:/resource/llama-2-7b-chat.Q5_K_M.gguf"
+          }
+        ],
+        "volumeMounts": [
+          {
+            "name": "plugins",
+            "mountPath": "/opt/containerd/lib"
+          },
+          {
+            "name": "model",
+            "mountPath": "/resource"
+          }
+        ]
+      }
+    ],
+    "volumes": [
+      {
+        "name": "plugins",
+        "hostPath": {
+          "path": "'"$HOME"'/.wasmedge/plugin/"
+        }
+      },
+      {
+        "name": "model",
+        "hostPath": {
+          "path": "'"$PWD"'"
+        }
+      }
+    ]
+  }
+}'
+
+echo -e "Wait 60s"
+sleep 60
+
+sudo ./kubernetes/cluster/kubectl.sh get pod --all-namespaces -o wide
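+
+# Optional extra check (not part of the original flow): once the pod is Running,
+# inspect the API server's startup log. The pod name "testggml" matches the
+# kubectl run command above.
+# sudo ./kubernetes/cluster/kubectl.sh logs testggml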