Add support for AMD MI300X GPU (#368)
* Add an image for `amd-gpu`
* Add resourceProfile for AMD MI300X
* Add a default values file for AMD GPU operator
* Add installation guide section for using AMD GPU operator

Benchmark of Llama 3.1 70B on 1 x AMD MI300X:
https://substratus.ai/blog/benchmarking-llama-3.1-70b-amd-mi300x
samos123 authored Jan 20, 2025
1 parent 063dfff commit 2673746
Showing 7 changed files with 148 additions and 6 deletions.
7 changes: 7 additions & 0 deletions charts/kubeai/values-amd-gpu-device-plugin.yaml
@@ -0,0 +1,7 @@
resourceProfiles:
amd-gpu-mi300x:
nodeSelector:
# Source: https://gitlab.freedesktop.org/mesa/drm/-/blob/main/data/amdgpu.ids#L569
amd.com/gpu.device-id: 74a1
amd.com/gpu.vram: "192G"
amd.com/gpu.family: "AI"
13 changes: 13 additions & 0 deletions charts/kubeai/values.yaml
@@ -48,6 +48,10 @@ modelServers:
# Source: https://github.com/drikster80/vllm/tree/gh200-docker
# gh200: "drikster80/vllm-gh200-openai:v0.6.4.post1"
gh200: "substratusai/vllm-gh200-openai:v0.6.4.post1"
# upstream vLLM seems to have broken ROCm support, so we are using a fork from AMD.
# Source: https://hub.docker.com/r/rocm/vllm-dev
# Source: https://github.com/ROCm/vllm
amd-gpu: substratusai/vllm-rocm:nightly_main_20250117
OLlama:
images:
default: "ollama/ollama:latest"
@@ -177,6 +181,15 @@ resourceProfiles:
operator: "Equal"
value: "present"
effect: "NoSchedule"
amd-gpu-mi300x:
imageName: "amd-gpu"
limits:
amd.com/gpu: "1"
tolerations:
- key: "amd.com/gpu"
operator: "Equal"
value: "present"
effect: "NoSchedule"

cacheProfiles: {}

48 changes: 46 additions & 2 deletions charts/models/values.yaml
@@ -193,6 +193,28 @@ catalog:
# You can also use nvidia-gpu-a100-80gb:8
resourceProfile: nvidia-gpu-h100:8
targetRequests: 500
llama-3.1-70b-instruct-fp8-mi300x:
enabled: false
features: [TextGeneration]
url: hf://amd/Llama-3.1-70B-Instruct-FP8-KV
engine: VLLM
env:
HIP_FORCE_DEV_KERNARG: "1"
NCCL_MIN_NCHANNELS: "112"
TORCH_BLAS_PREFER_HIPBLASLT: "1"
VLLM_USE_TRITON_FLASH_ATTN: "0"
args:
- --max-model-len=120000
- --max-num-batched-token=120000
- --max-num-seqs=1024
- --num-scheduler-steps=15
- --gpu-memory-utilization=0.9
- --disable-log-requests
- --kv-cache-dtype=fp8
- --enable-chunked-prefill=false
- --max-seq-len-to-capture=16384
resourceProfile: amd-gpu-mi300x:1
targetRequests: 1024
llama-3.1-70b-instruct-fp8-gh200:
enabled: false
features: [TextGeneration]
@@ -225,6 +247,7 @@ catalog:
resourceProfile: nvidia-gpu-gh200:1
targetRequests: 50
llama-3.1-405b-instruct-fp8-a100-80b:
enabled: false
features: [TextGeneration]
url: hf://neuralmagic/Meta-Llama-3.1-405B-Instruct-FP8
engine: VLLM
@@ -243,9 +266,30 @@
- --enable-chunked-prefill=false
- --num-scheduler-steps=8
targetRequests: 128
minReplicas: 1
maxReplicas: 1
resourceProfile: nvidia-gpu-a100-80gb:8
llama-3.1-405b-instruct-fp8-mi300x:
enabled: false
features: [TextGeneration]
url: hf://amd/Llama-3.1-405B-Instruct-FP8-KV
engine: VLLM
env:
HIP_FORCE_DEV_KERNARG: "1"
NCCL_MIN_NCHANNELS: "112"
TORCH_BLAS_PREFER_HIPBLASLT: "1"
VLLM_USE_TRITON_FLASH_ATTN: "0"
args:
- --max-model-len=120000
- --max-num-batched-token=120000
- --max-num-seqs=1024
- --num-scheduler-steps=15
- --tensor-parallel-size=8
- --gpu-memory-utilization=0.90
- --disable-log-requests
- --kv-cache-dtype=fp8
- --enable-chunked-prefill=false
- --max-seq-len-to-capture=16384
resourceProfile: amd-gpu-mi300x:8
targetRequests: 1024
llama-3.3-70b-instruct-bf16-gh200:
enabled: false
features: [TextGeneration]
31 changes: 29 additions & 2 deletions docs/installation/any.md
@@ -38,9 +38,9 @@ Optionally, inspect the values file to see the default resourceProfiles:
helm show values kubeai/kubeai > values.yaml
```

## Installation using GPUs
## Installation using NVIDIA GPUs

This section assumes you have a Kubernetes cluster with GPU resources available and
This section assumes you have a Kubernetes cluster with NVIDIA GPU resources available and
installed the NVIDIA device plugin that adds GPU information labels to the nodes.

This time we need to use custom resource profiles that define the nodeSelectors
@@ -65,6 +65,33 @@ helm upgrade --install kubeai kubeai/kubeai \
--wait
```

## Installation using AMD GPUs

This section assumes you have a Kubernetes cluster with AMD GPU resources available and
installed the AMD device plugin that adds GPU information labels to the nodes.

This time we need to use custom resource profiles that define the nodeSelectors
for different GPU types.

Download the values file for the AMD GPU operator:

```bash
curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-amd-gpu-device-plugin.yaml
```

You likely will not need to modify the `values-amd-gpu-device-plugin.yaml` file.
However, do inspect it to ensure the GPU resourceProfile nodeSelectors match
the labels on your nodes.
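
For example, a quick way to list those labels (assuming the AMD device plugin or GPU
operator has already labeled your nodes) is:

```bash
# Show the AMD GPU labels referenced by the amd-gpu-mi300x resourceProfile.
kubectl get nodes -L amd.com/gpu.device-id -L amd.com/gpu.vram -L amd.com/gpu.family
```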


Install KubeAI using the custom resourceProfiles:
```bash
helm upgrade --install kubeai kubeai/kubeai \
-f values-amd-gpu-device-plugin.yaml \
--set secrets.huggingface.token=$HF_TOKEN \
--wait
```
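
Once the chart is installed, you can enable one of the new MI300X model presets. As a
sketch (assuming the pre-generated manifest from this commit is available on the `main`
branch, and using the chart's default `kubeai` service on port 80), the 70B model can be
deployed and smoke-tested like this:

```bash
# Deploy the 70B FP8 preset for a single MI300X directly from the repo manifests.
kubectl apply -f https://raw.githubusercontent.com/substratusai/kubeai/main/manifests/models/llama-3.1-70b-instruct-fp8-mi300x.yaml

# Port-forward the KubeAI service and send a test request to the
# OpenAI-compatible endpoint.
kubectl port-forward svc/kubeai 8000:80 &
curl http://localhost:8000/openai/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-3.1-70b-instruct-fp8-mi300x",
    "messages": [{"role": "user", "content": "Hello!"}]
  }'
```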

## Deploying models

Take a look at the following how-to guides to deploy models:
2 changes: 0 additions & 2 deletions manifests/models/llama-3.1-405b-instruct-fp8-a100-80b.yaml
@@ -21,7 +21,5 @@ spec:
- --num-scheduler-steps=8
env:
VLLM_ATTENTION_BACKEND: FLASHINFER
minReplicas: 1
maxReplicas: 1
targetRequests: 128
resourceProfile: nvidia-gpu-a100-80gb:8
27 changes: 27 additions & 0 deletions manifests/models/llama-3.1-405b-instruct-fp8-mi300x.yaml
@@ -0,0 +1,27 @@
# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
name: llama-3.1-405b-instruct-fp8-mi300x
spec:
features: [TextGeneration]
url: hf://amd/Llama-3.1-405B-Instruct-FP8-KV
engine: VLLM
args:
- --max-model-len=120000
- --max-num-batched-token=120000
- --max-num-seqs=1024
- --num-scheduler-steps=15
- --tensor-parallel-size=8
- --gpu-memory-utilization=0.90
- --disable-log-requests
- --kv-cache-dtype=fp8
- --enable-chunked-prefill=false
- --max-seq-len-to-capture=16384
env:
HIP_FORCE_DEV_KERNARG: "1"
NCCL_MIN_NCHANNELS: "112"
TORCH_BLAS_PREFER_HIPBLASLT: "1"
VLLM_USE_TRITON_FLASH_ATTN: "0"
targetRequests: 1024
resourceProfile: amd-gpu-mi300x:8
26 changes: 26 additions & 0 deletions manifests/models/llama-3.1-70b-instruct-fp8-mi300x.yaml
@@ -0,0 +1,26 @@
# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
name: llama-3.1-70b-instruct-fp8-mi300x
spec:
features: [TextGeneration]
url: hf://amd/Llama-3.1-70B-Instruct-FP8-KV
engine: VLLM
args:
- --max-model-len=120000
- --max-num-batched-token=120000
- --max-num-seqs=1024
- --num-scheduler-steps=15
- --gpu-memory-utilization=0.9
- --disable-log-requests
- --kv-cache-dtype=fp8
- --enable-chunked-prefill=false
- --max-seq-len-to-capture=16384
env:
HIP_FORCE_DEV_KERNARG: "1"
NCCL_MIN_NCHANNELS: "112"
TORCH_BLAS_PREFER_HIPBLASLT: "1"
VLLM_USE_TRITON_FLASH_ATTN: "0"
targetRequests: 1024
resourceProfile: amd-gpu-mi300x:1
