add model config for Deepseek R1 (#371)
samos123 authored Jan 22, 2025
1 parent 2673746 commit 7aff5aa
Showing 3 changed files with 57 additions and 1 deletion.
2 changes: 1 addition & 1 deletion charts/kubeai/values.yaml
@@ -51,7 +51,7 @@ modelServers:
       # upstream vLLM seems to have broken ROCm support, so we are using a fork from AMD.
       # Source: https://hub.docker.com/r/rocm/vllm-dev
       # Source: https://github.com/ROCm/vllm
-      amd-gpu: substratusai/vllm-rocm:nightly_main_20250117
+      amd-gpu: substratusai/vllm-rocm:nightly_main_20250120
   OLlama:
     images:
       default: "ollama/ollama:latest"
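The change above bumps the AMD GPU image for vLLM to a newer nightly of AMD's ROCm fork. For anyone tracking this from their own install, a minimal sketch of pinning the same tag via a Helm values override (the modelServers.VLLM.images.amd-gpu path is taken from the diff; the file name values-override.yaml and the release/chart names are illustrative assumptions):

    # values-override.yaml -- hypothetical override file for the kubeai chart
    modelServers:
      VLLM:
        images:
          # Pin the AMD ROCm fork nightly used by this commit.
          amd-gpu: substratusai/vllm-rocm:nightly_main_20250120

Something like helm upgrade --install kubeai kubeai/kubeai -f values-override.yaml would then roll the new image out, assuming the chart is installed from the project's Helm repository.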
27 changes: 27 additions & 0 deletions charts/models/values.yaml
@@ -307,6 +307,33 @@ catalog:
       - --disable-log-requests
     resourceProfile: nvidia-gpu-gh200:1
     targetRequests: 200
+  deepseek-r1-mi300x:
+    enabled: false
+    features: [TextGeneration]
+    url: hf://deepseek-ai/DeepSeek-R1
+    engine: VLLM
+    env:
+      HIP_FORCE_DEV_KERNARG: "1"
+      NCCL_MIN_NCHANNELS: "112"
+      TORCH_BLAS_PREFER_HIPBLASLT: "1"
+      VLLM_USE_TRITON_FLASH_ATTN: "0"
+      VLLM_FP8_PADDING: "0"
+    args:
+      - --trust-remote-code
+      # Currently only context lengths <= 32k are supported.
+      # See: https://github.com/ROCm/vllm/issues/375
+      - --max-model-len=32768
+      - --max-num-batched-tokens=32768
+      - --max-num-seqs=1024
+      - --num-scheduler-steps=10
+      - --tensor-parallel-size=8
+      - --gpu-memory-utilization=0.90
+      - --disable-log-requests
+      - --enable-chunked-prefill=false
+      - --max-seq-len-to-capture=16384
+      - --kv-cache-dtype=fp8
+    resourceProfile: amd-gpu-mi300x:8
+    targetRequests: 1024
   nomic-embed-text-cpu:
     enabled: false
     features: ["TextEmbedding"]
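Note that the new catalog entry lands with enabled: false, so nothing is deployed by default. A minimal sketch of switching it on through the models chart (the catalog.<name>.enabled toggle mirrors the diff; the file and release names are assumptions):

    # models-values.yaml -- hypothetical values file for the models chart
    catalog:
      deepseek-r1-mi300x:
        enabled: true

Installed with something like helm install kubeai-models kubeai/models -f models-values.yaml. The env block (HIP_FORCE_DEV_KERNARG, NCCL_MIN_NCHANNELS, TORCH_BLAS_PREFER_HIPBLASLT, and the two VLLM_* switches) matches AMD's published MI300X tuning guidance for vLLM and rides along unchanged when the entry is enabled.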
29 changes: 29 additions & 0 deletions manifests/models/deepseek-r1-mi300x.yaml
@@ -0,0 +1,29 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: deepseek-r1-mi300x
+spec:
+  features: [TextGeneration]
+  url: hf://deepseek-ai/DeepSeek-R1
+  engine: VLLM
+  args:
+    - --trust-remote-code
+    - --max-model-len=32768
+    - --max-num-batched-tokens=32768
+    - --max-num-seqs=1024
+    - --num-scheduler-steps=10
+    - --tensor-parallel-size=8
+    - --gpu-memory-utilization=0.90
+    - --disable-log-requests
+    - --enable-chunked-prefill=false
+    - --max-seq-len-to-capture=16384
+    - --kv-cache-dtype=fp8
+  env:
+    HIP_FORCE_DEV_KERNARG: "1"
+    NCCL_MIN_NCHANNELS: "112"
+    TORCH_BLAS_PREFER_HIPBLASLT: "1"
+    VLLM_FP8_PADDING: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+  targetRequests: 1024
+  resourceProfile: amd-gpu-mi300x:8
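Once this manifest is applied, KubeAI should serve the model through its OpenAI-compatible API. A rough smoke test, assuming the defaults from the KubeAI docs (a kubeai Service on port 80; the /openai path and port-forward target are not part of this commit):

    kubectl apply -f manifests/models/deepseek-r1-mi300x.yaml
    kubectl port-forward svc/kubeai 8000:80
    curl http://localhost:8000/openai/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "deepseek-r1-mi300x", "messages": [{"role": "user", "content": "Say hello."}]}'

Expect the first request to be slow if the model scales up from zero: the 8-way tensor-parallel vLLM server has to schedule onto an MI300X node and load the full DeepSeek-R1 weights before it can answer.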
