add model config for Deepseek R1 (#371)
samos123 authored Jan 22, 2025
1 parent 2673746 commit 7aff5aa
Showing 3 changed files with 57 additions and 1 deletion.
2 changes: 1 addition & 1 deletion charts/kubeai/values.yaml
@@ -51,7 +51,7 @@ modelServers:
       # upstream vLLM seems to have broken ROCm support, so we are using a fork from AMD.
       # Source: https://hub.docker.com/r/rocm/vllm-dev
       # Source: https://github.com/ROCm/vllm
-      amd-gpu: substratusai/vllm-rocm:nightly_main_20250117
+      amd-gpu: substratusai/vllm-rocm:nightly_main_20250120
   OLlama:
     images:
       default: "ollama/ollama:latest"
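The change above bumps the AMD GPU image for vLLM to a newer nightly of AMD's ROCm fork. For anyone tracking this from their own install, a minimal sketch of pinning the same tag via a Helm values override (the modelServers.VLLM.images.amd-gpu path is taken from the diff; the file name values-override.yaml and the release/chart names are illustrative assumptions):

    # values-override.yaml -- hypothetical override file for the kubeai chart
    modelServers:
      VLLM:
        images:
          # Pin the AMD ROCm fork nightly used by this commit.
          amd-gpu: substratusai/vllm-rocm:nightly_main_20250120

Something like helm upgrade --install kubeai kubeai/kubeai -f values-override.yaml would then roll the new image out, assuming the chart is installed from the project's Helm repository.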
27 changes: 27 additions & 0 deletions charts/models/values.yaml
@@ -307,6 +307,33 @@ catalog:
       - --disable-log-requests
     resourceProfile: nvidia-gpu-gh200:1
     targetRequests: 200
+  deepseek-r1-mi300x:
+    enabled: false
+    features: [TextGeneration]
+    url: hf://deepseek-ai/DeepSeek-R1
+    engine: VLLM
+    env:
+      HIP_FORCE_DEV_KERNARG: "1"
+      NCCL_MIN_NCHANNELS: "112"
+      TORCH_BLAS_PREFER_HIPBLASLT: "1"
+      VLLM_USE_TRITON_FLASH_ATTN: "0"
+      VLLM_FP8_PADDING: "0"
+    args:
+      - --trust-remote-code
+      # Currently only context lengths <= 32k are supported.
+      # See: https://github.com/ROCm/vllm/issues/375
+      - --max-model-len=32768
+      - --max-num-batched-tokens=32768
+      - --max-num-seqs=1024
+      - --num-scheduler-steps=10
+      - --tensor-parallel-size=8
+      - --gpu-memory-utilization=0.90
+      - --disable-log-requests
+      - --enable-chunked-prefill=false
+      - --max-seq-len-to-capture=16384
+      - --kv-cache-dtype=fp8
+    resourceProfile: amd-gpu-mi300x:8
+    targetRequests: 1024
   nomic-embed-text-cpu:
     enabled: false
     features: ["TextEmbedding"]
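Note that the new catalog entry lands with enabled: false, so nothing is deployed by default. A minimal sketch of switching it on through the models chart (the catalog.<name>.enabled toggle mirrors the diff; the file and release names are assumptions):

    # models-values.yaml -- hypothetical values file for the models chart
    catalog:
      deepseek-r1-mi300x:
        enabled: true

Installed with something like helm install kubeai-models kubeai/models -f models-values.yaml. The env block (HIP_FORCE_DEV_KERNARG, NCCL_MIN_NCHANNELS, TORCH_BLAS_PREFER_HIPBLASLT, and the two VLLM_* switches) matches AMD's published MI300X tuning guidance for vLLM and rides along unchanged when the entry is enabled.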
29 changes: 29 additions & 0 deletions manifests/models/deepseek-r1-mi300x.yaml
@@ -0,0 +1,29 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: deepseek-r1-mi300x
+spec:
+  features: [TextGeneration]
+  url: hf://deepseek-ai/DeepSeek-R1
+  engine: VLLM
+  args:
+    - --trust-remote-code
+    - --max-model-len=32768
+    - --max-num-batched-tokens=32768
+    - --max-num-seqs=1024
+    - --num-scheduler-steps=10
+    - --tensor-parallel-size=8
+    - --gpu-memory-utilization=0.90
+    - --disable-log-requests
+    - --enable-chunked-prefill=false
+    - --max-seq-len-to-capture=16384
+    - --kv-cache-dtype=fp8
+  env:
+    HIP_FORCE_DEV_KERNARG: "1"
+    NCCL_MIN_NCHANNELS: "112"
+    TORCH_BLAS_PREFER_HIPBLASLT: "1"
+    VLLM_FP8_PADDING: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+  targetRequests: 1024
+  resourceProfile: amd-gpu-mi300x:8
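Once this manifest is applied, KubeAI should serve the model through its OpenAI-compatible API. A rough smoke test, assuming the defaults from the KubeAI docs (a kubeai Service on port 80; the /openai path and port-forward target are not part of this commit):

    kubectl apply -f manifests/models/deepseek-r1-mi300x.yaml
    kubectl port-forward svc/kubeai 8000:80
    curl http://localhost:8000/openai/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "deepseek-r1-mi300x", "messages": [{"role": "user", "content": "Say hello."}]}'

Expect the first request to be slow if the model scales up from zero: the 8-way tensor-parallel vLLM server has to schedule onto an MI300X node and load the full DeepSeek-R1 weights before it can answer.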
