-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathvllm-4gpu-runtime.yaml
36 lines (36 loc) · 988 Bytes
/
vllm-4gpu-runtime.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
---
# KServe ServingRuntime that launches vLLM's OpenAI-compatible API server
# (python -m vllm.entrypoints.openai.api_server) for a single model, with
# tensor parallelism set to 4 (one shard per GPU, per the display name).
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  annotations:
    # The value is a JSON array encoded as a string, so it must stay quoted.
    opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
    openshift.io/display-name: vLLM 4 GPU
  labels:
    # Label values must be strings; quoted so it is not parsed as a boolean.
    opendatahub.io/dashboard: "true"
  name: vllm-runtime-4-gpu
spec:
  annotations:
    # Scrape settings; the port matches --port / containerPort below.
    # Quoted because annotation values must be strings, not integers.
    prometheus.io/path: /metrics
    prometheus.io/port: "8080"
  containers:
    - args:
        - --port=8080
        # NOTE(review): presumably the path where the model storage is
        # mounted into the container -- confirm against the deployment.
        - --model=/mnt/models
        # {{.Name}} is a template placeholder; presumably replaced with the
        # name of the deployed model/InferenceService -- confirm.
        - --served-model-name={{.Name}}
        - --distributed-executor-backend=mp
        # Keep in sync with the number of GPUs actually allocated to the pod.
        - --tensor-parallel-size=4
      command:
        - python
        - -m
        - vllm.entrypoints.openai.api_server
      env:
        - name: HF_HOME
          value: /tmp/hf_home
      # Image pinned by digest rather than by a mutable tag.
      image: quay.io/modh/vllm@sha256:60f335015eff8c99508ff421c80f5f7b23b1310d87b0d4086b6f76f9a136b5a4
      name: kserve-container
      ports:
        - containerPort: 8080
          protocol: TCP
  multiModel: false
  supportedModelFormats:
    - autoSelect: true
      name: vLLM