Skip to content

Commit

Permalink
deploy: add initial vllm worker chart
Browse files Browse the repository at this point in the history
This change adds the initial version of the vLLM worker Helm chart.

This is NOT the final values schema.

Includes schema validation and embedded chart scripting to enable default values.
  • Loading branch information
whoisj committed Jan 21, 2025
1 parent 6f7815a commit d497e15
Show file tree
Hide file tree
Showing 3 changed files with 289 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,24 @@
{{- fail "Property '.image' is required." }}
{{- end }}
{{- $component_name := "" }}
{{- $instance_count := 1 }}
{{- $instance_parallelism := 1 }}
{{- $k8s_liveness_delay := 10 }}
{{- $k8s_liveness_enabled := true }}
{{- $k8s_liveness_fail := 15 }}
{{- $k8s_liveness_period := 2 }}
{{- $k8s_liveness_success := 1 }}
{{- $k8s_readiness_delay := 10 }}
{{- $k8s_readiness_enabled := true }}
{{- $k8s_readiness_fail := 15 }}
{{- $k8s_readiness_period := 2 }}
{{- $k8s_readiness_success := 1 }}
{{- $manifold_kind := "nats-io" }}
{{- $manifold_name := "triton-distributed-manifold" }}
{{- $manifold_port := 4222 }}
{{- $parallel_pipeline := 1 }}
{{- $parallel_tensor := 1 }}
{{- $parallel_world := 1 }}
{{- $port_data := 9346 }}
{{- $port_health := 8000 }}
{{- $port_metrics := 9347 }}
Expand All @@ -32,6 +47,40 @@
{{- $triton_logging_verbose := 0 }}
{{- $triton_memory := "16Gi" }}
{{- $triton_shared := "512Mi" }}
{{- with $.Values.kubernetes }}
{{- with .checks }}
{{- with .liveness }}
{{- $k8s_liveness_enabled = ne false .enabled }}
{{- with .failureThreshold }}
{{- $k8s_liveness_fail = (int .) }}
{{- end }}
{{- with .initialDelaySeconds }}
{{- $k8s_liveness_delay = (int .) }}
{{- end }}
{{- with .periodSeconds }}
{{- $k8s_liveness_period = (int .) }}
{{- end }}
{{- with .successThreshold }}
{{- $k8s_liveness_success = (int .) }}
{{- end }}
{{- end }}
{{- with .readiness }}
{{- $k8s_readiness_enabled = ne false .enabled }}
{{- with .failureThreshold }}
{{- $k8s_readiness_fail = (int .) }}
{{- end }}
{{- with .initialDelaySeconds }}
{{- $k8s_readiness_delay = (int .) }}
{{- end }}
{{- with .periodSeconds }}
{{- $k8s_readiness_period = (int .) }}
{{- end }}
{{- with .successThreshold }}
{{- $k8s_readiness_success = (int .) }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- with $.Values.triton }}
{{- $component_name = required "Property '.triton.componentName' is required." .componentName }}
{{- with .distributed }}
Expand All @@ -47,6 +96,20 @@
{{- end }}
{{- end }}
{{- end }}
{{- with .instance }}
{{- with .count }}
{{- $instance_count = (int .) }}
{{- end }}
{{- with .parallelism }}
{{- with .pipeline }}
{{- $parallel_pipeline = (int .) }}
{{- end }}
{{- with .tensor }}
{{- $parallel_tensor = (int .) }}
{{- end }}
{{- $parallel_world = mul $parallel_pipeline $parallel_tensor }}
{{- end }}
{{- end }}
{{- with .logging }}
{{- if .useIso8601 }}
{{- $triton_logging_iso8601 = 1 }}
Expand Down Expand Up @@ -107,7 +170,7 @@ spec:
selector:
matchLabels:
app: {{ $.Release.Name }}
replicas: 1
replicas: {{ $instance_count }}
template:
metadata:
labels:
Expand Down Expand Up @@ -136,7 +199,7 @@ spec:
- key: nvidia.com/gpu.product
operator: In
values:
{{ toYaml . | indent 16 }}
{{ toYaml . | indent 16 }}
{{- end }}
{{- end }}
{{- end }}
Expand All @@ -154,6 +217,10 @@ spec:
{{- end }}
env:
- TRITON_COMPONENT_NAME: {{ $component_name }}
{{- if gt $parallel_world 1 }}
- TRITON_LLM_PP: {{ $parallel_pipeline }}
- TRITON_LLM_TP: {{ $parallel_tensor }}
{{- end }}
- TRITON_MANIFOLD_KIND: {{ $manifold_kind }}
- TRITON_MANIFOLD_NAME: {{ $manifold_name }}
- TRITON_MANIFOLD_PORT: {{ $manifold_port }}
Expand Down Expand Up @@ -185,14 +252,16 @@ spec:
{{- end }}
image: {{ $triton_image_name }}
imagePullPolicy: IfNotPresent
{{- if $k8s_liveness_enabled }}
livenessProbe:
failureThreshold: 15
failureThreshold: {{ $k8s_liveness_fail }}
httpGet:
path: /v2/health/live
port: {{ $port_health }}
initialDelaySeconds: 10
periodSeconds: 2
successThreshold: 1
initialDelaySeconds: {{ $k8s_liveness_delay }}
periodSeconds: {{ $k8s_liveness_period }}
successThreshold: {{ $k8s_liveness_success }}
{{- end }}
ports:
- containerPort: {{ $port_health }}
name: health
Expand All @@ -202,14 +271,16 @@ spec:
name: data
- containerPort: {{ $port_metrics }}
name: metrics
{{- if $k8s_readiness_enabled }}
readinessProbe:
failureThreshold: 15
failureThreshold: {{ $k8s_readiness_fail }}
httpGet:
path: /v2/health/ready
port: {{ $port_health }}
initialDelaySeconds: 15
periodSeconds: 2
successThreshold: 1
initialDelaySeconds: {{ $k8s_readiness_delay }}
periodSeconds: {{ $k8s_readiness_period }}
successThreshold: {{ $k8s_readiness_success }}
{{- end }}
resources:
limits:
cpu: {{ $triton_cpu }}
Expand Down
169 changes: 169 additions & 0 deletions deploy/Kubernetes/worker/charts/vllm/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,127 @@
"kubernetes": {
"description": "Configuration options related to the Kubernetes objects created by the chart.",
"properties": {
"checks": {
"description": "Optional configuration options controlling how the cluster monitors the health of Triton Distributed Worker deployment(s).",
"properties": {
"liveness": {
"description": "Configuration options related to how the cluster determines that a Triton Distributed Worker instance is \"alive\" and responsive.",
"properties": {
"enabled": {
"description": "When `true`, the cluster will actively determine if the pod is alive; otherwise the cluster will always assume the pod is alive.",
"oneOf": [
{ "type": "boolean" },
{ "type": "null" }
]
},
"failureThreshold": {
"description": "Number of failed responses required to determine a pod is not responsive (aka not \"alive\").",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
},
"initialDelaySeconds": {
"description": "Minimum wait before the cluster begins to attempt to determine the health of the pod.",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
},
"periodSeconds": {
"description": "Minimum period between attempts to determine the health of the pod.",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
},
"successThreshold": {
"description": "Number of successful responses required to determine that a pod is healthy.",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
}
},
"oneOf": [
{ "type": "object" },
{ "type": "null" }
]
},
"readiness": {
"description": "Configuration options related to how the cluster determines that a Triton Distributed Worker instance is ready.",
"properties": {
"enabled": {
"description": "When `true`, the cluster will actively determine if the pod is ready; otherwise the cluster will always assume the pod is ready.",
"oneOf": [
{ "type": "boolean" },
{ "type": "null" }
]
},
"failureThreshold": {
"description": "Number of failed responses required to determine a pod is not \"ready\".",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
},
"initialDelaySeconds": {
"description": "Minimum wait before the cluster begins to attempt to determine the readiness of the pod.",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
},
"periodSeconds": {
"description": "Minimum period between attempts to determine the readiness of the pod.",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
},
"successThreshold": {
"description": "Number of successful responses required to determine that a pod is ready.",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
}
},
"oneOf": [
{ "type": "object" },
{ "type": "null" }
]
}
},
"oneOf": [
{ "type": "object" },
{ "type": "null" }
]
},
"labels": {
"description": "Optional set of labels to be applied to created Kubernetes objects. These labels can be used for association with a preexisting service object.",
"oneOf": [
Expand Down Expand Up @@ -170,6 +291,54 @@
{ "type": "null" }
]
},
"instance": {
"description": "Optional configuration options related to how many Triton Distributed Worker pods are deployed.",
"properties": {
"count": {
"description": "Number of worker instances (whole model) to be deployed as part of this helm chart.",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
},
"parallelism": {
"description": "Optional configuration options related to how work for a single model is spread across multiple pods. When the product of `pipeline`*`tensor` is greater than 1, multiple pods will be deployed as a single logical worker.",
"properties": {
"pipeline": {
"description": "Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a subset of layers that is executed on a separate device.",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
},
"tensor": {
"description": "Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller, independent blocks of computation that can be executed on different devices.",
"oneOf": [
{
"minimum": 1,
"type": "integer"
},
{ "type": "null" }
]
}
},
"oneOf": [
{ "type": "object" },
{ "type": "null" }
]
}
},
"oneOf": [
{ "type": "object" },
{ "type": "null" }
]
},
"logging": {
"description": "Logging configuration options specific to Triton Distributed Worker.",
"properties": {
Expand Down
39 changes: 39 additions & 0 deletions deploy/Kubernetes/worker/charts/vllm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,32 @@ image: # (required)

# Configuration options related to the Kubernetes objects created by the chart.
kubernetes: # (optional)
# Optional configuration options controlling how the cluster monitors the health of Triton Distributed Worker deployment(s).
checks:
# Configuration options related to how the cluster determines that a Triton Distributed Worker instance is "alive" and responsive.
liveness:
# When `true`, the cluster will actively determine if the pod is alive; otherwise the cluster will always assume the pod is alive.
enabled: # (default true)
# Number of failed responses required to determine a pod is not responsive (aka not "alive").
failureThreshold: # (default 15)
# Minimum wait before the cluster begins to attempt to determine the health of the pod.
initialDelaySeconds: # (default 10)
# Minimum period between attempts to determine the health of the pod.
periodSeconds: # (default 2)
# Number of successful responses required to determine that a pod is healthy.
successThreshold: # (default 1)
# Configuration options related to how the cluster determines that a Triton Distributed Worker instance is ready.
readiness:
# When `true`, the cluster will actively determine if the pod is ready; otherwise the cluster will always assume the pod is ready.
enabled: # (default true)
# Number of failed responses required to determine a pod is not "ready".
failureThreshold: # (default 15)
# Minimum wait before the cluster begins to attempt to determine the readiness of the pod.
initialDelaySeconds: # (default 10)
# Minimum period between attempts to determine the readiness of the pod.
periodSeconds: # (default 2)
# Number of successful responses required to determine that a pod is ready.
successThreshold: # (default 1)
# Optional set of labels to be applied to created Kubernetes objects.
# These labels can be used for association with a preexisting service object.
labels: # (optional)
Expand Down Expand Up @@ -61,6 +87,19 @@ triton: # (required)
serviceName: # (default triton-distributed-manifold)
# Networking port to be used to interact with the Triton Distributed Manifold.
servicePort: # (default 4222)
# Optional configuration options related to how many Triton Distributed Worker pods are deployed.
instance:
# Number of worker instances (whole model) to be deployed as part of this helm chart.
count: # (default 1)
# Optional configuration options related to how work for a single model is spread across multiple pods.
# When the product of `pipeline`*`tensor` is greater than 1, multiple pods will be deployed as a single logical worker.
parallelism:
# Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a
# subset of layers that is executed on a separate device.
pipeline: # (default 1)
# Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller,
# independent blocks of computation that can be executed on different devices.
tensor: # (default 1)
# Logging configuration options specific to Triton Distributed Worker.
logging: # (optional)
# When `true` Triton Distributed Worker logs are formatted using the ISO8601 standard; otherwise Triton's default format will be used.
Expand Down

0 comments on commit d497e15

Please sign in to comment.