deploy: add initial vllm worker chart

This change adds the initial version of the vLLM worker Helm chart. The values schema is NOT final. The chart includes schema validation and embedded template logic that supplies default values.
triton-inference-server · Jan 21, 2025 · 2daf46e · 2daf46e
1 parent 6f7815a
commit 2daf46e
Show file tree

Hide file tree

Showing 5 changed files with 438 additions and 81 deletions.
diff --git a/deploy/Kubernetes/worker/charts/vllm/Chart.yaml b/deploy/Kubernetes/worker/charts/vllm/Chart.yaml
@@ -14,7 +14,7 @@
 
 apiVersion: v2
 appVersion: 1.0.0
-description: Triton Distributed Worker Operator for vLLM
+description: Triton Distributed Worker for vLLM
 icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/[email protected]
 name: triton-distributed_worker-vllm
 version: 1.0.0
diff --git a/deploy/Kubernetes/worker/charts/vllm/templates/_helpers.tpl b/deploy/Kubernetes/worker/charts/vllm/templates/_helpers.tpl
@@ -0,0 +1,88 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Annotation Groups
+{{/*
+Default annotations: a `triton-distributed` marker built from the release name
+and the chart's appVersion ("0.0" when unset), followed by any annotations
+supplied under `.Values.kubernetes.annotations`.
+*/}}
+{{- define "annotations.default" }}
+{{-   $app_version := .Chart.AppVersion | default "0.0" }}
+{{-   $extra := (.Values.kubernetes | default dict).annotations }}
+triton-distributed: "{{ .Release.Name }}.{{ $app_version }}"
+{{-   if $extra }}
+{{      toYaml $extra }}
+{{-   end }}
+{{- end -}}
+
+{{/*
+Chart-level annotations: the quoted chart name under `helm.sh/chart`, then the
+output of the "annotations.default" template.
+*/}}
+{{- define "annotations.chart" }}
+helm.sh/chart: {{ quote .Chart.Name }}
+{{-   include "annotations.default" . }}
+{{- end -}}
+
+# Label Groups
+{{/*
+Default label group: instance, name, part-of and version labels, emitted in
+that order by the corresponding "label.*" templates.
+*/}}
+{{- define "labels.default" }}
+{{-   range (list "label.appInstance" "label.appName" "label.appPartOf" "label.appVersion") }}
+{{-     include . $ }}
+{{-   end }}
+{{- end -}}
+
+{{/*
+Chart label group: the default labels, then managed-by and chart labels, then
+any labels supplied under `.Values.kubernetes.labels`, and finally the release
+label.
+*/}}
+{{- define "labels.chart" }}
+{{-   include "labels.default" . }}
+{{-   include "label.appManagedBy" . }}
+{{-   include "label.chart" . }}
+{{-   $custom := (.Values.kubernetes | default dict).labels }}
+{{-   if $custom }}
+{{      toYaml $custom }}
+{{-   end }}
+{{-   include "label.release" . }}
+{{- end -}}
+
+# Label Values
+{{/*
+Emits the `app.kubernetes.io/instance` label from the release name.
+The value is quoted so that a release name YAML would otherwise read as a
+number (for example an all-digit name) is still rendered as a string.
+*/}}
+{{- define "label.appInstance" }}
+app.kubernetes.io/instance: {{ .Release.Name | quote }}
+{{- end }}
+
+{{/*
+Emits the `app.kubernetes.io/managed-by` label: the releasing service when it
+is set, otherwise "triton-distributed".
+*/}}
+{{- define "label.appManagedBy" }}
+app.kubernetes.io/managed-by: {{ .Release.Service | default "triton-distributed" }}
+{{- end }}
+
+{{/*
+Emits the `app.kubernetes.io/name` label from `.Values.triton.componentName`.
+Rendering fails with a descriptive message when the property is missing.
+*/}}
+{{- define "label.appName" }}
+{{- /* Fall back to an empty dict so a missing `.Values.triton` reaches `required` instead of a nil-pointer error. */}}
+{{-   $triton := .Values.triton | default dict }}
+app.kubernetes.io/name: {{ required "Property '.triton.componentName' is required." $triton.componentName | quote }}
+{{- end }}
+
+{{/*
+Emits the `app.kubernetes.io/part-of` label: `.Values.kubernetes.partOf` when
+set, otherwise "triton-distributed". The value is quoted because it is
+user-supplied and must reach Kubernetes as a string.
+*/}}
+{{- define "label.appPartOf" }}
+{{-   $part_of := (.Values.kubernetes | default dict).partOf | default "triton-distributed" }}
+app.kubernetes.io/part-of: {{ $part_of | quote }}
+{{- end }}
+
+{{/*
+Emits the `app.kubernetes.io/version` label from the chart version ("0.0" when
+unset). A `+` (SemVer build metadata) is replaced with `_` because `+` is not
+a legal character in a label value.
+NOTE(review): this uses .Chart.Version while "annotations.default" uses
+.Chart.AppVersion — confirm which one is intended here.
+*/}}
+{{- define "label.appVersion" }}
+app.kubernetes.io/version: {{ .Chart.Version | default "0.0" | replace "+" "_" | quote }}
+{{- end }}
+
+{{/*
+Emits the `helm.sh/chart` (chart name) and `helm.sh/version` (chart version,
+"0.0" when unset) labels. A `+` in the version is replaced with `_` because
+`+` is not a legal character in a label value.
+*/}}
+{{- define "label.chart" }}
+helm.sh/chart: {{ .Chart.Name | quote }}
+helm.sh/version: {{ .Chart.Version | default "0.0" | replace "+" "_" | quote }}
+{{- end }}
+
+{{/*
+Emits the `release` label as "<chart name>_v<chart version>" ("0.0" when the
+version is unset). A `+` in the version is replaced with `_` because `+` is
+not a legal character in a label value.
+*/}}
+{{- define "label.release" }}
+release: {{ printf "%s_v%s" .Chart.Name (.Chart.Version | default "0.0") | replace "+" "_" | quote }}
+{{- end }}
diff --git a/deploy/Kubernetes/worker/charts/vllm/templates/worker-deployment.yaml b/deploy/Kubernetes/worker/charts/vllm/templates/worker-deployment.yaml
@@ -19,32 +19,95 @@
 {{-   fail "Property '.image' is required." }}
 {{- end }}
 {{- $component_name := "" }}
-{{- $manifold_kind := "nats-io" }}
-{{- $manifold_name := "triton-distributed-manifold" }}
-{{- $manifold_port := 4222 }}
+{{- $instance_count := 1 }}
+{{- $k8s_liveness_delay := 10 }}
+{{- $k8s_liveness_enabled := true }}
+{{- $k8s_liveness_fail := 15 }}
+{{- $k8s_liveness_period := 2 }}
+{{- $k8s_liveness_success := 1 }}
+{{- $k8s_readiness_delay := 10 }}
+{{- $k8s_readiness_enabled := true }}
+{{- $k8s_readiness_fail := 15 }}
+{{- $k8s_readiness_period := 2 }}
+{{- $k8s_readiness_success := 1 }}
+{{- $request_plane_kind := "nats-io" }}
+{{- $request_plane_name := "triton-distributed_request-plane" }}
+{{- $request_plane_port := 4222 }}
+{{- $parallel_pipeline := 1 }}
+{{- $parallel_tensor := 1 }}
+{{- $parallel_world := 1 }}
 {{- $port_data := 9346 }}
 {{- $port_health := 8000 }}
 {{- $port_metrics := 9347 }}
 {{- $port_request := 9345 }}
 {{- $triton_cpu := 4 }}
 {{- $triton_ephemeral := "1Gi" }}
+{{- $triton_gpu := 1 }}
 {{- $triton_logging_iso8601 := 0 }}
 {{- $triton_logging_verbose := 0 }}
 {{- $triton_memory := "16Gi" }}
 {{- $triton_shared := "512Mi" }}
+{{- /*
+Probe overrides read from `.Values.kubernetes.checks.liveness` and
+`.Values.kubernetes.checks.readiness`.
+A probe stays enabled unless `enabled` is literally false. The thresholds and
+the period only accept non-zero overrides (`with` skips zero values), whereas
+`initialDelaySeconds` is taken whenever the key is present, so an explicit 0
+replaces the default delay.
+*/}}
+{{- with $.Values.kubernetes }}
+{{-   with .checks }}
+{{-     with .liveness }}
+{{-       $k8s_liveness_enabled = ne false .enabled }}
+{{-       with .failureThreshold }}
+{{-         $k8s_liveness_fail = (int .) }}
+{{-       end }}
+{{-       if hasKey . "initialDelaySeconds" }}
+{{-         $k8s_liveness_delay = (int .initialDelaySeconds) }}
+{{-       end }}
+{{-       with .periodSeconds }}
+{{-         $k8s_liveness_period = (int .) }}
+{{-       end }}
+{{-       with .successThreshold }}
+{{-         $k8s_liveness_success = (int .) }}
+{{-       end }}
+{{-     end }}
+{{-     with .readiness }}
+{{-       $k8s_readiness_enabled = ne false .enabled }}
+{{-       with .failureThreshold }}
+{{-         $k8s_readiness_fail = (int .) }}
+{{-       end }}
+{{-       if hasKey . "initialDelaySeconds" }}
+{{-         $k8s_readiness_delay = (int .initialDelaySeconds) }}
+{{-       end }}
+{{-       with .periodSeconds }}
+{{-         $k8s_readiness_period = (int .) }}
+{{-       end }}
+{{-       with .successThreshold }}
+{{-         $k8s_readiness_success = (int .) }}
+{{-       end }}
+{{-     end }}
+{{-   end }}
+{{- end }}
 {{- with $.Values.triton }}
 {{-   $component_name = required "Property '.triton.componentName' is required." .componentName }}
 {{-   with .distributed }}
 {{-     with .manifold }}
 {{-       with .serverKind }}
-{{-         $manifold_kind = . }}
+{{-         $request_plane_kind = . }}
 {{-       end }}
 {{-       with .serviceName }}
-{{-         $manifold_name = . }}
+{{-         $request_plane_name = . }}
 {{-       end }}
 {{-       with .servicePort }}
-{{-         $manifold_port = (int .) }}
+{{-         $request_plane_port = (int .) }}
+{{-       end }}
+{{-     end }}
+{{-   end }}
+{{-   with .instance }}
+{{- /* Instance overrides: `count` feeds the replica count; `parallelism.pipeline` and `parallelism.tensor` feed the parallelism degrees. `with` skips zero/empty values, so those leave the defaults in place. */}}
+{{-     with .count }}
+{{-       $instance_count = (int .) }}
+{{-     end }}
+{{-     with .parallelism }}
+{{-       with .pipeline }}
+{{-         $parallel_pipeline = (int .) }}
+{{-       end }}
+{{-       with .tensor }}
+{{-         $parallel_tensor = (int .) }}
 {{-       end }}
+{{- /* World size is the product of the pipeline and tensor degrees; it is only recomputed when a `parallelism` block is present. */}}
+{{-       $parallel_world = mul $parallel_pipeline $parallel_tensor }}
 {{-     end }}
 {{-   end }}
 {{-   with .logging }}
@@ -76,6 +139,9 @@
 {{-     with .ephemeral }}
 {{-       $triton_ephemeral = . }}
 {{-     end }}
+{{-     with .gpu }}
+{{- /* Only an explicit `count` overrides the default GPU count; a `gpu` block that sets just `product` keeps the default instead of collapsing to 0. */}}
+{{-       if hasKey . "count" }}
+{{-         $triton_gpu = (int .count) }}
+{{-       end }}
+{{-     end }}
 {{-     with .memory }}
 {{-       $triton_memory = . }}
 {{-     end }}
@@ -89,54 +155,52 @@
 {{- $model_repo_path := "/var/run/models" }}
 {{- with $.Values.modelRepository }}
 {{-   with .path }}
-{{-     $model_repo_path = . }}
+{{-     $model_repo_path = trimSuffix "/" . }}
 {{-   end }}
 {{- end }}
 apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: {{ $.Release.Name }}
+  annotations:
+{{- include "annotations.chart" . | indent 4 }}
   labels:
     app: {{ $.Release.Name }}
-{{- with $.Values.kubernetes }}
-{{-   with .labels }}
-{{      toYaml . | indent 4 }}
-{{-   end }}
-{{- end }}
+    app.kubernetes.io/component: worker
+{{- include "labels.chart" . | indent 4 }}
 spec:
   selector:
     matchLabels:
       app: {{ $.Release.Name }}
-  replicas: 1
+      app.kubernetes.io/component: worker
+  replicas: {{ $instance_count }}
   template:
     metadata:
+      annotations:
+{{- include "annotations.chart" . | indent 8 }}
       labels:
         app: {{ $.Release.Name }}
         app.kubernetes.io/component: worker
-        app.kubernetes.io/name: {{ $component_name }}
-        app.kubernetes.io/part-of: triton-distributed
-{{- with $.Values.kubernetes }}
-{{-   with .labels }}
-{{      toYaml . | indent 8 }}
-{{-   end }}
-{{- end }}
+{{- include "labels.chart" . | indent 8 }}
     spec:
+{{- if ne $triton_gpu 0 }}
       affinity:
         nodeAffinity:
-{{- with $.Values.triton }}
-{{-   with .resources }}
-{{-     with .gpu }}
-{{-       with .product }}
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
             - matchExpressions:
               - key: nvidia.com/gpu
                 operator: Exists
-{{-         if len . }}
+{{-   with $.Values.triton }}
+{{-     with .resources }}
+{{-       with .gpu }}
+{{-         with .product }}
+{{-           if len . }}
               - key: nvidia.com/gpu.product
                 operator: In
                 values:
-{{    toYaml . | indent 16 }}
+{{            toYaml . | indent 16 }}
+{{-           end }}
 {{-         end }}
 {{-       end }}
 {{-     end }}
@@ -154,9 +218,13 @@ spec:
 {{- end }}
         env:
         - TRITON_COMPONENT_NAME: {{ $component_name }}
-        - TRITON_MANIFOLD_KIND: {{ $manifold_kind }}
-        - TRITON_MANIFOLD_NAME: {{ $manifold_name }}
-        - TRITON_MANIFOLD_PORT: {{ $manifold_port }}
+{{- if gt $parallel_world 1 }}
+        - TRITON_LLM_PP: {{ $parallel_pipeline }}
+        - TRITON_LLM_TP: {{ $parallel_tensor }}
+{{- end }}
+        - TRITON_REQUEST_PLANE_KIND: {{ $request_plane_kind }}{{/* upper snake case like the other TRITON_* names; NOTE(review): Kubernetes env items are normally `name:`/`value:` pairs — confirm the shape of this list */}}
+        - TRITON_REQUEST_PLANE_NAME: {{ $request_plane_name }}
+        - TRITON_REQUEST_PLANE_PORT: {{ $request_plane_port }}
         - TRITON_MODEL_REPOSITORY: {{ $model_repo_path }}
 {{- if ne $port_data 9346 }}
         - TRITON_PORT_DATA: {{ $port_data }}
@@ -185,14 +253,16 @@ spec:
 {{- end }}
         image: {{ $triton_image_name }}
         imagePullPolicy: IfNotPresent
+{{- if $k8s_liveness_enabled }}
         livenessProbe:
-          failureThreshold: 15
+          failureThreshold: {{ $k8s_liveness_fail }}
           httpGet:
             path: /v2/health/live
             port: {{ $port_health }}
-          initialDelaySeconds: 10
-          periodSeconds: 2
-          successThreshold: 1
+          initialDelaySeconds: {{ $k8s_liveness_delay }}
+          periodSeconds: {{ $k8s_liveness_period }}
+          successThreshold: {{ $k8s_liveness_success }}
+{{- end }}
         ports:
         - containerPort: {{ $port_health }}
           name: health
@@ -202,55 +272,48 @@ spec:
           name: data
         - containerPort: {{ $port_metrics }}
           name: metrics
+{{- if $k8s_readiness_enabled }}
         readinessProbe:
-          failureThreshold: 15
+          failureThreshold: {{ $k8s_readiness_fail }}
           httpGet:
             path: /v2/health/ready
             port: {{ $port_health }}
-          initialDelaySeconds: 15
-          periodSeconds: 2
-          successThreshold: 1
+          initialDelaySeconds: {{ $k8s_readiness_delay }}
+          periodSeconds: {{ $k8s_readiness_period }}
+          successThreshold: {{ $k8s_readiness_success }}
+{{- end }}
         resources:
           limits:
             cpu: {{ $triton_cpu }}
             ephemeral-storage: {{ $triton_ephemeral }}
             memory: {{ $triton_memory }}
-{{- with $.Values.triton }}
-{{-   with .resources }}
-{{-     with .gpu }}
-{{-       $triton_gpu := 1 }}
-{{-       with .count }}
-{{-         $triton_gpu = (int .) }}
-{{-       end}}
+{{- if gt $triton_gpu 0 }}
             nvidia.com/gpu: {{ $triton_gpu }}
-{{-     end }}
-{{-   end }}
 {{- end }}
           requests:
-            cpu: {{ $triton_cpu }}
+            cpu: {{ $triton_cpu }}{{/* emitted verbatim, same as limits: a one-argument `mul` would coerce fractional or milli-CPU values to an integer */}}
             ephemeral-storage: {{ $triton_ephemeral }}
             memory: {{ $triton_memory }}
-{{- with $.Values.triton }}
-{{-   with .resources }}
-{{-     with .gpu }}
-{{-       $triton_gpu := 1 }}
-{{-       with .count }}
-{{-         $triton_gpu = (int .) }}
-{{-       end}}
+{{- if gt $triton_gpu 0 }}
             nvidia.com/gpu: {{ $triton_gpu }}
-{{-     end }}
-{{-   end }}
 {{- end }}
         volumeMounts:
 {{- with $.Values.modelRepository }}
 {{-   with .volumeMounts }}
 {{-     range . }}
+{{- /* Each entry mounts a named volume at the model repository root, or at `path` beneath it when `path` is set. */}}
 {{-       $mount_path := $model_repo_path }}
+{{-       $volume_name := required "Property '.modelRepository.volumeMounts[*].name' is required." .name }}
+{{-       if eq "shared-memory" $volume_name }}
+{{-         fail "Property '.modelRepository.volumeMounts[*].name' cannot be `shared-memory` because it is a reserved name." }}
+{{-       end }}
 {{-       with .path }}
-{{-         $mount_path = cat $model_repo_path "/" . }}
+{{-         $mount_path = printf "%s/%s" $model_repo_path (trimPrefix "/" .) }}
+{{- /* Reject only whole `..` path segments; a name that merely begins with `..` does not leave the repository directory. */}}
+{{-         if regexMatch "(^|/)\\.\\.(/|$)" $mount_path }}
+{{-           fail (printf "Value of property '.modelRepository.volumeMounts[*].path' `%s` is illegal because `%s` is not a sub-directory of `%s`." . (clean $mount_path) $model_repo_path) }}
+{{-         end }}
 {{-       end }}
         - mountPath: {{ $mount_path }}
-          name: {{ required "Property '.modelRepository.volumeMounts.name' is required." .name }}
+          name: {{ $volume_name }}
 {{-     end }}
 {{-   end }}
 {{- end }}