deploy: add initial vllm worker chart

This change adds the initial version of the vLLM worker Helm chart. This is NOT the final values schema. Includes schema validation and embedded chart scripting to enable default values.
triton-inference-server · Jan 21, 2025 · d497e15 · d497e15
1 parent 6f7815a
commit d497e15
Show file tree

Hide file tree

Showing 3 changed files with 289 additions and 10 deletions.
diff --git a/deploy/Kubernetes/worker/charts/vllm/templates/worker-deployment.yaml b/deploy/Kubernetes/worker/charts/vllm/templates/worker-deployment.yaml
@@ -19,9 +19,24 @@
 {{-   fail "Property '.image' is required." }}
 {{- end }}
 {{- $component_name := "" }}
+{{- $instance_count := 1 }}
+{{- $instance_parallelism := 1 }}
+{{- $k8s_liveness_delay := 10 }}
+{{- $k8s_liveness_enabled := true }}
+{{- $k8s_liveness_fail := 15 }}
+{{- $k8s_liveness_period := 2 }}
+{{- $k8s_liveness_success := 1 }}
+{{- $k8s_readiness_delay := 10 }}
+{{- $k8s_readiness_enabled := true }}
+{{- $k8s_readiness_fail := 15 }}
+{{- $k8s_readiness_period := 2 }}
+{{- $k8s_readiness_success := 1 }}
 {{- $manifold_kind := "nats-io" }}
 {{- $manifold_name := "triton-distributed-manifold" }}
 {{- $manifold_port := 4222 }}
+{{- $parallel_pipeline := 1 }}
+{{- $parallel_tensor := 1 }}
+{{- $parallel_world := 1 }}
 {{- $port_data := 9346 }}
 {{- $port_health := 8000 }}
 {{- $port_metrics := 9347 }}
@@ -32,6 +47,40 @@
 {{- $triton_logging_verbose := 0 }}
 {{- $triton_memory := "16Gi" }}
 {{- $triton_shared := "512Mi" }}
+{{- with $.Values.kubernetes }}
+{{-   with .checks }}
+{{-     with .liveness }}
+{{-       $k8s_liveness_enabled = ne false .enabled }}
+{{-       with .failureThreshold }}
+{{-         $k8s_liveness_fail = (int .) }}
+{{-       end }}
+{{-       with .initialDelaySeconds }}
+{{-         $k8s_liveness_delay = (int .) }}
+{{-       end }}
+{{-       with .periodSeconds }}
+{{-         $k8s_liveness_period = (int .) }}
+{{-       end }}
+{{-       with .successThreshold }}
+{{-         $k8s_liveness_success = (int .) }}
+{{-       end }}
+{{-     end }}
+{{-     with .readiness }}
+{{-       $k8s_readiness_enabled = ne false .enabled }}
+{{-       with .failureThreshold }}
+{{-         $k8s_readiness_fail = (int .) }}
+{{-       end }}
+{{-       with .initialDelaySeconds }}
+{{-         $k8s_readiness_delay = (int .) }}
+{{-       end }}
+{{-       with .periodSeconds }}
+{{-         $k8s_readiness_period = (int .) }}
+{{-       end }}
+{{-       with .successThreshold }}
+{{-         $k8s_readiness_success = (int .) }}
+{{-       end }}
+{{-     end }}
+{{-   end }}
+{{- end }}
 {{- with $.Values.triton }}
 {{-   $component_name = required "Property '.triton.componentName' is required." .componentName }}
 {{-   with .distributed }}
@@ -47,6 +96,20 @@
 {{-       end }}
 {{-     end }}
 {{-   end }}
+{{-   with .instance }}
+{{-     with .count }}
+{{-       $instance_count = (int .) }}
+{{-     end }}
+{{-     with .parallelism }}
+{{-       with .pipeline }}
+{{-         $parallel_pipeline = (int .) }}
+{{-       end }}
+{{-       with .tensor }}
+{{-         $parallel_tensor = (int .) }}
+{{-       end }}
+{{-       $parallel_world = mul $parallel_pipeline $parallel_tensor }}
+{{-     end }}
+{{-   end }}
 {{-   with .logging }}
 {{-     if .useIso8601 }}
 {{-       $triton_logging_iso8601 = 1 }}
@@ -107,7 +170,7 @@ spec:
   selector:
     matchLabels:
       app: {{ $.Release.Name }}
-  replicas: 1
+  replicas: {{ $instance_count }}
   template:
     metadata:
       labels:
@@ -136,7 +199,7 @@ spec:
               - key: nvidia.com/gpu.product
                 operator: In
                 values:
-{{    toYaml . | indent 16 }}
+{{            toYaml . | indent 16 }}
 {{-         end }}
 {{-       end }}
 {{-     end }}
@@ -154,6 +217,10 @@ spec:
 {{- end }}
         env:
         - TRITON_COMPONENT_NAME: {{ $component_name }}
+{{- if gt $parallel_world 1 }}
+        - TRITON_LLM_PP: {{ $parallel_pipeline }}
+        - TRITON_LLM_TP: {{ $parallel_tensor }}
+{{- end }}
         - TRITON_MANIFOLD_KIND: {{ $manifold_kind }}
         - TRITON_MANIFOLD_NAME: {{ $manifold_name }}
         - TRITON_MANIFOLD_PORT: {{ $manifold_port }}
@@ -185,14 +252,16 @@ spec:
 {{- end }}
         image: {{ $triton_image_name }}
         imagePullPolicy: IfNotPresent
+{{- if $k8s_liveness_enabled }}
         livenessProbe:
-          failureThreshold: 15
+          failureThreshold: {{ $k8s_liveness_fail }}
           httpGet:
             path: /v2/health/live
             port: {{ $port_health }}
-          initialDelaySeconds: 10
-          periodSeconds: 2
-          successThreshold: 1
+          initialDelaySeconds: {{ $k8s_liveness_delay }}
+          periodSeconds: {{ $k8s_liveness_period }}
+          successThreshold: {{ $k8s_liveness_success }}
+{{- end }}
         ports:
         - containerPort: {{ $port_health }}
           name: health
@@ -202,14 +271,16 @@ spec:
           name: data
         - containerPort: {{ $port_metrics }}
           name: metrics
+{{- if $k8s_readiness_enabled }}
         readinessProbe:
-          failureThreshold: 15
+          failureThreshold: {{ $k8s_readiness_fail }}
           httpGet:
             path: /v2/health/ready
             port: {{ $port_health }}
-          initialDelaySeconds: 15
-          periodSeconds: 2
-          successThreshold: 1
+          initialDelaySeconds: {{ $k8s_readiness_delay }}
+          periodSeconds: {{ $k8s_readiness_period }}
+          successThreshold: {{ $k8s_readiness_success }}
+{{- end }}
         resources:
           limits:
             cpu: {{ $triton_cpu }}

diff --git a/deploy/Kubernetes/worker/charts/vllm/values.schema.json b/deploy/Kubernetes/worker/charts/vllm/values.schema.json
@@ -46,6 +46,127 @@
     "kubernetes": {
       "description": "Configurations option related to the Kubernetes objects created by the chart.",
       "properties": {
+        "checks": {
+          "description": "Optional configuration options controlling how the cluster monitors the health of Triton Distributed Worker deployment(s).",
+          "properties": {
+            "liveness": {
+              "description": "Configuration options related to how the cluster determines that a Triton Distributed Worker instance is \"alive\" and responsive.",
+              "properties": {
+                "enabled": {
+                  "description": "When `true`, the cluster will actively determine if the pod is alive; otherwise the cluster will always assume the pod is alive.",
+                  "oneOf": [
+                    { "type": "boolean" },
+                    { "type": "null" }
+                  ]
+                },
+                "failureThreshold": {
+                  "description": "Number of failed responses required to determine that a pod is not responsive (i.e., no longer \"alive\").",
+                  "oneOf": [
+                    {
+                      "minimum": 1,
+                      "type": "integer"
+                    },
+                    { "type": "null" }
+                  ]
+                },
+                "initialDelaySeconds": {
+                  "description": "Minimum wait before the cluster begins to attempt to determine the health of the pod.",
+                  "oneOf": [
+                    {
+                      "minimum": 1,
+                      "type": "integer"
+                    },
+                    { "type": "null" }
+                  ]
+                },
+                "periodSeconds": {
+                  "description": "Minimum period between attempts to determine the health of the pod.",
+                  "oneOf": [
+                    {
+                      "minimum": 1,
+                      "type": "integer"
+                    },
+                    { "type": "null" }
+                  ]
+                },
+                "successThreshold": {
+                  "description": "Number of successful responses required to determine that a pod is healthy.",
+                  "oneOf": [
+                    {
+                      "minimum": 1,
+                      "type": "integer"
+                    },
+                    { "type": "null" }
+                  ]
+                }
+              },
+              "oneOf": [
+                { "type": "object" },
+                { "type": "null" }
+              ]
+            },
+            "readiness": {
+              "description": "Configuration options related to how the cluster determines that a Triton Distributed Worker instance is ready.",
+              "properties": {
+                "enabled": {
+                  "description": "When `true`, the cluster will actively determine if the pod is ready; otherwise the cluster will always assume the pod is ready.",
+                  "oneOf": [
+                    { "type": "boolean" },
+                    { "type": "null" }
+                  ]
+                },
+                "failureThreshold": {
+                  "description": "Number of failed responses required to determine that a pod is not \"ready\".",
+                  "oneOf": [
+                    {
+                      "minimum": 1,
+                      "type": "integer"
+                    },
+                    { "type": "null" }
+                  ]
+                },
+                "initialDelaySeconds": {
+                  "description": "Minimum wait before the cluster begins to attempt to determine the readiness of the pod.",
+                  "oneOf": [
+                    {
+                      "minimum": 1,
+                      "type": "integer"
+                    },
+                    { "type": "null" }
+                  ]
+                },
+                "periodSeconds": {
+                  "description": "Minimum period between attempts to determine the readiness of the pod.",
+                  "oneOf": [
+                    {
+                      "minimum": 1,
+                      "type": "integer"
+                    },
+                    { "type": "null" }
+                  ]
+                },
+                "successThreshold": {
+                  "description": "Number of successful responses required to determine that a pod is ready.",
+                  "oneOf": [
+                    {
+                      "minimum": 1,
+                      "type": "integer"
+                    },
+                    { "type": "null" }
+                  ]
+                }
+              },
+              "oneOf": [
+                { "type": "object" },
+                { "type": "null" }
+              ]
+            }
+          },
+          "oneOf": [
+            { "type": "object" },
+            { "type": "null" }
+          ]
+        },
         "labels": {
           "description": "Optional set of labels to be applied to created Kubernetes objects. These labels can be used for association with a preexisting service object.",
           "oneOf": [
@@ -170,6 +291,54 @@
                 { "type": "null" }
               ]
             },
+            "instance": {
+              "description": "Optional configuration options related to the number of Triton Distributed Worker pods that are deployed.",
+              "properties": {
+                "count": {
+                  "description": "Number of worker instances (whole model) to be deployed as part of this helm chart.",
+                  "oneOf": [
+                    {
+                      "minimum": 1,
+                      "type": "integer"
+                    },
+                    { "type": "null" }
+                  ]
+                },
+                "parallelism": {
+                  "description": "Optional configuration options related to how work for a single model is spread across multiple pods. When the product of `pipeline`*`tensor` is greater than 1, multiple pods will be deployed as a single logical worker.",
+                  "properties": {
+                    "pipeline": {
+                      "description": "Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a subset of layers that is executed on a separate device.",
+                      "oneOf": [
+                        {
+                          "minimum": 1,
+                          "type": "integer"
+                        },
+                        { "type": "null" }
+                      ]
+                    },
+                    "tensor": {
+                      "description": "Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller, independent blocks of computation that can be executed on different devices.",
+                      "oneOf": [
+                        {
+                          "minimum": 1,
+                          "type": "integer"
+                        },
+                        { "type": "null" }
+                      ]
+                    }
+                  },
+                  "oneOf": [
+                    { "type": "object" },
+                    { "type": "null" }
+                  ]
+                }
+              },
+              "oneOf": [
+                { "type": "object" },
+                { "type": "null" }
+              ]
+            },
             "logging": {
               "description": "Logging configuration options specific to Triton Distributed Worker.",
               "properties": {

diff --git a/deploy/Kubernetes/worker/charts/vllm/values.yaml b/deploy/Kubernetes/worker/charts/vllm/values.yaml
@@ -22,6 +22,32 @@ image: # (required)
 
 # Configurations option related to the Kubernetes objects created by the chart.
 kubernetes: # (optional)
+  # Optional configuration options controlling how the cluster monitors the health of Triton Distributed Worker deployment(s).
+  checks:
+    # Configuration options related to how the cluster determines that a Triton Distributed Worker instance is "alive" and responsive.
+    liveness:
+      # When `true`, the cluster will actively determine if the pod is alive; otherwise the cluster will always assume the pod is alive.
+      enabled: # (default true)
+      # Number of failed responses required to determine that a pod is not responsive (i.e., no longer "alive").
+      failureThreshold: # (default 15)
+      # Minimum wait before the cluster begins to attempt to determine the health of the pod.
+      initialDelaySeconds: # (default 10)
+      # Minimum period between attempts to determine the health of the pod.
+      periodSeconds: # (default 2)
+      # Number of successful responses required to determine that a pod is healthy.
+      successThreshold: # (default 1)
+    # Configuration options related to how the cluster determines that a Triton Distributed Worker instance is ready.
+    readiness:
+      # When `true`, the cluster will actively determine if the pod is ready; otherwise the cluster will always assume the pod is ready.
+      enabled: # (default true)
+      # Number of failed responses required to determine that a pod is not "ready".
+      failureThreshold: # (default 15)
+      # Minimum wait before the cluster begins to attempt to determine the readiness of the pod.
+      initialDelaySeconds: # (default 10)
+      # Minimum period between attempts to determine the readiness of the pod.
+      periodSeconds: # (default 2)
+      # Number of successful responses required to determine that a pod is ready.
+      successThreshold: # (default 1)
   # Optional set of labels to be applied to created Kubernetes objects.
   # These labels can be used for association with a preexisting service object.
   labels: # (optional)
@@ -61,6 +87,19 @@ triton: # (required)
       serviceName: # (default triton-distributed-manifold)
       # Networking port to be used to interact with the Triton Distributed Manifold.
       servicePort: # (default 4222)
+  # Optional configuration options related to the number of Triton Distributed Worker pods that are deployed.
+  instance:
+    # Number of worker instances (whole model) to be deployed as part of this helm chart.
+    count: # (default 1)
+    # Optional configuration options related to how work for a single model is spread across multiple pods.
+    # When the product of `pipeline`*`tensor` is greater than 1, multiple pods will be deployed as a single logical worker.
+    parallelism:
+      # Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a
+      # subset of layers that is executed on a separate device.
+      pipeline: # (default 1)
+      # Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller,
+      # independent blocks of computation that can be executed on different devices.
+      tensor: # (default 1)
   # Logging configuration options specific to Triton Distributed Worker.
   logging: # (optional)
     # When `true` Triton Distributed Worker logs are formatted using the ISO8601 standard; otherwise Triton's default format will be used.