Gen AI Tutorial: Remove vLLM option from initial Helm chart
This change removes the option of deploying with vLLM from the initial version of the tutorial. The option was removed because it introduced several deployment bugs.
whoisj committed Jun 7, 2024
1 parent 9bf511e commit 0c1b414
Showing 9 changed files with 25 additions and 42 deletions.
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# See values.yaml for reference values.

gpu:
- Tesla-T4
- Tesla-V100-SXM2-16GB
@@ -15,6 +15,7 @@
# See values.yaml for reference values.

gpu:
- Tesla-T4
- Tesla-V100-SXM2-16GB

model:
@@ -15,8 +15,11 @@
# See values.yaml for reference values.

gpu:
- NVIDIA-A10G
- NVIDIA-A100-SXM4-40GB
- Tesla-T4
- Tesla-V100-SXM2-16GB

model:
name: llama-2-7b
tensorrtLlm:
parallelism:
tensor: 2
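
The parallelism block above feeds the --pp and --tp flags that the deployment template passes to server.py. As a rough illustration only (the per-pod GPU count is computed by the chart and not shown in this diff), the sketch below assumes the usual TensorRT-LLM convention that the mpirun world size and GPU requirement equal pipeline * tensor parallelism, which matches the {pp}x{tp} engine paths in the template further down.

# Sketch only: how the parallelism values above likely map to the GPU request
# and the mpirun world size used by server.py later in this diff.
# world_size = pipeline * tensor is an assumption, not taken from the chart.
def world_size(parallelism: dict) -> int:
    pipeline = parallelism.get("pipeline", 1)  # --pp, defaults to 1 in the template
    tensor = parallelism.get("tensor", 1)      # --tp, defaults to 1 in the template
    return pipeline * tensor

print(world_size({"tensor": 2}))  # llama-2-7b values above -> 2 ranks / 2 GPUs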
@@ -15,6 +15,7 @@
# See values.yaml for reference values.

gpu:
- Tesla-T4
- Tesla-V100-SXM2-16GB

model:
@@ -15,8 +15,8 @@
# See values.yaml for reference values.

gpu:
- Tesla-V100-SXM2-16GB
- Tesla-T4
- Tesla-V100-SXM2-16GB

model:
name: opt125m
@@ -27,7 +27,6 @@
{{- $model_dt := "float16" }}
{{- $model_pp := 1 }}
{{- $model_tp := 1 }}
{{- $model_trtllm := true }}
{{- with $.Values.kubernetes }}
{{- with .hostRootPath }}
{{- $hostRootPath = . }}
@@ -36,7 +35,6 @@
{{- with $.Values.model }}
{{- $model_name = required "Property '.model.name' is required." .name }}
{{- with .tensorrtLlm }}
{{- $model_trtllm = .enable }}
{{- with .dataType }}
{{- $model_dt = . }}
{{- end }}
@@ -123,14 +121,10 @@ spec:
- python3
- ./server.py
- exec
{{- if $model_trtllm }}
- --engine=trtllm
- --dt={{ $model_dt }}
- --pp={{ $model_pp }}
- --tp={{ $model_tp }}
{{- else }}
- --engine=vllm
{{- end }}
{{- with $.Values.logging }}
{{- with .tritonServer }}
{{- if .useIso8601 }}
@@ -191,11 +185,9 @@ spec:
memory: {{ $triton_memory }}
nvidia.com/gpu: {{ $model_gpus }}
volumeMounts:
{{- if $model_trtllm }}
- mountPath: /var/run/engines
name: engine-repository
readOnly: false
{{- end }}
- mountPath: /var/run/models
name: model-repository
readOnly: true
@@ -217,14 +209,10 @@ spec:
- ./server.py
- init
- --model={{ $model_lower }}
{{- if $model_trtllm }}
- --engine=trtllm
- --dt={{ $model_dt }}
- --pp={{ $model_pp }}
- --tp={{ $model_tp }}
{{- else }}
- --engine=vllm
{{- end }}
{{- with $.Values.logging }}
{{- with .initialization }}
{{- if .verbose }}
@@ -267,11 +255,9 @@ spec:
ephemeral-storage: 96Gi
nvidia.com/gpu: {{ $model_gpus }}
volumeMounts:
{{- if $model_trtllm }}
- mountPath: /var/run/engines
name: engine-repository
readOnly: false
{{- end }}
- mountPath: /var/run/models
name: model-repository
readOnly: false
@@ -297,19 +283,13 @@ spec:
{{- end }}
{{- end }}
volumes:
{{- if $model_trtllm }}
- name: engine-repository
hostPath:
path: {{ printf "%s/models/%s/%dx%d/engines" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
type: DirectoryOrCreate
{{- end }}
- name: model-repository
hostPath:
{{- if $model_trtllm }}
path: {{ printf "%s/models/%s/%dx%d/models" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
{{- else }}
path: {{ printf "%s/models/%s/vllm" $hostRootPath $model_lower }}
{{- end }}
type: DirectoryOrCreate
{{- with $.Values.model }}
{{- with .pullSecret }}
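
With the vLLM branch gone, the template always mounts a TensorRT-LLM engine repository and derives both host paths from the model name and its parallelism. Below is a small sketch of the resulting host directory layout, mirroring the printf expressions above; the hostRootPath value "/triton" is purely hypothetical (the chart default is not shown in this diff).

# Sketch only: mirrors the printf path expressions in the template above.
host_root = "/triton"        # hypothetical hostRootPath, for illustration
model = "llama-2-7b"         # lower-cased model name ($model_lower)
pp, tp = 1, 2                # pipeline / tensor parallelism

engine_repository = f"{host_root}/models/{model}/{pp}x{tp}/engines"  # mounted at /var/run/engines
model_repository = f"{host_root}/models/{model}/{pp}x{tp}/models"    # mounted at /var/run/models

print(engine_repository)  # /triton/models/llama-2-7b/1x2/engines
print(model_repository)   # /triton/models/llama-2-7b/1x2/models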
@@ -54,9 +54,7 @@
"enable": {
"description": "When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.",
"oneOf": [
{
"type": "boolean"
},
{ "type": "boolean" },
{ "type": "null" }
]
},
@@ -39,9 +39,6 @@ model: # (required)
name: # (required)
# Configuration options related to the conversion of a non-optimized model into TensorRT format.
tensorrtLlm: # (optional)
# When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.
# When 'false', the init container will fall back to vLLM and parallelism options are ignored.
enable: # (default: true)
# Data type used when compiling and optimizing the model for TensorRT.
# Supported options are float16, bfloat16, float32
dataType: # (default: float16)
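
With enable removed, the remaining knobs under tensorrtLlm are the data type and parallelism. For illustration only, here is a hypothetical pre-flight check of the documented dataType options; the chart's real validation lives in values.schema.json.

# Hypothetical check, for illustration; not part of the chart.
SUPPORTED_DTYPES = {"float16", "bfloat16", "float32"}  # from the reference above

def resolve_dtype(values: dict) -> str:
    tensorrt_llm = (values.get("model") or {}).get("tensorrtLlm") or {}
    dtype = tensorrt_llm.get("dataType") or "float16"  # documented default
    if dtype not in SUPPORTED_DTYPES:
        raise ValueError(f"Unsupported tensorrtLlm.dataType: {dtype}")
    return dtype

print(resolve_dtype({"model": {"name": "llama-2-7b", "tensorrtLlm": {"dataType": "bfloat16"}}}))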
@@ -88,7 +88,9 @@ def hugging_face_authenticate(args):
hugging_face_token = token_file.read()

# Use Hugging Face's CLI to complete the authentication.
result = run_command([HUGGING_FACE_CLI, "login", "--token"], [hugging_face_token])
result = run_command(
[HUGGING_FACE_CLI, "login", "--token"], [hugging_face_token]
)

if result != 0:
raise Exception(f"Hugging Face authentication failed. ({result})")
@@ -165,17 +167,20 @@ def execute_triton(args):
cmd_args = ["mpirun", "--allow-run-as-root"]

for i in range(world_size):
if i != 0:
cmd_args += [":"]

cmd_args += [
"-n",
"1",
"tritonserver",
f"--model-repository={MODEL_DIRECTORY}",
"--disable-auto-complete-config",
]
cmd_args += [
f"--id=rank{i}",
f"--http-port={(8000 + i * 10)}",
f"--grpc-port={(8001 + i * 10)}",
"--model-load-thread-count=2",
f"--model-repository={MODEL_DIRECTORY}",
"--disable-auto-complete-config",
f"--backend-config=python,shm-region-prefix-name=rank{i}_",
]

if i == 0:
@@ -184,7 +189,6 @@
"--allow-gpu-metrics=false",
"--allow-metrics=true",
"--metrics-interval-ms=1000",
f"--id=rank{i}",
]

if args.verbose > 0:
@@ -198,14 +202,11 @@
"--allow-http=false",
"--allow-grpc=false",
"--allow-metrics=false",
"--log-info=false",
"--log-warning=false",
"--model-control-mode=explicit",
"--load-model=tensorrt_llm",
]
cmd_args += ["--log-info=false", "--log-warning=false"]

cmd_args += [
"--disable-auto-complete-config",
f"--backend-config=python,shm-region-prefix-name=rank{i}_",
":",
]

result = run_command(cmd_args)
exit(result)
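
The loop above builds one mpirun application context per rank, separated by ":", with only rank 0 exposing HTTP/gRPC/metrics endpoints. Below is a condensed sketch of the resulting command for a world size of 2 (flag set abbreviated; the MODEL_DIRECTORY value is assumed from the /var/run/models mount in the deployment template).

# Condensed sketch of the command construction above; not the full flag set.
MODEL_DIRECTORY = "/var/run/models"  # assumed from the template's volumeMounts
world_size = 2

cmd_args = ["mpirun", "--allow-run-as-root"]
for i in range(world_size):
    if i != 0:
        cmd_args += [":"]  # separates per-rank application contexts
    cmd_args += [
        "-n", "1", "tritonserver",
        f"--id=rank{i}",
        f"--http-port={8000 + i * 10}",
        f"--grpc-port={8001 + i * 10}",
        f"--model-repository={MODEL_DIRECTORY}",
    ]
    if i == 0:
        cmd_args += ["--allow-metrics=true"]  # only rank 0 serves endpoints
    else:
        cmd_args += ["--allow-http=false", "--allow-grpc=false", "--allow-metrics=false"]

print(" ".join(cmd_args))
# mpirun --allow-run-as-root -n 1 tritonserver --id=rank0 --http-port=8000 ...
#   : -n 1 tritonserver --id=rank1 --http-port=8010 --allow-http=false ...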
