diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml
index f303717b..025f1a73 100644
--- a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# See values.yaml for reference values.
+
 gpu:
 - Tesla-T4
 - Tesla-V100-SXM2-16GB
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml
index 58239a71..1e2d16db 100644
--- a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml
@@ -15,6 +15,7 @@
 # See values.yaml for reference values.
 
 gpu:
+- Tesla-T4
 - Tesla-V100-SXM2-16GB
 
 model:
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml
index ce27e694..66d25eca 100644
--- a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml
@@ -15,8 +15,11 @@
 # See values.yaml for reference values.
 
 gpu:
-- NVIDIA-A10G
-- NVIDIA-A100-SXM4-40GB
+- Tesla-T4
+- Tesla-V100-SXM2-16GB
 
 model:
   name: llama-2-7b
+  tensorrtLlm:
+    parallelism:
+      tensor: 2
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml
index 1f31ffbb..51237c55 100644
--- a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml
@@ -15,6 +15,7 @@
 # See values.yaml for reference values.
 
 gpu:
+- Tesla-T4
 - Tesla-V100-SXM2-16GB
 
 model:
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml
index 7975b559..4c5f4243 100644
--- a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml
@@ -15,8 +15,8 @@
 # See values.yaml for reference values.
 
 gpu:
-- Tesla-V100-SXM2-16GB
 - Tesla-T4
+- Tesla-V100-SXM2-16GB
 
 model:
   name: opt125m
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/deployment.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/deployment.yaml
index d5c58f48..4affb6e6 100644
--- a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/deployment.yaml
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/deployment.yaml
@@ -27,7 +27,6 @@
 {{- $model_dt := "float16" }}
 {{- $model_pp := 1 }}
 {{- $model_tp := 1 }}
-{{- $model_trtllm := true }}
 {{- with $.Values.kubernetes }}
 {{- with .hostRootPath }}
 {{- $hostRootPath = . }}
@@ -36,7 +35,6 @@
 {{- with $.Values.model }}
 {{- $model_name = required "Property '.model.name' is required." .name }}
 {{- with .tensorrtLlm }}
-{{- $model_trtllm = .enable }}
 {{- with .dataType }}
 {{- $model_dt = . }}
 {{- end }}
@@ -123,14 +121,10 @@ spec:
         - python3
         - ./server.py
         - exec
-{{- if $model_trtllm }}
         - --engine=trtllm
         - --dt={{ $model_dt }}
         - --pp={{ $model_pp }}
         - --tp={{ $model_tp }}
-{{- else }}
-        - --engine=vllm
-{{- end }}
 {{- with $.Values.logging }}
 {{- with .tritonServer }}
 {{- if .useIso8601 }}
@@ -191,11 +185,9 @@ spec:
             memory: {{ $triton_memory }}
             nvidia.com/gpu: {{ $model_gpus }}
         volumeMounts:
-{{- if $model_trtllm }}
         - mountPath: /var/run/engines
           name: engine-repository
           readOnly: false
-{{- end }}
         - mountPath: /var/run/models
           name: model-repository
           readOnly: true
@@ -217,14 +209,10 @@ spec:
         - ./server.py
        - init
         - --model={{ $model_lower }}
-{{- if $model_trtllm }}
         - --engine=trtllm
         - --dt={{ $model_dt }}
         - --pp={{ $model_pp }}
         - --tp={{ $model_tp }}
-{{- else }}
-        - --engine=vllm
-{{- end }}
 {{- with $.Values.logging }}
 {{- with .initialization }}
 {{- if .verbose }}
@@ -267,11 +255,9 @@ spec:
             ephemeral-storage: 96Gi
             nvidia.com/gpu: {{ $model_gpus }}
         volumeMounts:
-{{- if $model_trtllm }}
         - mountPath: /var/run/engines
           name: engine-repository
           readOnly: false
-{{- end }}
         - mountPath: /var/run/models
           name: model-repository
           readOnly: false
@@ -297,19 +283,13 @@ spec:
 {{- end }}
 {{- end }}
       volumes:
-{{- if $model_trtllm }}
       - name: engine-repository
        hostPath:
          path: {{ printf "%s/models/%s/%dx%d/engines" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
          type: DirectoryOrCreate
-{{- end }}
      - name: model-repository
        hostPath:
-{{- if $model_trtllm }}
          path: {{ printf "%s/models/%s/%dx%d/models" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
-{{- else }}
-          path: {{ printf "%s/models/%s/vllm" $hostRootPath $model_lower }}
-{{- end }}
          type: DirectoryOrCreate
 {{- with $.Values.model }}
 {{- with .pullSecret }}
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.schema.json b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.schema.json
index bb911ca7..d815f6fd 100644
--- a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.schema.json
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.schema.json
@@ -54,9 +54,7 @@
         "enable": {
           "description": "When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.",
           "oneOf": [
-            {
-              "type": "boolean"
-            },
+            { "type": "boolean" },
             { "type": "null" }
           ]
         },
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.yaml
index 114c10f4..31a7495a 100644
--- a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.yaml
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.yaml
@@ -39,9 +39,6 @@ model: # (required)
   name: # (required)
   # Configuration options related to the conversion of a non-optimized model into TensorRT format.
   tensorrtLlm: # (optional)
-    # When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.
-    # When 'false', the init container will fall back to vLLM and parallelism options are ignored.
-    enable: # (default: true)
     # Data type used when compiling and optimizing the model for TensorRT.
     # Supported options are float16, bfloat16, float32
     dataType: # (default: float16)
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.py b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.py
index f1d7228e..535aad62 100644
--- a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.py
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.py
@@ -88,7 +88,9 @@ def hugging_face_authenticate(args):
         hugging_face_token = token_file.read()
 
     # Use Hugging Face's CLI to complete the authentication.
-    result = run_command([HUGGING_FACE_CLI, "login", "--token"], [hugging_face_token])
+    result = run_command(
+        [HUGGING_FACE_CLI, "login", "--token"], [hugging_face_token]
+    )
 
     if result != 0:
         raise Exception(f"Hugging Face authentication failed. ({result})")
@@ -165,17 +167,20 @@ def execute_triton(args):
     cmd_args = ["mpirun", "--allow-run-as-root"]
 
     for i in range(world_size):
+        if i != 0:
+            cmd_args += [":"]
+
         cmd_args += [
             "-n",
             "1",
             "tritonserver",
-            f"--model-repository={MODEL_DIRECTORY}",
-            "--disable-auto-complete-config",
-        ]
-        cmd_args += [
+            f"--id=rank{i}",
             f"--http-port={(8000 + i * 10)}",
             f"--grpc-port={(8001 + i * 10)}",
             "--model-load-thread-count=2",
+            f"--model-repository={MODEL_DIRECTORY}",
+            "--disable-auto-complete-config",
+            f"--backend-config=python,shm-region-prefix-name=rank{i}_",
         ]
 
         if i == 0:
@@ -184,7 +189,6 @@
                 "--allow-gpu-metrics=false",
                 "--allow-metrics=true",
                 "--metrics-interval-ms=1000",
-                f"--id=rank{i}",
             ]
 
             if args.verbose > 0:
@@ -198,14 +202,11 @@
                 "--allow-http=false",
                 "--allow-grpc=false",
                 "--allow-metrics=false",
+                "--log-info=false",
+                "--log-warning=false",
+                "--model-control-mode=explicit",
+                "--load-model=tensorrt_llm",
             ]
-        cmd_args += ["--log-info=false", "--log-warning=false"]
-
-        cmd_args += [
-            "--disable-auto-complete-config",
-            f"--backend-config=python,shm-region-prefix-name=rank{i}_",
-            ":",
-        ]
     result = run_command(cmd_args)
 
     exit(result)
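Reviewer note: the Python sketch below is illustrative only and is not part of the change. It mirrors the per-rank argument assembly that the reworked execute_triton loop in server.py now performs, assuming a hypothetical world_size of 2 and MODEL_DIRECTORY of /var/run/models; the flags and port offsets are copied from the diff above, the rank-0 metrics flags are abbreviated, and the snippet only prints the resulting mpirun command instead of launching it.

# Illustrative sketch only (assumed: world_size = 2, MODEL_DIRECTORY = "/var/run/models").
MODEL_DIRECTORY = "/var/run/models"
world_size = 2

cmd_args = ["mpirun", "--allow-run-as-root"]
for i in range(world_size):
    if i != 0:
        # mpirun uses ":" to separate the program block for each rank.
        cmd_args += [":"]

    cmd_args += [
        "-n", "1", "tritonserver",
        f"--id=rank{i}",
        f"--http-port={8000 + i * 10}",
        f"--grpc-port={8001 + i * 10}",
        "--model-load-thread-count=2",
        f"--model-repository={MODEL_DIRECTORY}",
        "--disable-auto-complete-config",
        f"--backend-config=python,shm-region-prefix-name=rank{i}_",
    ]
    if i == 0:
        # Rank 0 keeps serving metrics (flag set abbreviated here; see the diff for the full list).
        cmd_args += ["--allow-metrics=true", "--metrics-interval-ms=1000"]
    else:
        # Non-zero ranks run headless and explicitly load only the tensorrt_llm model.
        cmd_args += [
            "--allow-http=false",
            "--allow-grpc=false",
            "--allow-metrics=false",
            "--log-info=false",
            "--log-warning=false",
            "--model-control-mode=explicit",
            "--load-model=tensorrt_llm",
        ]

print(" ".join(cmd_args))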