Gen AI Tutorial: Remove vLLM option from initial Helm chart
This change removes the option of deploying with vLLM from the initial version of the tutorial. The option was removed because it introduced several deployment bugs.
whoisj committed Jun 7, 2024
1 parent 9bf511e commit 0c1b414
Showing 9 changed files with 25 additions and 42 deletions.
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# See values.yaml for reference values.

gpu:
- Tesla-T4
- Tesla-V100-SXM2-16GB
@@ -15,6 +15,7 @@
# See values.yaml for reference values.

gpu:
- Tesla-T4
- Tesla-V100-SXM2-16GB

model:
@@ -15,8 +15,11 @@
# See values.yaml for reference values.

gpu:
- NVIDIA-A10G
- NVIDIA-A100-SXM4-40GB
- Tesla-T4
- Tesla-V100-SXM2-16GB

model:
name: llama-2-7b
tensorrtLlm:
parallelism:
tensor: 2
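
The parallelism block above feeds the --pp and --tp flags that the deployment template passes to server.py. As a rough illustration only (the per-pod GPU count is computed by the chart and not shown in this diff), the sketch below assumes the usual TensorRT-LLM convention that the mpirun world size and GPU requirement equal pipeline * tensor parallelism, which matches the {pp}x{tp} engine paths in the template further down.

# Sketch only: how the parallelism values above likely map to the GPU request
# and the mpirun world size used by server.py later in this diff.
# world_size = pipeline * tensor is an assumption, not taken from the chart.
def world_size(parallelism: dict) -> int:
    pipeline = parallelism.get("pipeline", 1)  # --pp, defaults to 1 in the template
    tensor = parallelism.get("tensor", 1)      # --tp, defaults to 1 in the template
    return pipeline * tensor

print(world_size({"tensor": 2}))  # llama-2-7b values above -> 2 ranks / 2 GPUs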
@@ -15,6 +15,7 @@
# See values.yaml for reference values.

gpu:
- Tesla-T4
- Tesla-V100-SXM2-16GB

model:
@@ -15,8 +15,8 @@
# See values.yaml for reference values.

gpu:
- Tesla-V100-SXM2-16GB
- Tesla-T4
- Tesla-V100-SXM2-16GB

model:
name: opt125m
@@ -27,7 +27,6 @@
{{- $model_dt := "float16" }}
{{- $model_pp := 1 }}
{{- $model_tp := 1 }}
{{- $model_trtllm := true }}
{{- with $.Values.kubernetes }}
{{- with .hostRootPath }}
{{- $hostRootPath = . }}
@@ -36,7 +35,6 @@
{{- with $.Values.model }}
{{- $model_name = required "Property '.model.name' is required." .name }}
{{- with .tensorrtLlm }}
{{- $model_trtllm = .enable }}
{{- with .dataType }}
{{- $model_dt = . }}
{{- end }}
@@ -123,14 +121,10 @@ spec:
- python3
- ./server.py
- exec
{{- if $model_trtllm }}
- --engine=trtllm
- --dt={{ $model_dt }}
- --pp={{ $model_pp }}
- --tp={{ $model_tp }}
{{- else }}
- --engine=vllm
{{- end }}
{{- with $.Values.logging }}
{{- with .tritonServer }}
{{- if .useIso8601 }}
@@ -191,11 +185,9 @@ spec:
memory: {{ $triton_memory }}
nvidia.com/gpu: {{ $model_gpus }}
volumeMounts:
{{- if $model_trtllm }}
- mountPath: /var/run/engines
name: engine-repository
readOnly: false
{{- end }}
- mountPath: /var/run/models
name: model-repository
readOnly: true
@@ -217,14 +209,10 @@ spec:
- ./server.py
- init
- --model={{ $model_lower }}
{{- if $model_trtllm }}
- --engine=trtllm
- --dt={{ $model_dt }}
- --pp={{ $model_pp }}
- --tp={{ $model_tp }}
{{- else }}
- --engine=vllm
{{- end }}
{{- with $.Values.logging }}
{{- with .initialization }}
{{- if .verbose }}
@@ -267,11 +255,9 @@ spec:
ephemeral-storage: 96Gi
nvidia.com/gpu: {{ $model_gpus }}
volumeMounts:
{{- if $model_trtllm }}
- mountPath: /var/run/engines
name: engine-repository
readOnly: false
{{- end }}
- mountPath: /var/run/models
name: model-repository
readOnly: false
@@ -297,19 +283,13 @@ spec:
{{- end }}
{{- end }}
volumes:
{{- if $model_trtllm }}
- name: engine-repository
hostPath:
path: {{ printf "%s/models/%s/%dx%d/engines" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
type: DirectoryOrCreate
{{- end }}
- name: model-repository
hostPath:
{{- if $model_trtllm }}
path: {{ printf "%s/models/%s/%dx%d/models" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
{{- else }}
path: {{ printf "%s/models/%s/vllm" $hostRootPath $model_lower }}
{{- end }}
type: DirectoryOrCreate
{{- with $.Values.model }}
{{- with .pullSecret }}
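
With the vLLM branch gone, the template always mounts a TensorRT-LLM engine repository and derives both host paths from the model name and its parallelism. Below is a small sketch of the resulting host directory layout, mirroring the printf expressions above; the hostRootPath value "/triton" is purely hypothetical (the chart default is not shown in this diff).

# Sketch only: mirrors the printf path expressions in the template above.
host_root = "/triton"        # hypothetical hostRootPath, for illustration
model = "llama-2-7b"         # lower-cased model name ($model_lower)
pp, tp = 1, 2                # pipeline / tensor parallelism

engine_repository = f"{host_root}/models/{model}/{pp}x{tp}/engines"  # mounted at /var/run/engines
model_repository = f"{host_root}/models/{model}/{pp}x{tp}/models"    # mounted at /var/run/models

print(engine_repository)  # /triton/models/llama-2-7b/1x2/engines
print(model_repository)   # /triton/models/llama-2-7b/1x2/models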
@@ -54,9 +54,7 @@
"enable": {
"description": "When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.",
"oneOf": [
{
"type": "boolean"
},
{ "type": "boolean" },
{ "type": "null" }
]
},
@@ -39,9 +39,6 @@ model: # (required)
name: # (required)
# Configuration options related to the conversion of a non-optimized model into TensorRT format.
tensorrtLlm: # (optional)
# When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.
# When 'false', the init container will fall back to vLLM and parallelism options are ignored.
enable: # (default: true)
# Data type used when compiling and optimizing the model for TensorRT.
# Supported options are float16, bfloat16, float32
dataType: # (default: float16)
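
With enable removed, the remaining knobs under tensorrtLlm are the data type and parallelism. For illustration only, here is a hypothetical pre-flight check of the documented dataType options; the chart's real validation lives in values.schema.json.

# Hypothetical check, for illustration; not part of the chart.
SUPPORTED_DTYPES = {"float16", "bfloat16", "float32"}  # from the reference above

def resolve_dtype(values: dict) -> str:
    tensorrt_llm = (values.get("model") or {}).get("tensorrtLlm") or {}
    dtype = tensorrt_llm.get("dataType") or "float16"  # documented default
    if dtype not in SUPPORTED_DTYPES:
        raise ValueError(f"Unsupported tensorrtLlm.dataType: {dtype}")
    return dtype

print(resolve_dtype({"model": {"name": "llama-2-7b", "tensorrtLlm": {"dataType": "bfloat16"}}}))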
@@ -88,7 +88,9 @@ def hugging_face_authenticate(args):
hugging_face_token = token_file.read()

# Use Hugging Face's CLI to complete the authentication.
result = run_command([HUGGING_FACE_CLI, "login", "--token"], [hugging_face_token])
result = run_command(
[HUGGING_FACE_CLI, "login", "--token"], [hugging_face_token]
)

if result != 0:
raise Exception(f"Hugging Face authentication failed. ({result})")
@@ -165,17 +167,20 @@ def execute_triton(args):
cmd_args = ["mpirun", "--allow-run-as-root"]

for i in range(world_size):
if i != 0:
cmd_args += [":"]

cmd_args += [
"-n",
"1",
"tritonserver",
f"--model-repository={MODEL_DIRECTORY}",
"--disable-auto-complete-config",
]
cmd_args += [
f"--id=rank{i}",
f"--http-port={(8000 + i * 10)}",
f"--grpc-port={(8001 + i * 10)}",
"--model-load-thread-count=2",
f"--model-repository={MODEL_DIRECTORY}",
"--disable-auto-complete-config",
f"--backend-config=python,shm-region-prefix-name=rank{i}_",
]

if i == 0:
@@ -184,7 +189,6 @@
"--allow-gpu-metrics=false",
"--allow-metrics=true",
"--metrics-interval-ms=1000",
f"--id=rank{i}",
]

if args.verbose > 0:
@@ -198,14 +202,11 @@
"--allow-http=false",
"--allow-grpc=false",
"--allow-metrics=false",
"--log-info=false",
"--log-warning=false",
"--model-control-mode=explicit",
"--load-model=tensorrt_llm",
]
cmd_args += ["--log-info=false", "--log-warning=false"]

cmd_args += [
"--disable-auto-complete-config",
f"--backend-config=python,shm-region-prefix-name=rank{i}_",
":",
]

result = run_command(cmd_args)
exit(result)
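
The loop above builds one mpirun application context per rank, separated by ":", with only rank 0 exposing HTTP/gRPC/metrics endpoints. Below is a condensed sketch of the resulting command for a world size of 2 (flag set abbreviated; the MODEL_DIRECTORY value is assumed from the /var/run/models mount in the deployment template).

# Condensed sketch of the command construction above; not the full flag set.
MODEL_DIRECTORY = "/var/run/models"  # assumed from the template's volumeMounts
world_size = 2

cmd_args = ["mpirun", "--allow-run-as-root"]
for i in range(world_size):
    if i != 0:
        cmd_args += [":"]  # separates per-rank application contexts
    cmd_args += [
        "-n", "1", "tritonserver",
        f"--id=rank{i}",
        f"--http-port={8000 + i * 10}",
        f"--grpc-port={8001 + i * 10}",
        f"--model-repository={MODEL_DIRECTORY}",
    ]
    if i == 0:
        cmd_args += ["--allow-metrics=true"]  # only rank 0 serves endpoints
    else:
        cmd_args += ["--allow-http=false", "--allow-grpc=false", "--allow-metrics=false"]

print(" ".join(cmd_args))
# mpirun --allow-run-as-root -n 1 tritonserver --id=rank0 --http-port=8000 ...
#   : -n 1 tritonserver --id=rank1 --http-port=8010 --allow-http=false ...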
