From 3b7a99ed9db7db9dfd3bdc16eebba4356182025d Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Tue, 24 Sep 2024 14:46:29 -0700
Subject: [PATCH 1/2] Eliminated confusions, clarified flags

---
 AI_Agents_Guide/Constrained_Decoding/README.md         | 8 ++++----
 Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md | 7 +++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/AI_Agents_Guide/Constrained_Decoding/README.md b/AI_Agents_Guide/Constrained_Decoding/README.md
index 28e07417..f2773500 100644
--- a/AI_Agents_Guide/Constrained_Decoding/README.md
+++ b/AI_Agents_Guide/Constrained_Decoding/README.md
@@ -469,7 +469,7 @@ class TritonPythonModel:
 
     def get_executor_config(self, model_config):
+        tokenizer_dir = model_config['parameters']['tokenizer_dir']['string_value']
-+        logits_lmfe_processor = LMFELogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
++        logits_processor = LMFELogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
         kwargs = {
             "max_beam_width": get_parameter(model_config, "max_beam_width", int),
@@ -491,7 +491,7 @@ class TritonPythonModel:
             "decoding_config": self.get_decoding_config(model_config),
+            "logits_post_processor_map":{
-+                LMFELogitsProcessor.PROCESSOR_NAME: logits_lmfe_processor
++                LMFELogitsProcessor.PROCESSOR_NAME: logits_processor
+            }
         }
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
@@ -602,7 +602,7 @@ class TritonPythonModel:
 
     def get_executor_config(self, model_config):
+        tokenizer_dir = model_config['parameters']['tokenizer_dir']['string_value']
-+        logits_lmfe_processor = OutlinesLogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
++        logits_processor = OutlinesLogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
         kwargs = {
             "max_beam_width": get_parameter(model_config, "max_beam_width", int),
@@ -624,7 +624,7 @@ class TritonPythonModel:
             "decoding_config": self.get_decoding_config(model_config),
+            "logits_post_processor_map":{
-+                OutlinesLogitsProcessor.PROCESSOR_NAME: logits_lmfe_processor
++                OutlinesLogitsProcessor.PROCESSOR_NAME: logits_processor
+            }
         }
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
diff --git a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
index c5e9d77c..17036357 100644
--- a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
+++ b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
@@ -174,6 +174,13 @@ python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt tri
 
 3. Launch Tritonserver
 
+> [!NOTE]
+> This tutorial was prepared for serving TensorRT-LLM model on a single GPU.
+> Thus, in the following command use `--world_size=1`, if you also built
+> an engine for a single GPU. Alternatively, if the engine requires multi-GPU
+> scenario, make sure to specify the exact number of GPU, required by you engine
+> in `--world_size`.
+
 Use the [launch_triton_server.py](https://github.com/triton-inference-server/tensorrtllm_backend/blob/release/0.5.0/scripts/launch_triton_server.py) script. This launches multiple instances of `tritonserver` with MPI.
 ```bash
 python3 /tensorrtllm_backend/scripts/launch_triton_server.py --world_size= --model_repo=/opt/tritonserver/inflight_batcher_llm

From 6d70032d32d3e026e66bea81c66642d8eb36dd1c Mon Sep 17 00:00:00 2001
From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Date: Tue, 24 Sep 2024 15:10:27 -0700
Subject: [PATCH 2/2] Update Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md

Co-authored-by: Ryan McCormick
---
 Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
index 17036357..f6ff491b 100644
--- a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
+++ b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
@@ -175,10 +175,10 @@ python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt tri
 3. Launch Tritonserver
 
 > [!NOTE]
-> This tutorial was prepared for serving TensorRT-LLM model on a single GPU.
-> Thus, in the following command use `--world_size=1`, if you also built
-> an engine for a single GPU. Alternatively, if the engine requires multi-GPU
-> scenario, make sure to specify the exact number of GPU, required by you engine
+> This tutorial was prepared for serving a TensorRT-LLM model on a single GPU.
+> Thus, in the following command use `--world_size=1` if the engine was built
+> for a single GPU. Alternatively, if the engine requires multiple GPUs
+> make sure to specify the exact number of GPUs required by the engine
 > in `--world_size`.
 
 Use the [launch_triton_server.py](https://github.com/triton-inference-server/tensorrtllm_backend/blob/release/0.5.0/scripts/launch_triton_server.py) script. This launches multiple instances of `tritonserver` with MPI.
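
As a quick illustration of the clarified instructions, the launch command from the Hermes README might look like the following for an engine built for a single GPU. This is a sketch rather than part of the patch: `--world_size=1` is the single-GPU case the note describes, and the model repository path is the one used throughout the tutorial.

```bash
# Illustrative invocation for a single-GPU TensorRT-LLM engine, per the note above.
# For a multi-GPU engine, set --world_size to the number of GPUs the engine was built for.
python3 /tensorrtllm_backend/scripts/launch_triton_server.py \
    --world_size=1 \
    --model_repo=/opt/tritonserver/inflight_batcher_llm
```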