From 3b7a99ed9db7db9dfd3bdc16eebba4356182025d Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Tue, 24 Sep 2024 14:46:29 -0700
Subject: [PATCH 1/2] Eliminated confusions, clarified flags

---
 AI_Agents_Guide/Constrained_Decoding/README.md         | 8 ++++----
 Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md | 7 +++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/AI_Agents_Guide/Constrained_Decoding/README.md b/AI_Agents_Guide/Constrained_Decoding/README.md
index 28e07417..f2773500 100644
--- a/AI_Agents_Guide/Constrained_Decoding/README.md
+++ b/AI_Agents_Guide/Constrained_Decoding/README.md
@@ -469,7 +469,7 @@ class TritonPythonModel:
 
     def get_executor_config(self, model_config):
+        tokenizer_dir = model_config['parameters']['tokenizer_dir']['string_value']
-+        logits_lmfe_processor = LMFELogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
++        logits_processor = LMFELogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
         kwargs = {
             "max_beam_width": get_parameter(model_config, "max_beam_width", int),
@@ -491,7 +491,7 @@ class TritonPythonModel:
             "decoding_config": self.get_decoding_config(model_config),
+            "logits_post_processor_map":{
-+                LMFELogitsProcessor.PROCESSOR_NAME: logits_lmfe_processor
++                LMFELogitsProcessor.PROCESSOR_NAME: logits_processor
+            }
         }
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
@@ -602,7 +602,7 @@ class TritonPythonModel:
 
     def get_executor_config(self, model_config):
+        tokenizer_dir = model_config['parameters']['tokenizer_dir']['string_value']
-+        logits_lmfe_processor = OutlinesLogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
++        logits_processor = OutlinesLogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
         kwargs = {
             "max_beam_width": get_parameter(model_config, "max_beam_width", int),
@@ -624,7 +624,7 @@ class TritonPythonModel:
             "decoding_config": self.get_decoding_config(model_config),
+            "logits_post_processor_map":{
-+                OutlinesLogitsProcessor.PROCESSOR_NAME: logits_lmfe_processor
++                OutlinesLogitsProcessor.PROCESSOR_NAME: logits_processor
+            }
         }
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
diff --git a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
index c5e9d77c..17036357 100644
--- a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
+++ b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
@@ -174,6 +174,13 @@ python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt tri
 
 3. Launch Tritonserver
 
+> [!NOTE]
+> This tutorial was prepared for serving TensorRT-LLM model on a single GPU.
+> Thus, in the following command use `--world_size=1`, if you also built
+> an engine for a single GPU. Alternatively, if the engine requires multi-GPU
+> scenario, make sure to specify the exact number of GPU, required by you engine
+> in `--world_size`.
+
 Use the [launch_triton_server.py](https://github.com/triton-inference-server/tensorrtllm_backend/blob/release/0.5.0/scripts/launch_triton_server.py) script. This launches multiple instances of `tritonserver` with MPI.
 ```bash
 python3 /tensorrtllm_backend/scripts/launch_triton_server.py --world_size= --model_repo=/opt/tritonserver/inflight_batcher_llm

From 6d70032d32d3e026e66bea81c66642d8eb36dd1c Mon Sep 17 00:00:00 2001
From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Date: Tue, 24 Sep 2024 15:10:27 -0700
Subject: [PATCH 2/2] Update Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md

Co-authored-by: Ryan McCormick
---
 Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
index 17036357..f6ff491b 100644
--- a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
+++ b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
@@ -175,10 +175,10 @@ python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt tri
 3. Launch Tritonserver
 
 > [!NOTE]
-> This tutorial was prepared for serving TensorRT-LLM model on a single GPU.
-> Thus, in the following command use `--world_size=1`, if you also built
-> an engine for a single GPU. Alternatively, if the engine requires multi-GPU
-> scenario, make sure to specify the exact number of GPU, required by you engine
+> This tutorial was prepared for serving a TensorRT-LLM model on a single GPU.
+> Thus, in the following command use `--world_size=1` if the engine was built
+> for a single GPU. Alternatively, if the engine requires multiple GPUs
+> make sure to specify the exact number of GPUs required by the engine
 > in `--world_size`.
 
 Use the [launch_triton_server.py](https://github.com/triton-inference-server/tensorrtllm_backend/blob/release/0.5.0/scripts/launch_triton_server.py) script. This launches multiple instances of `tritonserver` with MPI.
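
As a quick illustration of the clarified instructions, the launch command from the Hermes README might look like the following for an engine built for a single GPU. This is a sketch rather than part of the patch: `--world_size=1` is the single-GPU case the note describes, and the model repository path is the one used throughout the tutorial.

```bash
# Illustrative invocation for a single-GPU TensorRT-LLM engine, per the note above.
# For a multi-GPU engine, set --world_size to the number of GPUs the engine was built for.
python3 /tensorrtllm_backend/scripts/launch_triton_server.py \
    --world_size=1 \
    --model_repo=/opt/tritonserver/inflight_batcher_llm
```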