triton-inference-server · oandreeva-nv · Sep 24, 2024 · Sep 24, 2024 · Sep 24, 2024
diff --git a/AI_Agents_Guide/Constrained_Decoding/README.md b/AI_Agents_Guide/Constrained_Decoding/README.md
@@ -469,7 +469,7 @@ class TritonPythonModel:
 
     def get_executor_config(self, model_config):
 +       tokenizer_dir = model_config['parameters']['tokenizer_dir']['string_value']
-+       logits_lmfe_processor = LMFELogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
++       logits_processor = LMFELogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
         kwargs = {
             "max_beam_width":
             get_parameter(model_config, "max_beam_width", int),
@@ -491,7 +491,7 @@ class TritonPythonModel:
             "decoding_config":
             self.get_decoding_config(model_config),
 +            "logits_post_processor_map":{
-+                LMFELogitsProcessor.PROCESSOR_NAME: logits_lmfe_processor
++                LMFELogitsProcessor.PROCESSOR_NAME: logits_processor
 +            }
         }
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
@@ -602,7 +602,7 @@ class TritonPythonModel:
 
     def get_executor_config(self, model_config):
 +       tokenizer_dir = model_config['parameters']['tokenizer_dir']['string_value']
-+       logits_lmfe_processor = OutlinesLogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
++       logits_processor = OutlinesLogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
         kwargs = {
             "max_beam_width":
             get_parameter(model_config, "max_beam_width", int),
@@ -624,7 +624,7 @@ class TritonPythonModel:
             "decoding_config":
             self.get_decoding_config(model_config),
 +            "logits_post_processor_map":{
-+                OutlinesLogitsProcessor.PROCESSOR_NAME: logits_lmfe_processor
++                OutlinesLogitsProcessor.PROCESSOR_NAME: logits_processor
 +            }
         }
         kwargs = {k: v for k, v in kwargs.items() if v is not None}

diff --git a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
@@ -174,6 +174,13 @@ python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt tri
 
 3.  Launch Tritonserver
 
+> [!NOTE]
+> This tutorial was prepared for serving a TensorRT-LLM model on a single GPU.
+> Thus, use `--world_size=1` in the following command if the engine was built
+> for a single GPU. Alternatively, if the engine requires multiple GPUs,
+> make sure to set `--world_size` to the exact number of GPUs required by
+> the engine.
+
 Use the [launch_triton_server.py](https://github.com/triton-inference-server/tensorrtllm_backend/blob/release/0.5.0/scripts/launch_triton_server.py) script. This launches multiple instances of `tritonserver` with MPI.
 ```bash
 python3 /tensorrtllm_backend/scripts/launch_triton_server.py --world_size=<world size of the engine> --model_repo=/opt/tritonserver/inflight_batcher_llm