diff --git a/AI_Agents_Guide/Constrained_Decoding/README.md b/AI_Agents_Guide/Constrained_Decoding/README.md
index 28e07417..f2773500 100644
--- a/AI_Agents_Guide/Constrained_Decoding/README.md
+++ b/AI_Agents_Guide/Constrained_Decoding/README.md
@@ -469,7 +469,7 @@ class TritonPythonModel:
 
     def get_executor_config(self, model_config):
 +       tokenizer_dir = model_config['parameters']['tokenizer_dir']['string_value']
-+       logits_lmfe_processor = LMFELogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
++       logits_processor = LMFELogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
         kwargs = {
             "max_beam_width": get_parameter(model_config, "max_beam_width", int),
@@ -491,7 +491,7 @@ class TritonPythonModel:
             "decoding_config": self.get_decoding_config(model_config),
 +           "logits_post_processor_map":{
-+               LMFELogitsProcessor.PROCESSOR_NAME: logits_lmfe_processor
++               LMFELogitsProcessor.PROCESSOR_NAME: logits_processor
 +           }
         }
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
@@ -602,7 +602,7 @@ class TritonPythonModel:
 
     def get_executor_config(self, model_config):
 +       tokenizer_dir = model_config['parameters']['tokenizer_dir']['string_value']
-+       logits_lmfe_processor = OutlinesLogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
++       logits_processor = OutlinesLogitsProcessor(tokenizer_dir, AnswerFormat.model_json_schema())
         kwargs = {
             "max_beam_width": get_parameter(model_config, "max_beam_width", int),
@@ -624,7 +624,7 @@ class TritonPythonModel:
             "decoding_config": self.get_decoding_config(model_config),
 +           "logits_post_processor_map":{
-+               OutlinesLogitsProcessor.PROCESSOR_NAME: logits_lmfe_processor
++               OutlinesLogitsProcessor.PROCESSOR_NAME: logits_processor
 +           }
         }
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
diff --git a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
index c5e9d77c..f6ff491b 100644
--- a/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
+++ b/Popular_Models_Guide/Hermes-2-Pro-Llama-3-8B/README.md
@@ -174,6 +174,13 @@ python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt tri
 3. Launch Tritonserver
 
+> [!NOTE]
+> This tutorial was prepared for serving a TensorRT-LLM model on a single GPU.
+> Thus, in the following command use `--world_size=1` if the engine was built
+> for a single GPU. Alternatively, if the engine requires multiple GPUs
+> make sure to specify the exact number of GPUs required by the engine
+> in `--world_size`.
+
 Use the [launch_triton_server.py](https://github.com/triton-inference-server/tensorrtllm_backend/blob/release/0.5.0/scripts/launch_triton_server.py) script. This launches multiple instances of `tritonserver` with MPI.
 ```bash
 python3 /tensorrtllm_backend/scripts/launch_triton_server.py --world_size= --model_repo=/opt/tritonserver/inflight_batcher_llm