# Commit f107526: update tutorial

jbkyang-nvi committed Dec 21, 2023 (1 parent: 3ec82d6). One file changed, with 31 additions and 29 deletions: `Popular_Models_Guide/Llama2/trtllm_guide.md`.
…and links for how to run Llama with other backends.
## Installation

1. The installation starts with cloning the TensorRT-LLM Backend and updating the
TensorRT-LLM submodule. Note that the release tested for this guide is `v0.6.0`:
```bash
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git --branch <release branch>
# Update the submodules
cd tensorrtllm_backend
git submodule update --init --recursive
```
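For example, to pin the clone to the tested release (a sketch, assuming a `v0.6.0` tag exists on the backend repository):

```bash
# Hypothetical pinned clone; substitute the release you intend to deploy
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git --branch v0.6.0
```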
2. Launch the Triton TensorRT-LLM container, mounting the backend repository, the Llama weights, and an output directory for the compiled engines:

```bash
docker run --rm -it --net host --shm-size=2g \
--ulimit memlock=-1 --ulimit stack=67108864 --gpus all \
-v /your_path_to/tensorrtllm_backend:/tensorrtllm_backend \
-v /your_path_to/Llama-2-7b-hf:/Llama-2-7b-hf \
-v /your_path_to/engines:/engines \
nvcr.io/nvidia/tritonserver:23.11-trtllm-python-py3

# Install Sentencepiece
pip3 install sentencepiece
```
## Create Engines for each Model [skip this step if you already have an engine]

TensorRT-LLM requires each model to be compiled for the configuration you want with the following steps:
1. Compile the model with the `build.py` script from the TensorRT-LLM Llama examples; for the full set of build options, see the example [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama/README.md).

```bash
BUILD_SCRIPT=/tensorrtllm_backend/tensorrt_llm/examples/llama/build.py
export TOKENIZER_DIR=/Llama-2-7b-hf
export ENGINE_DIR=/engines/h100/batch_64
export MAX_BATCH_SIZE=64
python ${BUILD_SCRIPT} \
--model_dir ${TOKENIZER_DIR} \
--dtype float16 \
--remove_input_padding \
--use_gpt_attention_plugin float16 \
--enable_context_fmha \
--use_gemm_plugin float16 \
--output_dir ${ENGINE_DIR} \
--paged_kv_cache \
--max_batch_size ${MAX_BATCH_SIZE}
```
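After the build completes, it is worth confirming the engine landed where the later steps expect it; a minimal sanity check (exact file names vary with the model and build options):

```bash
ls ${ENGINE_DIR}
# Expect the engine config plus one serialized engine per rank, e.g.:
# config.json  llama_float16_tp1_rank0.engine
```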

> Optional: You can test the output of the model with the `run.py` script
> located in the parent `examples` folder.
>
> ```bash
> python3 /tensorrtllm_backend/tensorrt_llm/examples/run.py --engine_dir=${ENGINE_DIR} --max_output_len 100 --tokenizer_dir ${TOKENIZER_DIR} --input_text "How do I count to ten in French?"
> ```

## Serving with Triton
To run our Llama2-7B model, you will need to:

1. Copy the inflight batcher model repository into a Triton model repository:

```bash
mkdir -p /opt/tritonserver/model_repository
cp -r /tensorrtllm_backend/all_models/inflight_batcher_llm/* /opt/tritonserver/model_repository/.
```
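The copy should leave the five template models side by side in the repository; a quick check (directory names taken from the fill-template commands below):

```bash
ls /opt/tritonserver/model_repository
# ensemble  postprocessing  preprocessing  tensorrt_llm  tensorrt_llm_bls
```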

2. Modify the `config.pbtxt` files for the preprocessing, postprocessing, and processing steps. The following commands do a minimal configuration with the
provided `fill_template.py` script:

```bash
FILL_TEMPLATE_SCRIPT=/tensorrtllm_backend/tools/fill_template.py
MODEL_FOLDER=/opt/tritonserver/model_repository
TOKENIZER_DIR=/Llama-2-7b-hf
TOKENIZER_TYPE="llama"
# The batch size here must match the ${MAX_BATCH_SIZE} used to build the engine
INSTANCE_COUNT=1
BATCHING_STRATEGY="inflight_fused_batching"
BATCHING_STRATEGY="inflight_batching"
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/preprocessing/config.pbtxt \
tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:${TOKENIZER_TYPE},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:${INSTANCE_COUNT}
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/postprocessing/config.pbtxt \
tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:${TOKENIZER_TYPE},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:${INSTANCE_COUNT}
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm_bls/config.pbtxt \
triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,bls_instance_count:${INSTANCE_COUNT},accumulate_tokens:False
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/ensemble/config.pbtxt \
triton_max_batch_size:${MAX_BATCH_SIZE}
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt \
triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_DIR},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:${BATCHING_STRATEGY},max_queue_delay_microseconds:600
```
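`fill_template.py` performs plain `${placeholder}` substitution in the `config.pbtxt` templates, so a value can be spot-checked after filling; a small example (the field name is an assumption based on the template layout):

```bash
# Verify the batch size was substituted into the generated config
grep -m1 "max_batch_size" ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt
# Expected output resembles: max_batch_size: 64
```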

3. Launch Tritonserver
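A typical launch goes through the `launch_triton_server.py` helper shipped in the backend repository; a sketch assuming the paths used throughout this guide:

```bash
# world_size must match the number of GPUs the engine was built for (1 here)
python3 /tensorrtllm_backend/scripts/launch_triton_server.py \
    --world_size=1 \
    --model_repo=/opt/tritonserver/model_repository
```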
You can test the results of the run with:

```bash
# Using the SDK container as an example
docker run --rm -it --net host --shm-size=2g \
--ulimit memlock=-1 --ulimit stack=67108864 --gpus all \
-v /your_path_to/tensorrtllm_backend:/tensorrtllm_backend \
-v /your_path_to/Llama2/repo:/Llama-2-7b-hf \
-v /your_path_to/engines:/engines \
nvcr.io/nvidia/tritonserver:23.11-py3-sdk
# Install extra dependencies for the script
pip3 install transformers sentencepiece
```
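One way to exercise the deployment from the SDK container is the inflight batcher client script that ships with the backend repository; a sketch, assuming the script path and flags match your checkout:

```bash
# Path and flags assumed from the tensorrtllm_backend repository layout; verify locally
python3 /tensorrtllm_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py \
    --request-output-len 200 \
    --tokenizer-dir /Llama-2-7b-hf
```

Alternatively, Triton's `generate` endpoint (`localhost:8000/v2/models/ensemble/generate`) can be hit with `curl` for a quick smoke test.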
