From db888f1aca588a10f5e4a4b02a4e4ff60d437b6f Mon Sep 17 00:00:00 2001 From: AndyDai-nv Date: Fri, 12 Jul 2024 10:33:03 -0700 Subject: [PATCH] Update GAP tutorial of vllm backend (#743) * Update GAP tutorial to be testable --------- Co-authored-by: tgerdes Co-authored-by: Timothy Gerdes <50968584+tgerdesnv@users.noreply.github.com> Co-authored-by: David Yastremsky <58150256+dyastremsky@users.noreply.github.com> --- .../perf_analyzer/genai-perf/docs/tutorial.md | 113 +++++++----------- 1 file changed, 44 insertions(+), 69 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/docs/tutorial.md b/src/c++/perf_analyzer/genai-perf/docs/tutorial.md index 6d6f3e301..1a37baf39 100644 --- a/src/c++/perf_analyzer/genai-perf/docs/tutorial.md +++ b/src/c++/perf_analyzer/genai-perf/docs/tutorial.md @@ -30,57 +30,47 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - [Profile GPT2 running on Triton + TensorRT-LLM](#tensorrt-llm) - [Profile GPT2 running on Triton + vLLM](#triton-vllm) -- [Profile GPT2 running on OpenAI API-Compatible Server](#openai) +- [Profile GPT2 running on OpenAI Chat Completions API-Compatible Server](#openai-chat) +- [Profile GPT2 running on OpenAI Completions API-Compatible Server](#openai-completions) --- ## Profile GPT2 running on Triton + TensorRT-LLM -### Running GPT2 on Triton Inference Server using TensorRT-LLM +### Run GPT2 on Triton Inference Server using TensorRT-LLM
See instructions

-1. Run Triton Inference Server with TensorRT-LLM backend container:
+Run the Triton Inference Server container with the TensorRT-LLM backend:

```bash
-export RELEASE="yy.mm" # e.g. export RELEASE="24.03"
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"

-docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-trtllm-python-py3
-```
-
-2. Install Triton CLI (~5 min):
+docker run -it --net=host --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-trtllm-python-py3

-```bash
+# Install Triton CLI (~5 min):
pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8"
-```

-3. Download model:
-
-```bash
+# Download model:
triton import -m gpt2 --backend tensorrtllm
-```

-4. Run server:
-
-```bash
+# Run server:
triton start
```

</details>
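+
+One way to verify the server came up before profiling is to poll its KServe v2 health endpoints from the host. This is a minimal sketch, assuming Triton's default HTTP port 8000 on localhost and the gpt2 model name imported above:
+
+```bash
+# Check overall server readiness, then readiness of the gpt2 model.
+curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/v2/health/ready      # expect 200
+curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/v2/models/gpt2/ready # expect 200
+```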
-### Running GenAI-Perf
+### Run GenAI-Perf

-1. Run Triton Inference Server SDK container:
+Run GenAI-Perf from the Triton Inference Server SDK container:

```bash
-export RELEASE="yy.mm" # e.g. export RELEASE="24.03"
-
-docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
-```
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"

-2. Run GenAI-Perf:
+docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk

+# Run GenAI-Perf in the container:
-```bash
genai-perf profile \
  -m gpt2 \
@@ -120,51 +110,41 @@ Request throughput (per sec): 4.44

## Profile GPT2 running on Triton + vLLM

-### Running GPT2 on Triton Inference Server using vLLM
+### Run GPT2 on Triton Inference Server using vLLM

<details>
<summary>
See instructions

-1. Run Triton Inference Server with vLLM backend container:
+Run the Triton Inference Server container with the vLLM backend:

```bash
-export RELEASE="yy.mm" # e.g. export RELEASE="24.03"
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"

-docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3
-```
-2. Install Triton CLI (~5 min):

+docker run -it --net=host --gpus=1 --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3

-```bash
+# Install Triton CLI (~5 min):
pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8"
-```
-
-3. Download model:
-```bash
+# Download model:
triton import -m gpt2 --backend vllm
-```
-
-4. Run server:
-```bash
+# Run server:
triton start
```

</details>
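+
+If you script these steps, you can block until the model finishes loading before starting a profiling run. A minimal sketch, assuming Triton's default HTTP port 8000 on localhost and the gpt2 model imported above:
+
+```bash
+# Poll until the gpt2 model reports ready; curl -f exits non-zero on HTTP errors.
+until curl -s -f -o /dev/null localhost:8000/v2/models/gpt2/ready; do
+  sleep 5
+done
+echo "gpt2 is ready for profiling."
+```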
-### Running GenAI-Perf
+### Run GenAI-Perf

-1. Run Triton Inference Server SDK container:
+Run GenAI-Perf from the Triton Inference Server SDK container:

```bash
-export RELEASE="yy.mm" # e.g. export RELEASE="24.03"
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"

-docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
-```
-
-2. Run GenAI-Perf:
+docker run -it --net=host --gpus=1 nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk

+# Run GenAI-Perf in the container:
-```bash
genai-perf profile \
  -m gpt2 \
@@ -202,35 +182,31 @@ Output token throughput (per sec): 290.24
Request throughput (per sec): 2.57
```

-## Profile GPT2 running on OpenAI API-Compatible Server
-
-### OpenAI Chat Completions API
+## Profile GPT2 running on OpenAI Chat Completions API-Compatible Server

-#### Running GPT2 on [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)-compatible server
+### Run GPT2 on [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)-compatible server

<details>
<summary>
See instructions -1. Run the vLLM inference server: +Run the vLLM inference server: ```bash -docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024 +docker run -it --net=host --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024 ```
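+
+A quick smoke test of the endpoint can catch setup problems before a profiling run. This is a sketch, assuming vLLM's default port 8000 on localhost; note that /v1/chat/completions applies a chat template, so whether gpt2 (which does not ship one) works here depends on your vLLM version's fallback behavior:
+
+```bash
+# Send one small chat request and print the raw JSON response.
+curl -s localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+        "model": "gpt2",
+        "messages": [{"role": "user", "content": "Say hello."}],
+        "max_tokens": 16
+      }'
+```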
-#### Running GenAI-Perf
+### Run GenAI-Perf

-1. Run Triton Inference Server SDK container:
+Run GenAI-Perf from the Triton Inference Server SDK container:

```bash
-export RELEASE="yy.mm" # e.g. export RELEASE="24.03"
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"

-docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
-```
-
-2. Run GenAI-Perf:
+docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk

+# Run GenAI-Perf in the container:
-```bash
genai-perf profile \
  -m gpt2 \
@@ -268,33 +244,32 @@ Output token throughput (per sec): 401.62
Request throughput (per sec): 3.52
```

-### OpenAI Completions API
+## Profile GPT2 running on OpenAI Completions API-Compatible Server

-#### Running GPT2 on [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions)-compatible server
+### Run GPT2 on [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions)-compatible server

<details>
<summary>
See instructions -1. Run the vLLM inference server: +Run the vLLM inference server: ```bash -docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024 +docker run -it --net=host --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024 ```
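+
+As with the chat endpoint, a quick smoke test (assuming vLLM's default port 8000 on localhost and the gpt2 model served above) confirms the completions endpoint responds before you profile it:
+
+```bash
+# Send one small completion request and print the raw JSON response.
+curl -s localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "gpt2", "prompt": "The sky is", "max_tokens": 16}'
+```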
-#### Running GenAI-Perf
+### Run GenAI-Perf

-1. Run Triton Inference Server SDK container:
+Run GenAI-Perf from the Triton Inference Server SDK container:

```bash
-export RELEASE="yy.mm" # e.g. export RELEASE="24.03"
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"

-docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
-```
+docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk

-2. Run GenAI-Perf:
+# Run GenAI-Perf in the container:
-```bash
genai-perf profile \
  -m gpt2 \