From 4bbcf0582797805149ca40d55ed40a1fb9c48d8d Mon Sep 17 00:00:00 2001
From: Matthew Kotila
Date: Thu, 28 Sep 2023 01:00:44 +0000
Subject: [PATCH] Add continuous batch size benchmark to LLM guide

---
 src/c++/perf_analyzer/docs/llm.md | 75 +++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/src/c++/perf_analyzer/docs/llm.md b/src/c++/perf_analyzer/docs/llm.md
index a9fad628a..1d73f74ec 100644
--- a/src/c++/perf_analyzer/docs/llm.md
+++ b/src/c++/perf_analyzer/docs/llm.md
@@ -194,3 +194,78 @@ python3 examples/calculate_avg_token_to_token_latency.py
 ```
 
 #### 6. Repeat steps 3-5 with different prompt lengths to measure effects of initial prompt size (prefill) on token-to-token latency (generation).
+
+### Benchmark 3: Profiling Continuous Batch Size
+
+In this benchmarking scenario, we want to measure the effect of continuous
+batch size on token-to-token latency. We systematically issue requests of
+fixed input size to the server and have the model generate a fixed number of
+tokens, so that the continuous batch size grows over time.
+
+#### (Optional) Stop Triton Server if already running
+
+```bash
+pkill tritonserver
+```
+
+#### 1. Run the following commands to set the `max_tokens` to `100` and `ignore_eos` to `true`
+
+```bash
+PATH_TO_MODEL_PY="model_repository/vllm/1/model.py"
+MAX_TOKENS=100
+sed -i "128s/.*/\ \ \ \ \ \ \ \ params_dict[\"max_tokens\"] = ${MAX_TOKENS}/" ${PATH_TO_MODEL_PY}
+sed -i "128s/.*/&\\n\ \ \ \ \ \ \ \ params_dict[\"ignore_eos\"] = True/" ${PATH_TO_MODEL_PY}
+```
+
+#### 2. Start Triton Server
+
+```bash
+docker run --gpus all -it --rm -p 8001:8001 --shm-size=1G --ulimit memlock=-1 --ulimit stack=67108864 -v ${PWD}:/work -w /work tritonserver_vllm tritonserver --model-store ./model_repository
+# this will run continuously in the current shell
+```
+
+#### 3. Generate prompts input data JSON
+
+```bash
+# open a new shell in the same directory you were in when running the above command
+echo '
+{
+  "data": [
+    {
+      "PROMPT": [
+        "Hello, my name is"
+      ],
+      "STREAM": [
+        true
+      ]
+    }
+  ]
+}
+' > prompts.json
+```
+
+#### 4. Run Perf Analyzer
+
+```bash
+perf_analyzer \
+    -m vllm \
+    -i grpc \
+    --async \
+    --streaming \
+    --input-data=prompts.json \
+    --profile-export-file=profile_export.json \
+    --measurement-mode=count_windows \
+    --measurement-request-count=10 \
+    --stability-percentage=999 \
+    --periodic-concurrency-range=1:20:1 \
+    --request-period=10
+```
+
+#### 5. Calculate average token-to-token latency
+
+```bash
+python3 examples/calculate_avg_token_to_token_latency.py
+# Average token-to-token latency: 0.003090155677419355 s
+```
+
+#### 6. Repeat steps 4-5 with different periodic concurrency range start/end/step values and request periods to measure the effect of continuous batch size on token-to-token latency (generation).
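+
+If you want to experiment with different prompt text while repeating these
+steps, the `prompts.json` file from step 3 can also be generated
+programmatically instead of with the `echo` command above. This is only a
+convenience sketch and assumes nothing beyond the input data layout shown in
+step 3:
+
+```python
+import json
+
+# replace with any prompt(s) you want to benchmark
+prompts = ["Hello, my name is"]
+
+# same layout as the hand-written prompts.json in step 3
+input_data = {
+    "data": [{"PROMPT": [prompt], "STREAM": [True]} for prompt in prompts]
+}
+
+with open("prompts.json", "w") as f:
+    json.dump(input_data, f, indent=2)
+```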
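+
+For reference, the averaging performed in step 5 by
+`examples/calculate_avg_token_to_token_latency.py` can be sketched roughly as
+follows. The profile export field names used here (`experiments`, `requests`,
+`response_timestamps`) are assumptions about the export layout rather than a
+documented schema; the bundled example script remains the source of truth:
+
+```python
+import json
+
+with open("profile_export.json") as f:
+    profile = json.load(f)
+
+# Gaps between consecutive streamed responses approximate token-to-token
+# latency, since each streamed response carries one generated token.
+# NOTE: the field names below are assumed; inspect your profile_export.json.
+deltas_ns = []
+for experiment in profile["experiments"]:
+    for request in experiment["requests"]:
+        timestamps = request["response_timestamps"]
+        deltas_ns.extend(b - a for a, b in zip(timestamps, timestamps[1:]))
+
+# timestamps are in nanoseconds; report the mean gap in seconds
+print(f"Average token-to-token latency: {sum(deltas_ns) / len(deltas_ns) / 1e9} s")
+```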