forked from sgl-project/sglang
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Monitoring documentation (sgl-project#1933)
- Loading branch information
1 parent
a71a44f
commit 5bc2508
Showing
4 changed files
with
1,951 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
# Production Metrics | ||
|
||
sglang exposes the following metrics via Prometheus. Every metric carries a `name` label whose value is the served model name (for example, `name="google/gemma-2-9b-it"`). | ||
|
||
An example of the monitoring dashboard is available in [examples/monitoring/grafana.json](../examples/monitoring/grafana.json). | ||
|
||
Here is an example of the metrics: | ||
|
||
``` | ||
# HELP sglang:max_total_num_tokens Maximum total number of tokens | ||
# TYPE sglang:max_total_num_tokens gauge | ||
sglang:max_total_num_tokens{name="google/gemma-2-9b-it"} 161721.0 | ||
# HELP sglang:max_prefill_tokens Maximum prefill tokens | ||
# TYPE sglang:max_prefill_tokens gauge | ||
sglang:max_prefill_tokens{name="google/gemma-2-9b-it"} 16384.0 | ||
# HELP sglang:max_running_requests Maximum running requests | ||
# TYPE sglang:max_running_requests gauge | ||
sglang:max_running_requests{name="google/gemma-2-9b-it"} 4097.0 | ||
# HELP sglang:context_len Context length | ||
# TYPE sglang:context_len gauge | ||
sglang:context_len{name="google/gemma-2-9b-it"} 8192.0 | ||
# HELP sglang:prompt_tokens_total Number of prefill tokens processed. | ||
# TYPE sglang:prompt_tokens_total counter | ||
sglang:prompt_tokens_total{name="google/gemma-2-9b-it"} 506780.0 | ||
# HELP sglang:generation_tokens_total Number of generation tokens processed. | ||
# TYPE sglang:generation_tokens_total counter | ||
sglang:generation_tokens_total{name="google/gemma-2-9b-it"} 424549.0 | ||
# HELP sglang:num_requests_running Number of requests currently running on GPU | ||
# TYPE sglang:num_requests_running gauge | ||
sglang:num_requests_running{name="google/gemma-2-9b-it"} 0.0 | ||
# HELP sglang:num_requests_waiting Number of requests waiting to be processed. | ||
# TYPE sglang:num_requests_waiting gauge | ||
sglang:num_requests_waiting{name="google/gemma-2-9b-it"} 0.0 | ||
# HELP sglang:gen_throughput Gen token throughput (token/s) | ||
# TYPE sglang:gen_throughput gauge | ||
sglang:gen_throughput{name="google/gemma-2-9b-it"} 0.0 | ||
# HELP sglang:token_usage Total token usage | ||
# TYPE sglang:token_usage gauge | ||
sglang:token_usage{name="google/gemma-2-9b-it"} 0.01 | ||
# HELP sglang:new_seq Number of new sequences | ||
# TYPE sglang:new_seq gauge | ||
sglang:new_seq{name="google/gemma-2-9b-it"} 0.0 | ||
# HELP sglang:new_token Number of new token | ||
# TYPE sglang:new_token gauge | ||
sglang:new_token{name="google/gemma-2-9b-it"} 0.0 | ||
# HELP sglang:cached_token Number of cached token | ||
# TYPE sglang:cached_token gauge | ||
sglang:cached_token{name="google/gemma-2-9b-it"} 0.0 | ||
# HELP sglang:cache_hit_rate Cache hit rate | ||
# TYPE sglang:cache_hit_rate gauge | ||
sglang:cache_hit_rate{name="google/gemma-2-9b-it"} 10.61 | ||
# HELP sglang:queue_req Number of queued requests | ||
# TYPE sglang:queue_req gauge | ||
sglang:queue_req{name="google/gemma-2-9b-it"} 0.0 | ||
# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds. | ||
# TYPE sglang:time_to_first_token_seconds histogram | ||
sglang:time_to_first_token_seconds_sum{name="google/gemma-2-9b-it"} 656.0780844688416 | ||
sglang:time_to_first_token_seconds_bucket{le="0.001",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:time_to_first_token_seconds_bucket{le="0.005",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:time_to_first_token_seconds_bucket{le="0.01",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:time_to_first_token_seconds_bucket{le="0.02",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:time_to_first_token_seconds_bucket{le="0.04",name="google/gemma-2-9b-it"} 207.0 | ||
sglang:time_to_first_token_seconds_bucket{le="0.06",name="google/gemma-2-9b-it"} 456.0 | ||
sglang:time_to_first_token_seconds_bucket{le="0.08",name="google/gemma-2-9b-it"} 598.0 | ||
sglang:time_to_first_token_seconds_bucket{le="0.1",name="google/gemma-2-9b-it"} 707.0 | ||
sglang:time_to_first_token_seconds_bucket{le="0.25",name="google/gemma-2-9b-it"} 1187.0 | ||
sglang:time_to_first_token_seconds_bucket{le="0.5",name="google/gemma-2-9b-it"} 1350.0 | ||
sglang:time_to_first_token_seconds_bucket{le="0.75",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_bucket{le="1.0",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_bucket{le="2.5",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_bucket{le="5.0",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_bucket{le="7.5",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_bucket{le="10.0",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_bucket{le="15.0",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_bucket{le="20.0",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_bucket{le="25.0",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_bucket{le="30.0",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_bucket{le="+Inf",name="google/gemma-2-9b-it"} 2124.0 | ||
sglang:time_to_first_token_seconds_count{name="google/gemma-2-9b-it"} 2124.0 | ||
# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds. | ||
# TYPE sglang:time_per_output_token_seconds histogram | ||
sglang:time_per_output_token_seconds_sum{name="google/gemma-2-9b-it"} 29846.5393948555 | ||
sglang:time_per_output_token_seconds_bucket{le="0.005",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.01",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.015",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.02",name="google/gemma-2-9b-it"} 9602.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.025",name="google/gemma-2-9b-it"} 30060.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.03",name="google/gemma-2-9b-it"} 39184.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.04",name="google/gemma-2-9b-it"} 61387.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.05",name="google/gemma-2-9b-it"} 78835.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.075",name="google/gemma-2-9b-it"} 139394.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.1",name="google/gemma-2-9b-it"} 422029.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.15",name="google/gemma-2-9b-it"} 422029.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.2",name="google/gemma-2-9b-it"} 422029.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.3",name="google/gemma-2-9b-it"} 422424.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.4",name="google/gemma-2-9b-it"} 422424.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.5",name="google/gemma-2-9b-it"} 422425.0 | ||
sglang:time_per_output_token_seconds_bucket{le="0.75",name="google/gemma-2-9b-it"} 422425.0 | ||
sglang:time_per_output_token_seconds_bucket{le="1.0",name="google/gemma-2-9b-it"} 422425.0 | ||
sglang:time_per_output_token_seconds_bucket{le="2.5",name="google/gemma-2-9b-it"} 422425.0 | ||
sglang:time_per_output_token_seconds_bucket{le="+Inf",name="google/gemma-2-9b-it"} 422425.0 | ||
sglang:time_per_output_token_seconds_count{name="google/gemma-2-9b-it"} 422425.0 | ||
# HELP sglang:request_prompt_tokens Number of prefill tokens processed | ||
# TYPE sglang:request_prompt_tokens histogram | ||
sglang:request_prompt_tokens_sum{name="google/gemma-2-9b-it"} 500552.0 | ||
sglang:request_prompt_tokens_bucket{le="1.0",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:request_prompt_tokens_bucket{le="2.0",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:request_prompt_tokens_bucket{le="5.0",name="google/gemma-2-9b-it"} 22.0 | ||
sglang:request_prompt_tokens_bucket{le="10.0",name="google/gemma-2-9b-it"} 191.0 | ||
sglang:request_prompt_tokens_bucket{le="20.0",name="google/gemma-2-9b-it"} 511.0 | ||
sglang:request_prompt_tokens_bucket{le="50.0",name="google/gemma-2-9b-it"} 825.0 | ||
sglang:request_prompt_tokens_bucket{le="100.0",name="google/gemma-2-9b-it"} 997.0 | ||
sglang:request_prompt_tokens_bucket{le="200.0",name="google/gemma-2-9b-it"} 1182.0 | ||
sglang:request_prompt_tokens_bucket{le="500.0",name="google/gemma-2-9b-it"} 1748.0 | ||
sglang:request_prompt_tokens_bucket{le="1000.0",name="google/gemma-2-9b-it"} 2102.0 | ||
sglang:request_prompt_tokens_bucket{le="2000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_prompt_tokens_bucket{le="5000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_prompt_tokens_bucket{le="10000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_prompt_tokens_bucket{le="20000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_prompt_tokens_bucket{le="50000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_prompt_tokens_bucket{le="100000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_prompt_tokens_bucket{le="+Inf",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_prompt_tokens_count{name="google/gemma-2-9b-it"} 2104.0 | ||
# HELP sglang:request_generation_tokens Number of generation tokens processed. | ||
# TYPE sglang:request_generation_tokens histogram | ||
sglang:request_generation_tokens_sum{name="google/gemma-2-9b-it"} 424529.0 | ||
sglang:request_generation_tokens_bucket{le="1.0",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:request_generation_tokens_bucket{le="2.0",name="google/gemma-2-9b-it"} 0.0 | ||
sglang:request_generation_tokens_bucket{le="5.0",name="google/gemma-2-9b-it"} 49.0 | ||
sglang:request_generation_tokens_bucket{le="10.0",name="google/gemma-2-9b-it"} 202.0 | ||
sglang:request_generation_tokens_bucket{le="20.0",name="google/gemma-2-9b-it"} 448.0 | ||
sglang:request_generation_tokens_bucket{le="50.0",name="google/gemma-2-9b-it"} 814.0 | ||
sglang:request_generation_tokens_bucket{le="100.0",name="google/gemma-2-9b-it"} 979.0 | ||
sglang:request_generation_tokens_bucket{le="200.0",name="google/gemma-2-9b-it"} 1266.0 | ||
sglang:request_generation_tokens_bucket{le="500.0",name="google/gemma-2-9b-it"} 1883.0 | ||
sglang:request_generation_tokens_bucket{le="1000.0",name="google/gemma-2-9b-it"} 2095.0 | ||
sglang:request_generation_tokens_bucket{le="2000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_generation_tokens_bucket{le="5000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_generation_tokens_bucket{le="10000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_generation_tokens_bucket{le="20000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_generation_tokens_bucket{le="50000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_generation_tokens_bucket{le="100000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_generation_tokens_bucket{le="+Inf",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:request_generation_tokens_count{name="google/gemma-2-9b-it"} 2104.0 | ||
# HELP sglang:e2e_request_latency_seconds Histogram of End-to-end request latency in seconds | ||
# TYPE sglang:e2e_request_latency_seconds histogram | ||
sglang:e2e_request_latency_seconds_sum{name="google/gemma-2-9b-it"} 70517.99934530258 | ||
sglang:e2e_request_latency_seconds_bucket{le="1.0",name="google/gemma-2-9b-it"} 2.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="2.0",name="google/gemma-2-9b-it"} 21.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="5.0",name="google/gemma-2-9b-it"} 54.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="10.0",name="google/gemma-2-9b-it"} 311.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="20.0",name="google/gemma-2-9b-it"} 733.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="50.0",name="google/gemma-2-9b-it"} 1563.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="100.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="200.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="500.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="1000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="2000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="5000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="10000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="20000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="50000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="100000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_bucket{le="+Inf",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:e2e_request_latency_seconds_count{name="google/gemma-2-9b-it"} 2104.0 | ||
# HELP sglang:waiting_request_latency_seconds Histogram of request waiting time in seconds | ||
# TYPE sglang:waiting_request_latency_seconds histogram | ||
sglang:waiting_request_latency_seconds_sum{name="google/gemma-2-9b-it"} 24885.007263183594 | ||
sglang:waiting_request_latency_seconds_bucket{le="1.0",name="google/gemma-2-9b-it"} 421.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="2.0",name="google/gemma-2-9b-it"} 563.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="5.0",name="google/gemma-2-9b-it"} 900.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="10.0",name="google/gemma-2-9b-it"} 1270.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="20.0",name="google/gemma-2-9b-it"} 1623.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="50.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="100.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="200.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="500.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="1000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="2000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="5000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="10000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="20000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="50000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="100000.0",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_bucket{le="+Inf",name="google/gemma-2-9b-it"} 2104.0 | ||
sglang:waiting_request_latency_seconds_count{name="google/gemma-2-9b-it"} 2104.0 | ||
``` | ||
|
||
## Setup Guide | ||
|
||
To set up a monitoring dashboard, you can use the following Docker Compose file: [examples/monitoring/docker-compose.yaml](../examples/monitoring/docker-compose.yaml). | ||
|
||
The steps below assume you have an sglang server running at `localhost:30000`. | ||
|
||
To start the monitoring dashboard (prometheus + grafana), cd to `examples/monitoring` and run: | ||
|
||
```bash | ||
docker compose -f docker-compose.yaml -p monitoring up | ||
``` | ||
|
||
Then you can access the Grafana dashboard at http://localhost:3000. | ||
|
||
### Grafana Dashboard | ||
|
||
To import the Grafana dashboard, click `+` -> `Import` -> `Upload JSON file` -> `Upload` and select [grafana.json](../examples/monitoring/grafana.json). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Monitoring stack: Prometheus plus Grafana.
#
# Both containers use host networking, so the ports they listen on
# (9090 for Prometheus, 3000 for Grafana) are exposed directly on the host.
# `ports:` mappings must not be combined with `network_mode: host`, so none
# are declared here.
services:
  prometheus:
    image: prom/prometheus:latest
    # NOTE(review): host networking presumably lets Prometheus scrape an
    # sglang server listening on the host's localhost; confirm against
    # prometheus.yaml.
    network_mode: host
    volumes:
      # Mount the scrape configuration from the current working directory.
      - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml

  grafana:
    image: grafana/grafana:latest
    network_mode: host
    # Start Grafana only after the Prometheus container has been started.
    depends_on:
      - prometheus
Oops, something went wrong.