From 0686a7c47d6933423ead476fda20433adb2eb14f Mon Sep 17 00:00:00 2001
From: Yingge He
Date: Mon, 29 Jul 2024 16:03:15 -0700
Subject: [PATCH 01/34] Add first supported metrics

---
 src/metrics.py | 301 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/model.py | 7 ++
 2 files changed, 308 insertions(+)
 create mode 100644 src/metrics.py

diff --git a/src/metrics.py b/src/metrics.py
new file mode 100644
index 00000000..d7862133
--- /dev/null
+++ b/src/metrics.py
@@ -0,0 +1,301 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from typing import Dict, Union
+
+import triton_python_backend_utils as pb_utils
+from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
+from vllm.engine.metrics import Stats as VllmStats
+from vllm.engine.metrics import SupportsMetricsInfo
+
+
+# begin-metrics-definitions
+class TritonMetrics:
+    def __init__(self, labels):
+        # System stats
+        # Scheduler State
+        self.gauge_scheduler_running_family = pb_utils.MetricFamily(
+            name="vllm:num_requests_running",
+            description="Number of requests currently running on GPU.",
+            kind=pb_utils.MetricFamily.GAUGE,
+        )
+        self.gauge_scheduler_waiting_family = pb_utils.MetricFamily(
+            name="vllm:num_requests_waiting",
+            description="Number of requests waiting to be processed.",
+            kind=pb_utils.MetricFamily.GAUGE,
+        )
+        self.gauge_scheduler_swapped_family = pb_utils.MetricFamily(
+            name="vllm:num_requests_swapped",
+            description="Number of requests swapped to CPU.",
+            kind=pb_utils.MetricFamily.GAUGE,
+        )
+        # KV Cache Usage in %
+        self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily(
+            name="vllm:gpu_cache_usage_perc",
+            description="GPU KV-cache usage. 1 means 100 percent usage.",
+            kind=pb_utils.MetricFamily.GAUGE,
+        )
+        self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily(
+            name="vllm:cpu_cache_usage_perc",
+            description="CPU KV-cache usage. 1 means 100 percent usage.",
+            kind=pb_utils.MetricFamily.GAUGE,
+        )
+
+        # Iteration stats
+        self.counter_num_preemption_family = pb_utils.MetricFamily(
+            name="vllm:num_preemptions_total",
+            description="Cumulative number of preemption from the engine.",
+            kind=pb_utils.MetricFamily.COUNTER,
+        )
+        self.counter_prompt_tokens_family = pb_utils.MetricFamily(
+            name="vllm:prompt_tokens_total",
+            description="Number of prefill tokens processed.",
+            kind=pb_utils.MetricFamily.COUNTER,
+        )
+        self.counter_generation_tokens_family = pb_utils.MetricFamily(
+            name="vllm:generation_tokens_total",
+            description="Number of generation tokens processed.",
+            kind=pb_utils.MetricFamily.COUNTER,
+        )
+        # self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
+        #     name="vllm:time_to_first_token_seconds",
+        #     description="Histogram of time to first token in seconds.",
+        #     kind=pb_utils.MetricFamily.HISTOGRAM,
+        #     buckets=[
+        #         0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+        #         0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+        #     ])
+        # self.histogram_time_per_output_token_family = pb_utils.MetricFamily(
+        #     name="vllm:time_per_output_token_seconds",
+        #     description="Histogram of time per output token in seconds.",
+        #     kind=pb_utils.MetricFamily.HISTOGRAM,
+        #     buckets=[
+        #         0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+        #         1.0, 2.5
+        #     ])
+
+        # Request stats
+        # Latency
+        # self.histogram_e2e_time_request_family = pb_utils.MetricFamily(
+        #     name="vllm:e2e_request_latency_seconds",
+        #     description="Histogram of end to end request latency in seconds.",
+        #     kind=pb_utils.MetricFamily.HISTOGRAM,
+        #     buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+        # # Metadata
+        # self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily(
+        #     name="vllm:request_prompt_tokens",
+        #     description="Number of prefill tokens processed.",
+        #     kind=pb_utils.MetricFamily.HISTOGRAM,
+        #     buckets=build_1_2_5_buckets(max_model_len),
+        # )
+        # self.histogram_num_generation_tokens_request_family = \
+        #     pb_utils.MetricFamily(
+        #         name="vllm:request_generation_tokens",
+        #         description="Number of generation tokens processed.",
+        #         kind=pb_utils.MetricFamily.HISTOGRAM,
+        #         buckets=build_1_2_5_buckets(max_model_len),
+        #     )
+        # self.histogram_best_of_request_family = pb_utils.MetricFamily(
+        #     name="vllm:request_params_best_of",
+        #     description="Histogram of the best_of request parameter.",
+        #     kind=pb_utils.MetricFamily.HISTOGRAM,
+        #     buckets=[1, 2, 5, 10, 20],
+        # )
+        # self.histogram_n_request_family = pb_utils.MetricFamily(
+        #     name="vllm:request_params_n",
+        #     description="Histogram of the n request parameter.",
+        #     kind=pb_utils.MetricFamily.HISTOGRAM,
+        #     buckets=[1, 2, 5, 10, 20],
+        # )
+        # self.counter_request_success_family = pb_utils.MetricFamily(
+        #     name="vllm:request_success_total",
+        #     description="Count of successfully processed requests.",
+        #     kind=pb_utils.MetricFamily.COUNTER)
+
+        # Speculatie decoding stats
+        # self.gauge_spec_decode_draft_acceptance_rate_family = pb_utils.MetricFamily(
+        #     name="vllm:spec_decode_draft_acceptance_rate",
+        #     description="Speculative token acceptance rate.",
+        #     kind=pb_utils.MetricFamily.GAUGE)
+        # self.gauge_spec_decode_efficiency_family = pb_utils.MetricFamily(
+        #     name="vllm:spec_decode_efficiency",
+        #     description="Speculative decoding system efficiency.",
+        #     kind=pb_utils.MetricFamily.GAUGE)
+        # self.counter_spec_decode_num_accepted_tokens_family = pb_utils.MetricFamily(
+        #     name="vllm:spec_decode_num_accepted_tokens_total",
+        #     description="Number of accepted tokens.",
+        #     kind=pb_utils.MetricFamily.COUNTER)
+        # self.counter_spec_decode_num_draft_tokens_family = pb_utils.MetricFamily(
+        #     name="vllm:spec_decode_num_draft_tokens_total",
+        #     description="Number of draft tokens.",
+        #     kind=pb_utils.MetricFamily.COUNTER)
+        # self.counter_spec_decode_num_emitted_tokens_family = pb_utils.MetricFamily(
+        #     name="vllm:spec_decode_num_emitted_tokens_total",
+        #     description="Number of emitted tokens.",
+        #     kind=pb_utils.MetricFamily.COUNTER)
+
+        # System stats
+        # Scheduler State
+        self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric(
+            labels=labels
+        )
+        self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric(
+            labels=labels
+        )
+        self.gauge_scheduler_swapped = self.gauge_scheduler_swapped_family.Metric(
+            labels=labels
+        )
+        # KV Cache Usage in %
+        self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric(
+            labels=labels
+        )
+        self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric(
+            labels=labels
+        )
+
+        # Iteration stats
+        self.counter_num_preemption = self.counter_num_preemption_family.Metric(
+            labels=labels
+        )
+        self.counter_prompt_tokens = self.counter_prompt_tokens_family.Metric(
+            labels=labels
+        )
+        self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
+            labels=labels
+        )
+        # self.histogram_time_to_first_token = self.histogram_time_to_first_token_family.Metric(
+        #     labels=labels
+        # )
+        # self.histogram_time_per_output_token = self.histogram_time_per_output_token_family.Metric(
+        #     labels=labels
+        # )
+
+        # Request stats
+        # Latency
+        # self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
+        #     labels=labels
+        # )
+        # # Metadata
+        # self.histogram_num_prompt_tokens_request = self.histogram_num_prompt_tokens_request_family.Metric(
+        #     labels=labels
+        # )
+        # self.histogram_num_generation_tokens_request = self.histogram_num_generation_tokens_request_family.Metric(
+        #     labels=labels
+        # )
+        # self.histogram_best_of_request = self.histogram_best_of_request_family.Metric(
+        #     labels=labels
+        # )
+        # self.histogram_n_request = self.histogram_n_request_family.Metric(
+        #     labels=labels
+        # )
+        # self.counter_request_success = self.counter_request_success_family.Metric(
+        #     labels=labels
+        # )
+
+        # Speculatie decoding stats
+        # self.gauge_spec_decode_draft_acceptance_rate_ = self.gauge_spec_decode_draft_acceptance_rate_family.Metric(
+        #     labels=labels
+        # )
+        # self.gauge_spec_decode_efficiency = self.gauge_spec_decode_efficiency_family.Metric(
+        #     labels=labels
+        # )
+        # self.counter_spec_decode_num_accepted_tokens = self.counter_spec_decode_num_accepted_tokens_family.Metric(
+        #     labels=labels
+        # )
+        # self.counter_spec_decode_num_draft_tokens = self.counter_spec_decode_num_draft_tokens_family.Metric(
+        #     labels=labels
+        # )
+        # self.counter_spec_decode_num_emitted_tokens = self.counter_spec_decode_num_emitted_tokens_family.Metric(
+        #     labels=labels
+        # )
+
+
+class VllmStatLogger(VllmStatLoggerBase):
+    """StatLoggeris used as adapter between vLLM stats collector and Triton metrics provider."""
+
+    # local_interval not used here. It's for vLLM logs to stdout.
+    def __init__(self, labels: Dict, local_interval: float = 0) -> None:
+        # Tracked stats over current local logging interval.
+        super().__init__(local_interval)
+        self.metrics = TritonMetrics(labels=labels)
+
+    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
+        raise NotImplementedError
+
+    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
+        # Convenience function for logging to gauge.
+        gauge.set(data)
+
+    def _log_counter(self, counter, data: Union[int, float]) -> None:
+        # Convenience function for logging to counter.
+        counter.increment(data)
+
+    # def _log_histogram(self, histogram, data: Union[List[int],
+    #                                                 List[float]]) -> None:
+    #     # Convenience function for logging list to histogram.
+    #     for datum in data:
+    #         histogram.labels(**self.labels).observe(datum)
+
+    def log(self, stats: VllmStats) -> None:
+        # self.maybe_update_spec_decode_metrics(stats)
+
+        # System state data
+        self._log_gauge(self.metrics.gauge_scheduler_running, stats.num_running_sys)
+        self._log_gauge(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys)
+        self._log_gauge(self.metrics.gauge_scheduler_swapped, stats.num_swapped_sys)
+        self._log_gauge(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys)
+        self._log_gauge(self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys)
+
+        # Iteration level data
+        self._log_counter(
+            self.metrics.counter_num_preemption, stats.num_preemption_iter
+        )
+        self._log_counter(
+            self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
+        )
+        self._log_counter(
+            self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
+        )
+        # self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter)
+        # self._log_histogram(self.metrics.histogram_time_per_output_token, stats.time_per_output_tokens_iter)
+
+        # Request level data
+        # Latency
+        # self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests)
+        # Metadata
+        # self._log_histogram(self.metrics.histogram_num_prompt_tokens_request, stats.num_prompt_tokens_requests)
+        # self._log_histogram(self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests)
+        # self._log_histogram(self.metrics.histogram_best_of_request, stats.best_of_requests)
+        # self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
+        # self._log_histogram(self.metrics.counter_request_success, stats.finished_reason_requests)
+
+        # Speculatie decoding stats
+        # if self.spec_decode_metrics is not None:
+        #     self._log_gauge(self.metrics.gauge_spec_decode_draft_acceptance_rate, self.spec_decode_metrics.draft_acceptance_rate)
+        #     self._log_gauge(self.metrics.gauge_spec_decode_efficiency, self.spec_decode_metrics.system_efficiency)
+        #     self._log_counter(self.metrics.counter_spec_decode_num_accepted_tokens, self.spec_decode_metrics.accepted_tokens)
+        #     self._log_counter(self.metrics.counter_spec_decode_num_draft_tokens, self.spec_decode_metrics.draft_tokens)
+        #     self._log_counter(self.metrics.counter_spec_decode_num_emitted_tokens, self.spec_decode_metrics.emitted_tokens)
diff --git a/src/model.py b/src/model.py
index 3fe7cd1e..4c86dfdf 100644
--- a/src/model.py
+++ b/src/model.py
@@ -39,6 +39,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 
+from metrics import VllmStatLogger
+
 _VLLM_ENGINE_ARGS_FILENAME = "model.json"
 _MULTI_LORA_ARGS_FILENAME = "multi_lora.json"
 
@@ -151,6 +153,11 @@ def init_engine(self):
             AsyncEngineArgs(**self.vllm_engine_config)
         )
 
+        # Create vLLM custom Metrics
+        labels = {"model": "vllm_metrics", "version": "1"}
+        logger = VllmStatLogger(vllm_labels=labels)
+        self.llm_engine.add_logger("triton", logger)
+
     def setup_lora(self):
         self.enable_lora = False

From 21e235660920dc93cd0e9462ee217008c5ce25de Mon Sep 17 00:00:00 2001
From: Yingge He
Date: Tue, 30 Jul 2024 11:14:47 -0700
Subject: [PATCH 02/34] Update comments

---
 src/metrics.py | 2 +-
 src/model.py | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/metrics.py b/src/metrics.py
index d7862133..55a7807d 100644
--- a/src/metrics.py
+++ b/src/metrics.py
@@ -234,7 +234,7 @@
 
 
 class VllmStatLogger(VllmStatLoggerBase):
-    """StatLoggeris used as adapter between vLLM stats collector and Triton metrics provider."""
+    """StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider."""
 
     # local_interval not used here. It's for vLLM logs to stdout.
     def __init__(self, labels: Dict, local_interval: float = 0) -> None:
diff --git a/src/model.py b/src/model.py
index 4c86dfdf..89073979 100644
--- a/src/model.py
+++ b/src/model.py
@@ -154,8 +154,11 @@ def init_engine(self):
         )
 
         # Create vLLM custom Metrics
-        labels = {"model": "vllm_metrics", "version": "1"}
-        logger = VllmStatLogger(vllm_labels=labels)
+        labels = {
+            "model": self.args["model_name"],
+            "version": self.args["model_version"],
+        }
+        logger = VllmStatLogger(labels=labels)
         self.llm_engine.add_logger("triton", logger)
 
     def setup_lora(self):

From d95bb2c349ef79caeb8131b0e8c802a1b84c35da Mon Sep 17 00:00:00 2001
From: Yingge He
Date: Thu, 1 Aug 2024 16:18:10 -0700
Subject: [PATCH 03/34] Minor update

---
 src/metrics.py | 155 ++-----------------------------------------------
 1 file changed, 5 insertions(+), 150 deletions(-)

diff --git a/src/metrics.py b/src/metrics.py
index 55a7807d..640545ed 100644
--- a/src/metrics.py
+++ b/src/metrics.py
@@ -24,7 +24,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-from typing import Dict, Union +from typing import Dict, List, Union import triton_python_backend_utils as pb_utils from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase @@ -32,7 +32,6 @@ from vllm.engine.metrics import SupportsMetricsInfo -# begin-metrics-definitions class TritonMetrics: def __init__(self, labels): # System stats @@ -80,82 +79,6 @@ def __init__(self, labels): description="Number of generation tokens processed.", kind=pb_utils.MetricFamily.COUNTER, ) - # self.histogram_time_to_first_token_family = pb_utils.MetricFamily( - # name="vllm:time_to_first_token_seconds", - # description="Histogram of time to first token in seconds.", - # kind=pb_utils.MetricFamily.HISTOGRAM, - # buckets=[ - # 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - # 0.75, 1.0, 2.5, 5.0, 7.5, 10.0 - # ]) - # self.histogram_time_per_output_token_family = pb_utils.MetricFamily( - # name="vllm:time_per_output_token_seconds", - # description="Histogram of time per output token in seconds.", - # kind=pb_utils.MetricFamily.HISTOGRAM, - # buckets=[ - # 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, - # 1.0, 2.5 - # ]) - - # Request stats - # Latency - # self.histogram_e2e_time_request_family = pb_utils.MetricFamily( - # name="vllm:e2e_request_latency_seconds", - # description="Histogram of end to end request latency in seconds.", - # kind=pb_utils.MetricFamily.HISTOGRAM, - # buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) - # # Metadata - # self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily( - # name="vllm:request_prompt_tokens", - # description="Number of prefill tokens processed.", - # kind=pb_utils.MetricFamily.HISTOGRAM, - # buckets=build_1_2_5_buckets(max_model_len), - # ) - # self.histogram_num_generation_tokens_request_family = \ - # pb_utils.MetricFamily( - # name="vllm:request_generation_tokens", - # description="Number of generation tokens processed.", - # kind=pb_utils.MetricFamily.HISTOGRAM, - # buckets=build_1_2_5_buckets(max_model_len), - # ) - # self.histogram_best_of_request_family = pb_utils.MetricFamily( - # name="vllm:request_params_best_of", - # description="Histogram of the best_of request parameter.", - # kind=pb_utils.MetricFamily.HISTOGRAM, - # buckets=[1, 2, 5, 10, 20], - # ) - # self.histogram_n_request_family = pb_utils.MetricFamily( - # name="vllm:request_params_n", - # description="Histogram of the n request parameter.", - # kind=pb_utils.MetricFamily.HISTOGRAM, - # buckets=[1, 2, 5, 10, 20], - # ) - # self.counter_request_success_family = pb_utils.MetricFamily( - # name="vllm:request_success_total", - # description="Count of successfully processed requests.", - # kind=pb_utils.MetricFamily.COUNTER) - - # Speculatie decoding stats - # self.gauge_spec_decode_draft_acceptance_rate_family = pb_utils.MetricFamily( - # name="vllm:spec_decode_draft_acceptance_rate", - # description="Speculative token acceptance rate.", - # kind=pb_utils.MetricFamily.GAUGE) - # self.gauge_spec_decode_efficiency_family = pb_utils.MetricFamily( - # name="vllm:spec_decode_efficiency", - # description="Speculative decoding system efficiency.", - # kind=pb_utils.MetricFamily.GAUGE) - # self.counter_spec_decode_num_accepted_tokens_family = pb_utils.MetricFamily( - # name="vllm:spec_decode_num_accepted_tokens_total", - # description="Number of accepted tokens.", - # kind=pb_utils.MetricFamily.COUNTER) - # self.counter_spec_decode_num_draft_tokens_family = pb_utils.MetricFamily( - # name="vllm:spec_decode_num_draft_tokens_total", 
- # description="Number of draft tokens.", - # kind=pb_utils.MetricFamily.COUNTER) - # self.counter_spec_decode_num_emitted_tokens_family = pb_utils.MetricFamily( - # name="vllm:spec_decode_num_emitted_tokens_total", - # description="Number of emitted tokens.", - # kind=pb_utils.MetricFamily.COUNTER) # System stats # Scheduler State @@ -186,51 +109,6 @@ def __init__(self, labels): self.counter_generation_tokens = self.counter_generation_tokens_family.Metric( labels=labels ) - # self.histogram_time_to_first_token = self.histogram_time_to_first_token_family.Metric( - # labels=labels - # ) - # self.histogram_time_per_output_token = self.histogram_time_per_output_token_family.Metric( - # labels=labels - # ) - - # Request stats - # Latency - # self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric( - # labels=labels - # ) - # # Metadata - # self.histogram_num_prompt_tokens_request = self.histogram_num_prompt_tokens_request_family.Metric( - # labels=labels - # ) - # self.histogram_num_generation_tokens_request = self.histogram_num_generation_tokens_request_family.Metric( - # labels=labels - # ) - # self.histogram_best_of_request = self.histogram_best_of_request_family.Metric( - # labels=labels - # ) - # self.histogram_n_request = self.histogram_n_request_family.Metric( - # labels=labels - # ) - # self.counter_request_success = self.counter_request_success_family.Metric( - # labels=labels - # ) - - # Speculatie decoding stats - # self.gauge_spec_decode_draft_acceptance_rate_ = self.gauge_spec_decode_draft_acceptance_rate_family.Metric( - # labels=labels - # ) - # self.gauge_spec_decode_efficiency = self.gauge_spec_decode_efficiency_family.Metric( - # labels=labels - # ) - # self.counter_spec_decode_num_accepted_tokens = self.counter_spec_decode_num_accepted_tokens_family.Metric( - # labels=labels - # ) - # self.counter_spec_decode_num_draft_tokens = self.counter_spec_decode_num_draft_tokens_family.Metric( - # labels=labels - # ) - # self.counter_spec_decode_num_emitted_tokens = self.counter_spec_decode_num_emitted_tokens_family.Metric( - # labels=labels - # ) class VllmStatLogger(VllmStatLoggerBase): @@ -253,15 +131,12 @@ def _log_counter(self, counter, data: Union[int, float]) -> None: # Convenience function for logging to counter. counter.increment(data) - # def _log_histogram(self, histogram, data: Union[List[int], - # List[float]]) -> None: - # # Convenience function for logging list to histogram. - # for datum in data: - # histogram.labels(**self.labels).observe(datum) + def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None: + # Convenience function for logging list to histogram. 
+ for datum in data: + histogram.observe(datum) def log(self, stats: VllmStats) -> None: - # self.maybe_update_spec_decode_metrics(stats) - # System state data self._log_gauge(self.metrics.gauge_scheduler_running, stats.num_running_sys) self._log_gauge(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys) @@ -279,23 +154,3 @@ def log(self, stats: VllmStats) -> None: self._log_counter( self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter ) - # self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter) - # self._log_histogram(self.metrics.histogram_time_per_output_token, stats.time_per_output_tokens_iter) - - # Request level data - # Latency - # self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests) - # Metadata - # self._log_histogram(self.metrics.histogram_num_prompt_tokens_request, stats.num_prompt_tokens_requests) - # self._log_histogram(self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests) - # self._log_histogram(self.metrics.histogram_best_of_request, stats.best_of_requests) - # self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) - # self._log_histogram(self.metrics.counter_request_success, stats.finished_reason_requests) - - # Speculatie decoding stats - # if self.spec_decode_metrics is not None: - # self._log_gauge(self.metrics.gauge_spec_decode_draft_acceptance_rate, self.spec_decode_metrics.draft_acceptance_rate) - # self._log_gauge(self.metrics.gauge_spec_decode_efficiency, self.spec_decode_metrics.system_efficiency) - # self._log_counter(self.metrics.counter_spec_decode_num_accepted_tokens, self.spec_decode_metrics.accepted_tokens) - # self._log_counter(self.metrics.counter_spec_decode_num_draft_tokens, self.spec_decode_metrics.draft_tokens) - # self._log_counter(self.metrics.counter_spec_decode_num_emitted_tokens, self.spec_decode_metrics.emitted_tokens) From 321faa070a3a3cd3742546ec4a956913072f3df0 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 2 Aug 2024 18:29:51 -0700 Subject: [PATCH 04/34] Add metrics test --- README.md | 4 +- ci/L0_backend_vllm/metrics_test/test.sh | 98 ++++++++++ .../metrics_test/vllm_metrics_test.py | 171 ++++++++++++++++++ ci/L0_backend_vllm/test.sh | 2 +- src/model.py | 2 +- src/{ => utils}/metrics.py | 0 6 files changed, 274 insertions(+), 3 deletions(-) create mode 100755 ci/L0_backend_vllm/metrics_test/test.sh create mode 100644 ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py rename src/{ => utils}/metrics.py (100%) diff --git a/README.md b/README.md index 13953f58..802f4f4c 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,9 @@ container with the following commands: ``` mkdir -p /opt/tritonserver/backends/vllm -wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/main/src/model.py +git clone https://github.com/triton-inference-server/vllm_backend.git /opt/tritonserver/backends/vllm/vllm_backend +cp -r /opt/tritonserver/backends/vllm/vllm_backend/src/* /opt/tritonserver/backends/vllm +rm -rf /opt/tritonserver/backends/vllm/vllm_backend ``` ## Using the vLLM Backend diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh new file mode 100755 index 00000000..fc109b9b --- /dev/null +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +source ../../common/util.sh + +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends +SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_opt --log-verbose=1" +SERVER_LOG="./vllm_metrics_server.log" +CLIENT_LOG="./vllm_metrics_client.log" +TEST_RESULT_FILE='test_results.txt' +CLIENT_PY="./vllm_metrics_test.py" +SAMPLE_MODELS_REPO="../../../samples/model_repository" +EXPECTED_NUM_TESTS=1 + +# Helpers ======================================= +function assert_curl_success { + message="${1}" + if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***" + RET=1 + fi +} + +rm -rf models && mkdir -p models +cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt +# `vllm_opt`` model will be loaded on server start and stay loaded throughout +# unittesting. To ensure that vllm's memory profiler will not error out +# on `vllm_load_test` load, we reduce "gpu_memory_utilization" for `vllm_opt`, +# so that at least 60% of GPU memory was available for other models. +sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json + +RET=0 + +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +set +e +python3 $CLIENT_PY -v > $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification FAILED.\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID +rm -rf "./models" + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** vLLM test FAILED. \n***" +else + echo -e "\n***\n*** vLLM test PASSED. 
\n***" +fi + +collect_artifacts_from_subdir +exit $RET diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py new file mode 100644 index 00000000..5569969c --- /dev/null +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -0,0 +1,171 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import re +import sys +import unittest +from functools import partial + +import requests +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + +sys.path.append("../../common") +from test_util import TestResultCollector, UserData, callback, create_vllm_request + +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + +PROMPTS = [ + "The most dangerous animal is", + "The capital of France is", + "The future of AI is", +] +SAMPLING_PARAMETERS = {"temperature": "0", "top_p": "1"} + + +def get_metrics(): + """ + Store vllm metrics in a dictionary. 
+ """ + r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics") + r.raise_for_status() + + # Regular expression to match the pattern + pattern = r"^(vllm:.*){.*} (\d+)$" + vllm_dict = {} + + # Find all matches in the text + matches = re.findall(pattern, r.text, re.MULTILINE) + + for match in matches: + key, value = match + vllm_dict[key] = int(value) + + return vllm_dict + + +class VLLMTritonMetricsTest(TestResultCollector): + def setUp(self): + self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001") + self.vllm_model_name = "vllm_opt" + + def test_vllm_metrics(self): + # Supported vLLM metrics + expected_metrics_dict = { + "vllm:num_requests_running": 0, + "vllm:num_requests_waiting": 0, + "vllm:num_requests_swapped": 0, + "vllm:gpu_cache_usage_perc": 0, + "vllm:cpu_cache_usage_perc": 0, + "vllm:num_preemptions_total": 0, + "vllm:prompt_tokens_total": 0, + "vllm:generation_tokens_total": 0, + } + + # Test vLLM metrics + self._test_vllm_model( + prompts=PROMPTS, + sampling_parameters=SAMPLING_PARAMETERS, + stream=False, + send_parameters_as_tensor=True, + model_name=self.vllm_model_name, + ) + expected_metrics_dict["vllm:prompt_tokens_total"] = 18 + expected_metrics_dict["vllm:generation_tokens_total"] = 48 + print(get_metrics()) + print(expected_metrics_dict) + self.assertEqual(get_metrics(), expected_metrics_dict) + + self._test_vllm_model( + prompts=PROMPTS, + sampling_parameters=SAMPLING_PARAMETERS, + stream=False, + send_parameters_as_tensor=False, + model_name=self.vllm_model_name, + ) + expected_metrics_dict["vllm:prompt_tokens_total"] = 36 + expected_metrics_dict["vllm:generation_tokens_total"] = 96 + self.assertEqual(get_metrics(), expected_metrics_dict) + + def _test_vllm_model( + self, + prompts, + sampling_parameters, + stream, + send_parameters_as_tensor, + exclude_input_in_output=None, + expected_output=None, + model_name="vllm_opt", + ): + user_data = UserData() + number_of_vllm_reqs = len(prompts) + + self.triton_client.start_stream(callback=partial(callback, user_data)) + for i in range(number_of_vllm_reqs): + request_data = create_vllm_request( + prompts[i], + i, + stream, + sampling_parameters, + model_name, + send_parameters_as_tensor, + exclude_input_in_output=exclude_input_in_output, + ) + self.triton_client.async_stream_infer( + model_name=model_name, + request_id=request_data["request_id"], + inputs=request_data["inputs"], + outputs=request_data["outputs"], + parameters=sampling_parameters, + ) + + for i in range(number_of_vllm_reqs): + result = user_data._completed_requests.get() + if type(result) is InferenceServerException: + print(result.message()) + self.assertIsNot(type(result), InferenceServerException, str(result)) + + output = result.as_numpy("text_output") + self.assertIsNotNone(output, "`text_output` should not be None") + if expected_output is not None: + self.assertEqual( + output, + expected_output[i], + 'Actual and expected outputs do not match.\n \ + Expected "{}" \n Actual:"{}"'.format( + output, expected_output[i] + ), + ) + + self.triton_client.stop_stream() + + def tearDown(self): + self.triton_client.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/ci/L0_backend_vllm/test.sh b/ci/L0_backend_vllm/test.sh index 93d065c8..a9f89894 100755 --- a/ci/L0_backend_vllm/test.sh +++ b/ci/L0_backend_vllm/test.sh @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
RET=0 -SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend" +SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend metrics_test" python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc] diff --git a/src/model.py b/src/model.py index 89073979..f250f86c 100644 --- a/src/model.py +++ b/src/model.py @@ -39,7 +39,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid -from metrics import VllmStatLogger +from utils.metrics import VllmStatLogger _VLLM_ENGINE_ARGS_FILENAME = "model.json" _MULTI_LORA_ARGS_FILENAME = "multi_lora.json" diff --git a/src/metrics.py b/src/utils/metrics.py similarity index 100% rename from src/metrics.py rename to src/utils/metrics.py From 468539ff649cd6b7d282a25389e30062239f9387 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Mon, 5 Aug 2024 11:45:11 -0700 Subject: [PATCH 05/34] Fix copyright --- ci/L0_backend_vllm/metrics_test/test.sh | 2 +- ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh index fc109b9b..6509b13c 100755 --- a/ci/L0_backend_vllm/metrics_test/test.sh +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index 5569969c..0e0477a4 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -1,4 +1,4 @@ -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions From 8eba2f034cff5ad1aaab3d78ef0b5e943bee431f Mon Sep 17 00:00:00 2001 From: Yingge He Date: Tue, 6 Aug 2024 12:04:12 -0700 Subject: [PATCH 06/34] Remove unused metrics and update comments --- .../metrics_test/vllm_metrics_test.py | 75 ++++++------- src/utils/metrics.py | 102 +++++------------- 2 files changed, 60 insertions(+), 117 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index 0e0477a4..ed231f32 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -37,79 +37,68 @@ sys.path.append("../../common") from test_util import TestResultCollector, UserData, callback, create_vllm_request -_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") -PROMPTS = [ - "The most dangerous animal is", - "The capital of France is", - "The future of AI is", -] -SAMPLING_PARAMETERS = {"temperature": "0", "top_p": "1"} - - -def get_metrics(): - """ - Store vllm metrics in a dictionary. 
- """ - r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics") - r.raise_for_status() - - # Regular expression to match the pattern - pattern = r"^(vllm:.*){.*} (\d+)$" - vllm_dict = {} +class VLLMTritonMetricsTest(TestResultCollector): + def setUp(self): + self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001") + self.tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + self.vllm_model_name = "vllm_opt" + self.prompts = [ + "The most dangerous animal is", + "The capital of France is", + "The future of AI is", + ] + self.sampling_parameters = {"temperature": "0", "top_p": "1"} - # Find all matches in the text - matches = re.findall(pattern, r.text, re.MULTILINE) + def get_metrics(self): + """ + Store vllm metrics in a dictionary. + """ + r = requests.get(f"http://{self.tritonserver_ipaddr}:8002/metrics") + r.raise_for_status() - for match in matches: - key, value = match - vllm_dict[key] = int(value) + # Regular expression to match the pattern + pattern = r"^(vllm:.*){.*} (\d+)$" + vllm_dict = {} - return vllm_dict + # Find all matches in the text + matches = re.findall(pattern, r.text, re.MULTILINE) + for match in matches: + key, value = match + vllm_dict[key] = int(value) -class VLLMTritonMetricsTest(TestResultCollector): - def setUp(self): - self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001") - self.vllm_model_name = "vllm_opt" + return vllm_dict def test_vllm_metrics(self): # Supported vLLM metrics expected_metrics_dict = { - "vllm:num_requests_running": 0, - "vllm:num_requests_waiting": 0, - "vllm:num_requests_swapped": 0, - "vllm:gpu_cache_usage_perc": 0, - "vllm:cpu_cache_usage_perc": 0, - "vllm:num_preemptions_total": 0, "vllm:prompt_tokens_total": 0, "vllm:generation_tokens_total": 0, } # Test vLLM metrics self._test_vllm_model( - prompts=PROMPTS, - sampling_parameters=SAMPLING_PARAMETERS, + prompts=self.prompts, + sampling_parameters=self.sampling_parameters, stream=False, send_parameters_as_tensor=True, model_name=self.vllm_model_name, ) expected_metrics_dict["vllm:prompt_tokens_total"] = 18 expected_metrics_dict["vllm:generation_tokens_total"] = 48 - print(get_metrics()) - print(expected_metrics_dict) - self.assertEqual(get_metrics(), expected_metrics_dict) + self.assertEqual(self.get_metrics(), expected_metrics_dict) self._test_vllm_model( - prompts=PROMPTS, - sampling_parameters=SAMPLING_PARAMETERS, + prompts=self.prompts, + sampling_parameters=self.sampling_parameters, stream=False, send_parameters_as_tensor=False, model_name=self.vllm_model_name, ) expected_metrics_dict["vllm:prompt_tokens_total"] = 36 expected_metrics_dict["vllm:generation_tokens_total"] = 96 - self.assertEqual(get_metrics(), expected_metrics_dict) + self.assertEqual(self.get_metrics(), expected_metrics_dict) def _test_vllm_model( self, diff --git a/src/utils/metrics.py b/src/utils/metrics.py index 640545ed..ff78ddf4 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -34,41 +34,8 @@ class TritonMetrics: def __init__(self, labels): - # System stats - # Scheduler State - self.gauge_scheduler_running_family = pb_utils.MetricFamily( - name="vllm:num_requests_running", - description="Number of requests currently running on GPU.", - kind=pb_utils.MetricFamily.GAUGE, - ) - self.gauge_scheduler_waiting_family = pb_utils.MetricFamily( - name="vllm:num_requests_waiting", - description="Number of requests waiting to be processed.", - kind=pb_utils.MetricFamily.GAUGE, - ) - self.gauge_scheduler_swapped_family = 
pb_utils.MetricFamily( - name="vllm:num_requests_swapped", - description="Number of requests swapped to CPU.", - kind=pb_utils.MetricFamily.GAUGE, - ) - # KV Cache Usage in % - self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily( - name="vllm:gpu_cache_usage_perc", - description="GPU KV-cache usage. 1 means 100 percent usage.", - kind=pb_utils.MetricFamily.GAUGE, - ) - self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily( - name="vllm:cpu_cache_usage_perc", - description="CPU KV-cache usage. 1 means 100 percent usage.", - kind=pb_utils.MetricFamily.GAUGE, - ) - + # Initialize metric families # Iteration stats - self.counter_num_preemption_family = pb_utils.MetricFamily( - name="vllm:num_preemptions_total", - description="Cumulative number of preemption from the engine.", - kind=pb_utils.MetricFamily.COUNTER, - ) self.counter_prompt_tokens_family = pb_utils.MetricFamily( name="vllm:prompt_tokens_total", description="Number of prefill tokens processed.", @@ -80,29 +47,8 @@ def __init__(self, labels): kind=pb_utils.MetricFamily.COUNTER, ) - # System stats - # Scheduler State - self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric( - labels=labels - ) - self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric( - labels=labels - ) - self.gauge_scheduler_swapped = self.gauge_scheduler_swapped_family.Metric( - labels=labels - ) - # KV Cache Usage in % - self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric( - labels=labels - ) - self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric( - labels=labels - ) - + # Initialize metrics # Iteration stats - self.counter_num_preemption = self.counter_num_preemption_family.Metric( - labels=labels - ) self.counter_prompt_tokens = self.counter_prompt_tokens_family.Metric( labels=labels ) @@ -124,30 +70,38 @@ def info(self, type: str, obj: SupportsMetricsInfo) -> None: raise NotImplementedError def _log_gauge(self, gauge, data: Union[int, float]) -> None: - # Convenience function for logging to gauge. + """Convenience function for logging to gauge. + + Args: + gauge: A gauge metric instance. + data: An int or float to set the gauge metric. + + Returns: + None + """ gauge.set(data) def _log_counter(self, counter, data: Union[int, float]) -> None: - # Convenience function for logging to counter. - counter.increment(data) + """Convenience function for logging to counter. + + Args: + counter: A counter metric instance. + data: An int or float to increment the count metric. - def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None: - # Convenience function for logging list to histogram. - for datum in data: - histogram.observe(datum) + Returns: + None + """ + counter.increment(data) def log(self, stats: VllmStats) -> None: - # System state data - self._log_gauge(self.metrics.gauge_scheduler_running, stats.num_running_sys) - self._log_gauge(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys) - self._log_gauge(self.metrics.gauge_scheduler_swapped, stats.num_swapped_sys) - self._log_gauge(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys) - self._log_gauge(self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys) - - # Iteration level data - self._log_counter( - self.metrics.counter_num_preemption, stats.num_preemption_iter - ) + """Logs tracked stats to triton metrics server every iteration. + + Args: + stats: Created by LLMEngine for use by VllmStatLogger. 
+ + Returns: + None + """ self._log_counter( self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter ) From 6f97f6f9097bb8be59e9882ba442c13994aeefc9 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Tue, 6 Aug 2024 15:15:20 -0700 Subject: [PATCH 07/34] Minor update --- ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py | 4 ++-- src/utils/metrics.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index ed231f32..9c69ccab 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -58,7 +58,7 @@ def get_metrics(self): r.raise_for_status() # Regular expression to match the pattern - pattern = r"^(vllm:.*){.*} (\d+)$" + pattern = r"^(vllm:[^ {]+)(?:{.*})? ([0-9.-]+)$" vllm_dict = {} # Find all matches in the text @@ -71,7 +71,7 @@ def get_metrics(self): return vllm_dict def test_vllm_metrics(self): - # Supported vLLM metrics + # All vLLM metrics from tritonserver expected_metrics_dict = { "vllm:prompt_tokens_total": 0, "vllm:generation_tokens_total": 0, diff --git a/src/utils/metrics.py b/src/utils/metrics.py index ff78ddf4..24ce4eae 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -91,7 +91,8 @@ def _log_counter(self, counter, data: Union[int, float]) -> None: Returns: None """ - counter.increment(data) + if data != 0: + counter.increment(data) def log(self, stats: VllmStats) -> None: """Logs tracked stats to triton metrics server every iteration. From bf7669ee9efb32566642cb6286940a6e0642f647 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Tue, 6 Aug 2024 15:32:58 -0700 Subject: [PATCH 08/34] Minor updates --- .../metrics_test/vllm_metrics_test.py | 81 +++++++++---------- 1 file changed, 36 insertions(+), 45 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index 9c69ccab..56fb009c 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -70,46 +70,17 @@ def get_metrics(self): return vllm_dict - def test_vllm_metrics(self): - # All vLLM metrics from tritonserver - expected_metrics_dict = { - "vllm:prompt_tokens_total": 0, - "vllm:generation_tokens_total": 0, - } - - # Test vLLM metrics - self._test_vllm_model( - prompts=self.prompts, - sampling_parameters=self.sampling_parameters, - stream=False, - send_parameters_as_tensor=True, - model_name=self.vllm_model_name, - ) - expected_metrics_dict["vllm:prompt_tokens_total"] = 18 - expected_metrics_dict["vllm:generation_tokens_total"] = 48 - self.assertEqual(self.get_metrics(), expected_metrics_dict) - - self._test_vllm_model( - prompts=self.prompts, - sampling_parameters=self.sampling_parameters, - stream=False, - send_parameters_as_tensor=False, - model_name=self.vllm_model_name, - ) - expected_metrics_dict["vllm:prompt_tokens_total"] = 36 - expected_metrics_dict["vllm:generation_tokens_total"] = 96 - self.assertEqual(self.get_metrics(), expected_metrics_dict) - - def _test_vllm_model( + def vllm_async_stream_infer( self, prompts, sampling_parameters, stream, send_parameters_as_tensor, - exclude_input_in_output=None, - expected_output=None, - model_name="vllm_opt", + model_name, ): + """ + Helper function to send async stream infer requests to vLLM. 
+ """ user_data = UserData() number_of_vllm_reqs = len(prompts) @@ -122,7 +93,6 @@ def _test_vllm_model( sampling_parameters, model_name, send_parameters_as_tensor, - exclude_input_in_output=exclude_input_in_output, ) self.triton_client.async_stream_infer( model_name=model_name, @@ -132,7 +102,7 @@ def _test_vllm_model( parameters=sampling_parameters, ) - for i in range(number_of_vllm_reqs): + for _ in range(number_of_vllm_reqs): result = user_data._completed_requests.get() if type(result) is InferenceServerException: print(result.message()) @@ -140,18 +110,39 @@ def _test_vllm_model( output = result.as_numpy("text_output") self.assertIsNotNone(output, "`text_output` should not be None") - if expected_output is not None: - self.assertEqual( - output, - expected_output[i], - 'Actual and expected outputs do not match.\n \ - Expected "{}" \n Actual:"{}"'.format( - output, expected_output[i] - ), - ) self.triton_client.stop_stream() + def test_vllm_metrics(self): + # All vLLM metrics from tritonserver + expected_metrics_dict = { + "vllm:prompt_tokens_total": 0, + "vllm:generation_tokens_total": 0, + } + + # Test vLLM metrics + self.vllm_async_stream_infer( + prompts=self.prompts, + sampling_parameters=self.sampling_parameters, + stream=False, + send_parameters_as_tensor=True, + model_name=self.vllm_model_name, + ) + expected_metrics_dict["vllm:prompt_tokens_total"] = 18 + expected_metrics_dict["vllm:generation_tokens_total"] = 48 + self.assertEqual(self.get_metrics(), expected_metrics_dict) + + self.vllm_async_stream_infer( + prompts=self.prompts, + sampling_parameters=self.sampling_parameters, + stream=False, + send_parameters_as_tensor=False, + model_name=self.vllm_model_name, + ) + expected_metrics_dict["vllm:prompt_tokens_total"] = 36 + expected_metrics_dict["vllm:generation_tokens_total"] = 96 + self.assertEqual(self.get_metrics(), expected_metrics_dict) + def tearDown(self): self.triton_client.close() From e9d0dbbfbfc1951a9b2e494a073b0a93fd345449 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Tue, 6 Aug 2024 21:55:09 -0700 Subject: [PATCH 09/34] Minor fix --- .../metrics_test/vllm_metrics_test.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index 56fb009c..ad0cf990 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -66,7 +66,7 @@ def get_metrics(self): for match in matches: key, value = match - vllm_dict[key] = int(value) + vllm_dict[key] = float(value) if "." 
in value else int(value) return vllm_dict @@ -132,17 +132,6 @@ def test_vllm_metrics(self): expected_metrics_dict["vllm:generation_tokens_total"] = 48 self.assertEqual(self.get_metrics(), expected_metrics_dict) - self.vllm_async_stream_infer( - prompts=self.prompts, - sampling_parameters=self.sampling_parameters, - stream=False, - send_parameters_as_tensor=False, - model_name=self.vllm_model_name, - ) - expected_metrics_dict["vllm:prompt_tokens_total"] = 36 - expected_metrics_dict["vllm:generation_tokens_total"] = 96 - self.assertEqual(self.get_metrics(), expected_metrics_dict) - def tearDown(self): self.triton_client.close() From 7d0dc5bf3884379522300362ee259d73420fcfe5 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Wed, 7 Aug 2024 11:48:46 -0700 Subject: [PATCH 10/34] Remove unused module --- src/utils/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/metrics.py b/src/utils/metrics.py index 24ce4eae..e8c58372 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -24,7 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from typing import Dict, List, Union +from typing import Dict, Union import triton_python_backend_utils as pb_utils from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase From 979dc02bc979362f7f28a39b0cd75320fb905b88 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Wed, 7 Aug 2024 17:06:51 -0700 Subject: [PATCH 11/34] Fix "metrics not supported error" when building with TRITON_ENABLE_METRICS=OFF flag --- src/model.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/model.py b/src/model.py index f250f86c..76dac922 100644 --- a/src/model.py +++ b/src/model.py @@ -153,13 +153,15 @@ def init_engine(self): AsyncEngineArgs(**self.vllm_engine_config) ) - # Create vLLM custom Metrics - labels = { - "model": self.args["model_name"], - "version": self.args["model_version"], - } - logger = VllmStatLogger(labels=labels) - self.llm_engine.add_logger("triton", logger) + # If TRITON_ENABLE_METRICS<_CPU/_GPU> build flag is enabled. 
+ if self.args["metrics_mode"] in ["all", "cpu", "gpu"]: + # Create vLLM custom Metrics + labels = { + "model": self.args["model_name"], + "version": self.args["model_version"], + } + logger = VllmStatLogger(labels=labels) + self.llm_engine.add_logger("triton", logger) def setup_lora(self): self.enable_lora = False From 3dd04c5373bb9b37642c61923e799df0bff14e65 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 8 Aug 2024 15:54:53 -0700 Subject: [PATCH 12/34] Fix "metrics not supported error" when building with TRITON_ENABLE_METRICS=OFF flag --- src/model.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/model.py b/src/model.py index 76dac922..650faccd 100644 --- a/src/model.py +++ b/src/model.py @@ -109,6 +109,20 @@ def initialize(self, args): ) self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + # Create vLLM custom metrics + try: + labels = { + "model": self.args["model_name"], + "version": self.args["model_version"], + } + self.metrics = VllmStatLogger(labels=labels) + except pb_utils.TritonModelException as e: + if "metrics not supported" in str(e): + # Metrics are disabled at the server + self.metrics = None + else: + raise e + # Prepare vLLM engine self.init_engine() @@ -153,15 +167,9 @@ def init_engine(self): AsyncEngineArgs(**self.vllm_engine_config) ) - # If TRITON_ENABLE_METRICS<_CPU/_GPU> build flag is enabled. - if self.args["metrics_mode"] in ["all", "cpu", "gpu"]: - # Create vLLM custom Metrics - labels = { - "model": self.args["model_name"], - "version": self.args["model_version"], - } - logger = VllmStatLogger(labels=labels) - self.llm_engine.add_logger("triton", logger) + # Add vLLM custom metrics + if not self.metrics: + self.llm_engine.add_logger("triton", self.metrics) def setup_lora(self): self.enable_lora = False From 07f2575fc6d6ab9a2c23ce936f29659c266a15fd Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 8 Aug 2024 16:31:29 -0700 Subject: [PATCH 13/34] Simply test --- .../metrics_test/vllm_metrics_test.py | 14 +++++--------- src/model.py | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index ad0cf990..8284835b 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -70,12 +70,10 @@ def get_metrics(self): return vllm_dict - def vllm_async_stream_infer( + def vllm_infer( self, prompts, sampling_parameters, - stream, - send_parameters_as_tensor, model_name, ): """ @@ -89,15 +87,15 @@ def vllm_async_stream_infer( request_data = create_vllm_request( prompts[i], i, - stream, + False, sampling_parameters, model_name, - send_parameters_as_tensor, + True, ) self.triton_client.async_stream_infer( model_name=model_name, - request_id=request_data["request_id"], inputs=request_data["inputs"], + request_id=request_data["request_id"], outputs=request_data["outputs"], parameters=sampling_parameters, ) @@ -121,11 +119,9 @@ def test_vllm_metrics(self): } # Test vLLM metrics - self.vllm_async_stream_infer( + self.vllm_infer( prompts=self.prompts, sampling_parameters=self.sampling_parameters, - stream=False, - send_parameters_as_tensor=True, model_name=self.vllm_model_name, ) expected_metrics_dict["vllm:prompt_tokens_total"] = 18 diff --git a/src/model.py b/src/model.py index 650faccd..5e77e602 100644 --- a/src/model.py +++ b/src/model.py @@ -168,7 +168,7 @@ def init_engine(self): ) # Add vLLM custom 
metrics - if not self.metrics: + if self.metrics: self.llm_engine.add_logger("triton", self.metrics) def setup_lora(self): From 21351453547a55e9ac458db4c5d9d491841d1df3 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 9 Aug 2024 07:49:23 -0700 Subject: [PATCH 14/34] Completely turn off metrics --- .../model_repository/vllm_model/1/model.json | 5 ++- src/model.py | 41 ++++++++++--------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/samples/model_repository/vllm_model/1/model.json b/samples/model_repository/vllm_model/1/model.json index 6eb5e070..bd644613 100644 --- a/samples/model_repository/vllm_model/1/model.json +++ b/samples/model_repository/vllm_model/1/model.json @@ -1,6 +1,7 @@ { "model":"facebook/opt-125m", - "disable_log_requests": "true", + "disable_log_requests": true, "gpu_memory_utilization": 0.5, - "enforce_eager": "true" + "enforce_eager": true, + "disable_log_stats": false } diff --git a/src/model.py b/src/model.py index 5e77e602..c9e1cc49 100644 --- a/src/model.py +++ b/src/model.py @@ -109,20 +109,6 @@ def initialize(self, args): ) self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) - # Create vLLM custom metrics - try: - labels = { - "model": self.args["model_name"], - "version": self.args["model_version"], - } - self.metrics = VllmStatLogger(labels=labels) - except pb_utils.TritonModelException as e: - if "metrics not supported" in str(e): - # Metrics are disabled at the server - self.metrics = None - else: - raise e - # Prepare vLLM engine self.init_engine() @@ -163,13 +149,28 @@ def init_engine(self): self.setup_lora() # Create an AsyncLLMEngine from the config from JSON - self.llm_engine = AsyncLLMEngine.from_engine_args( - AsyncEngineArgs(**self.vllm_engine_config) - ) + aync_engine_args = AsyncEngineArgs(**self.vllm_engine_config) + self.llm_engine = AsyncLLMEngine.from_engine_args(aync_engine_args) - # Add vLLM custom metrics - if self.metrics: - self.llm_engine.add_logger("triton", self.metrics) + # Create vLLM custom metrics + if not aync_engine_args.disable_log_stats: + try: + labels = { + "model": self.args["model_name"], + "version": self.args["model_version"], + } + self.metrics = VllmStatLogger(labels=labels) + except pb_utils.TritonModelException as e: + if "metrics not supported" in str(e): + # Metrics are disabled at the server + self.metrics = None + self.logger.log_info("[vllm] Metrics not supported") + else: + raise e + + # Add vLLM custom metrics + if self.metrics: + self.llm_engine.add_logger("triton", self.metrics) def setup_lora(self): self.enable_lora = False From 56aea05e92d755a161d2578d34c290cdb01d5ada Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 9 Aug 2024 11:22:20 -0700 Subject: [PATCH 15/34] Add vLLM disable_log_stats config test --- ci/L0_backend_vllm/metrics_test/test.sh | 33 ++++++++++++++++++- .../metrics_test/vllm_metrics_test.py | 27 ++++++++++----- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh index 6509b13c..e597b702 100755 --- a/ci/L0_backend_vllm/metrics_test/test.sh +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -58,6 +58,7 @@ sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/v RET=0 +# test vLLM metrics run_server if [ "$SERVER_PID" == "0" ]; then cat $SERVER_LOG @@ -66,7 +67,37 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python3 $CLIENT_PY -v > $CLIENT_LOG 2>&1 +python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics -v > $CLIENT_LOG 2>&1 
+ +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification FAILED.\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# test disabling vLLM metrics with disable_log_stats set to true +sed -i 's/"disable_log_stats": false/"disable_log_stats": true/' models/vllm_opt/1/model.json + +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +set +e +python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then cat $CLIENT_LOG diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index 8284835b..48c5da5c 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -50,7 +50,7 @@ def setUp(self): ] self.sampling_parameters = {"temperature": "0", "top_p": "1"} - def get_metrics(self): + def get_vllm_metrics(self): """ Store vllm metrics in a dictionary. """ @@ -112,21 +112,30 @@ def vllm_infer( self.triton_client.stop_stream() def test_vllm_metrics(self): - # All vLLM metrics from tritonserver - expected_metrics_dict = { - "vllm:prompt_tokens_total": 0, - "vllm:generation_tokens_total": 0, - } + # Test vLLM metrics + self.vllm_infer( + prompts=self.prompts, + sampling_parameters=self.sampling_parameters, + model_name=self.vllm_model_name, + ) + metrics_dict = self.get_vllm_metrics() + # vllm:prompt_tokens_total + self.assertEqual(metrics_dict["vllm:prompt_tokens_total"], 18) + # vllm:generation_tokens_total + self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48) + + def test_vllm_metrics_disabled(self): # Test vLLM metrics self.vllm_infer( prompts=self.prompts, sampling_parameters=self.sampling_parameters, model_name=self.vllm_model_name, ) - expected_metrics_dict["vllm:prompt_tokens_total"] = 18 - expected_metrics_dict["vllm:generation_tokens_total"] = 48 - self.assertEqual(self.get_metrics(), expected_metrics_dict) + metrics_dict = self.get_vllm_metrics() + + # No vLLM metric found + self.assertEqual(len(metrics_dict), 0) def tearDown(self): self.triton_client.close() From 0dadc8ef5e67950a526b648c83654ba029545431 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 9 Aug 2024 11:52:58 -0700 Subject: [PATCH 16/34] Test metrics are enabled by default if disable_log_stats is not set. 
--- ci/L0_backend_vllm/metrics_test/test.sh | 37 +++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh index e597b702..990a42a2 100755 --- a/ci/L0_backend_vllm/metrics_test/test.sh +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -58,7 +58,7 @@ sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/v RET=0 -# test vLLM metrics +# Test vLLM metrics run_server if [ "$SERVER_PID" == "0" ]; then cat $SERVER_LOG @@ -86,7 +86,7 @@ set -e kill $SERVER_PID wait $SERVER_PID -# test disabling vLLM metrics with disable_log_stats set to true +# Test disabling vLLM metrics with disable_log_stats set to true sed -i 's/"disable_log_stats": false/"disable_log_stats": true/' models/vllm_opt/1/model.json run_server @@ -115,7 +115,38 @@ set -e kill $SERVER_PID wait $SERVER_PID -rm -rf "./models" + +# Test vLLM metrics if disable_log_stats is not set in model.json +jq 'del(.disable_log_stats)' models/vllm_opt/1/model.json > "temp.json" +mv temp.json models/vllm_opt/1/model.json + +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +set +e +python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics -v > $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification FAILED.\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID +rm -rf "./models" "temp.json" if [ $RET -eq 1 ]; then cat $CLIENT_LOG From 8d8fd2adecffe7c416709247877918200afc49ed Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 9 Aug 2024 12:19:52 -0700 Subject: [PATCH 17/34] Update tests based on comments --- ci/L0_backend_vllm/metrics_test/test.sh | 8 ++++---- ci/L0_backend_vllm/vllm_backend/test.sh | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh index 990a42a2..9340ded7 100755 --- a/ci/L0_backend_vllm/metrics_test/test.sh +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -50,7 +50,7 @@ function assert_curl_success { rm -rf models && mkdir -p models cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt -# `vllm_opt`` model will be loaded on server start and stay loaded throughout +# `vllm_opt` model will be loaded on server start and stay loaded throughout # unittesting. To ensure that vllm's memory profiler will not error out # on `vllm_load_test` load, we reduce "gpu_memory_utilization" for `vllm_opt`, # so that at least 60% of GPU memory was available for other models. @@ -71,7 +71,7 @@ python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics -v > $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then cat $CLIENT_LOG - echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***" + echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics FAILED. \n***" RET=1 else check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS @@ -101,7 +101,7 @@ python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT if [ $? -ne 0 ]; then cat $CLIENT_LOG - echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***" + echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled FAILED. 
\n***" RET=1 else check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS @@ -132,7 +132,7 @@ python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics -v > $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then cat $CLIENT_LOG - echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***" + echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics FAILED. \n***" RET=1 else check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS diff --git a/ci/L0_backend_vllm/vllm_backend/test.sh b/ci/L0_backend_vllm/vllm_backend/test.sh index a6dd0aa7..43b20af7 100755 --- a/ci/L0_backend_vllm/vllm_backend/test.sh +++ b/ci/L0_backend_vllm/vllm_backend/test.sh @@ -50,7 +50,7 @@ function assert_curl_success { rm -rf models && mkdir -p models cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt -# `vllm_opt`` model will be loaded on server start and stay loaded throughout +# `vllm_opt` model will be loaded on server start and stay loaded throughout # unittesting. To test vllm model load/unload we use a dedicated # `vllm_load_test`. To ensure that vllm's memory profiler will not error out # on `vllm_load_test` load, we reduce "gpu_memory_utilization" for `vllm_opt`, From 4f2e217151b04ea3f8e402e8ba0ac6e11a0d0e7f Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 9 Aug 2024 14:22:02 -0700 Subject: [PATCH 18/34] Remove _log_gauge --- src/utils/metrics.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/utils/metrics.py b/src/utils/metrics.py index e8c58372..3b486c10 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -69,18 +69,6 @@ def __init__(self, labels: Dict, local_interval: float = 0) -> None: def info(self, type: str, obj: SupportsMetricsInfo) -> None: raise NotImplementedError - def _log_gauge(self, gauge, data: Union[int, float]) -> None: - """Convenience function for logging to gauge. - - Args: - gauge: A gauge metric instance. - data: An int or float to set the gauge metric. - - Returns: - None - """ - gauge.set(data) - def _log_counter(self, counter, data: Union[int, float]) -> None: """Convenience function for logging to counter. From d22fd03dc781490efdbf9b6edab1fc4a1f5330ae Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 9 Aug 2024 14:56:44 -0700 Subject: [PATCH 19/34] Resolve comments --- README.md | 9 +++++---- src/model.py | 8 ++------ src/utils/metrics.py | 2 +- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 802f4f4c..6f347a8c 100644 --- a/README.md +++ b/README.md @@ -110,10 +110,11 @@ In this case, please install vLLM first. 
You can do so by running container with the following commands: ``` -mkdir -p /opt/tritonserver/backends/vllm -git clone https://github.com/triton-inference-server/vllm_backend.git /opt/tritonserver/backends/vllm/vllm_backend -cp -r /opt/tritonserver/backends/vllm/vllm_backend/src/* /opt/tritonserver/backends/vllm -rm -rf /opt/tritonserver/backends/vllm/vllm_backend +vllm_tmp_dir=/tmp/backends/vllm_backend +mkdir -p /opt/tritonserver/backends/vllm $vllm_tmp_dir +git clone https://github.com/triton-inference-server/vllm_backend.git $vllm_tmp_dir +cp -r $vllm_tmp_dir/src/* /opt/tritonserver/backends/vllm +rm -rf $vllm_tmp_dir ``` ## Using the vLLM Backend diff --git a/src/model.py b/src/model.py index c9e1cc49..3e42d56d 100644 --- a/src/model.py +++ b/src/model.py @@ -159,19 +159,15 @@ def init_engine(self): "model": self.args["model_name"], "version": self.args["model_version"], } - self.metrics = VllmStatLogger(labels=labels) + # Add vLLM custom metrics + self.llm_engine.add_logger("triton", VllmStatLogger(labels=labels)) except pb_utils.TritonModelException as e: if "metrics not supported" in str(e): # Metrics are disabled at the server - self.metrics = None self.logger.log_info("[vllm] Metrics not supported") else: raise e - # Add vLLM custom metrics - if self.metrics: - self.llm_engine.add_logger("triton", self.metrics) - def setup_lora(self): self.enable_lora = False diff --git a/src/utils/metrics.py b/src/utils/metrics.py index 3b486c10..36cf73bf 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -67,7 +67,7 @@ def __init__(self, labels: Dict, local_interval: float = 0) -> None: self.metrics = TritonMetrics(labels=labels) def info(self, type: str, obj: SupportsMetricsInfo) -> None: - raise NotImplementedError + pass def _log_counter(self, counter, data: Union[int, float]) -> None: """Convenience function for logging to counter. From 8280d261346ca947ec11e97418da66f1ec2e80a6 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 9 Aug 2024 15:47:24 -0700 Subject: [PATCH 20/34] Update --- README.md | 11 ++++++----- src/utils/metrics.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 6f347a8c..5c89b550 100644 --- a/README.md +++ b/README.md @@ -110,11 +110,12 @@ In this case, please install vLLM first. You can do so by running container with the following commands: ``` -vllm_tmp_dir=/tmp/backends/vllm_backend -mkdir -p /opt/tritonserver/backends/vllm $vllm_tmp_dir -git clone https://github.com/triton-inference-server/vllm_backend.git $vllm_tmp_dir -cp -r $vllm_tmp_dir/src/* /opt/tritonserver/backends/vllm -rm -rf $vllm_tmp_dir +TEMP_TRITON_VLLM_DIR=/tmp/backends/vllm_backend +mkdir -p /opt/tritonserver/backends/vllm $TEMP_TRITON_VLLM_DIR +git clone https://github.com/triton-inference-server/vllm_backend.git $TEMP_TRITON_VLLM_DIR +cp -r $TEMP_TRITON_VLLM_DIR/src/* /opt/tritonserver/backends/vllm +rm -rf $TEMP_TRITON_VLLM_DIR +rm -d /tmp/backends ``` ## Using the vLLM Backend diff --git a/src/utils/metrics.py b/src/utils/metrics.py index 36cf73bf..41604f7c 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -83,7 +83,7 @@ def _log_counter(self, counter, data: Union[int, float]) -> None: counter.increment(data) def log(self, stats: VllmStats) -> None: - """Logs tracked stats to triton metrics server every iteration. + """Logs to triton metrics server every iteration. Args: stats: Created by LLMEngine for use by VllmStatLogger. 
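Before the next commit, it may help to see the metrics wiring that patches 14 through 20 converge on in `src/model.py`. The sketch below is illustrative only, not the final file: the wrapper function name is invented for the example, the `VllmStatLogger` import path is assumed from the `src/utils/metrics.py` layout used in this series, and the exception handling mirrors the behavior shown in the diffs above.
```python
# Sketch of the registration flow the preceding patches converge on.
import triton_python_backend_utils as pb_utils
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Assumed import path, matching the src/utils/metrics.py layout in this series.
from utils.metrics import VllmStatLogger


def build_engine_with_metrics(vllm_engine_config, model_name, model_version, logger):
    # Illustrative helper name; the backend does this inline in init_engine().
    engine_args = AsyncEngineArgs(**vllm_engine_config)
    llm_engine = AsyncLLMEngine.from_engine_args(engine_args)

    # Only attach the Triton stat logger when vLLM itself is collecting stats.
    if not engine_args.disable_log_stats:
        try:
            labels = {"model": model_name, "version": model_version}
            llm_engine.add_logger("triton", VllmStatLogger(labels=labels))
        except pb_utils.TritonModelException as e:
            if "metrics not supported" in str(e):
                # Metrics are disabled at the server.
                logger.log_info("[vllm] Metrics not supported")
            else:
                raise
    return llm_engine
```
The key design point is that the logger is only registered when `disable_log_stats` is false, so vLLM does not pay for stat collection that nothing consumes.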
From 6fa7ae36b032c85ef897d97e59ebdbae8d8b9d6c Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 9 Aug 2024 16:06:33 -0700 Subject: [PATCH 21/34] Change temp directory --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 5c89b550..6269fb58 100644 --- a/README.md +++ b/README.md @@ -110,12 +110,11 @@ In this case, please install vLLM first. You can do so by running container with the following commands: ``` -TEMP_TRITON_VLLM_DIR=/tmp/backends/vllm_backend +TEMP_TRITON_VLLM_DIR=/tmp/vllm_backend mkdir -p /opt/tritonserver/backends/vllm $TEMP_TRITON_VLLM_DIR git clone https://github.com/triton-inference-server/vllm_backend.git $TEMP_TRITON_VLLM_DIR cp -r $TEMP_TRITON_VLLM_DIR/src/* /opt/tritonserver/backends/vllm rm -rf $TEMP_TRITON_VLLM_DIR -rm -d /tmp/backends ``` ## Using the vLLM Backend From 89ca6f48cce89dc875c2f6ec3a30d7ffbd7d8859 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Wed, 14 Aug 2024 19:57:05 -0700 Subject: [PATCH 22/34] Disable metrics report by default. Controlled by parameter "REPORT_METRICS" in config.pbtxt. --- ci/L0_backend_vllm/metrics_test/test.sh | 39 ++++++++++++++----------- src/model.py | 6 +++- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh index 9340ded7..f019bdbd 100755 --- a/ci/L0_backend_vllm/metrics_test/test.sh +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -39,15 +39,6 @@ SAMPLE_MODELS_REPO="../../../samples/model_repository" EXPECTED_NUM_TESTS=1 # Helpers ======================================= -function assert_curl_success { - message="${1}" - if [ "$code" != "200" ]; then - cat ./curl.out - echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***" - RET=1 - fi -} - rm -rf models && mkdir -p models cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt # `vllm_opt` model will be loaded on server start and stay loaded throughout @@ -58,7 +49,7 @@ sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/v RET=0 -# Test vLLM metrics +# Test disabling vLLM metrics reporting without parameter "REPORT_METRICS" in config.pbtxt run_server if [ "$SERVER_PID" == "0" ]; then cat $SERVER_LOG @@ -67,11 +58,11 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics -v > $CLIENT_LOG 2>&1 +python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then cat $CLIENT_LOG - echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics FAILED. \n***" + echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled FAILED. 
\n***" RET=1 else check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS @@ -86,8 +77,15 @@ set -e kill $SERVER_PID wait $SERVER_PID -# Test disabling vLLM metrics with disable_log_stats set to true -sed -i 's/"disable_log_stats": false/"disable_log_stats": true/' models/vllm_opt/1/model.json +# Test disabling vLLM metrics reporting with parameter "REPORT_METRICS" set to "no" in config.pbtxt +echo -e " +parameters: { + key: \"REPORT_METRICS\" + value: { + string_value:\"no\" + } +} +" >> models/vllm_opt/config.pbtxt run_server if [ "$SERVER_PID" == "0" ]; then @@ -116,9 +114,16 @@ set -e kill $SERVER_PID wait $SERVER_PID -# Test vLLM metrics if disable_log_stats is not set in model.json -jq 'del(.disable_log_stats)' models/vllm_opt/1/model.json > "temp.json" -mv temp.json models/vllm_opt/1/model.json +# Test vLLM metrics reporting with parameter "REPORT_METRICS" set to "no" in config.pbtxt +cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt +echo -e " +parameters: { + key: \"REPORT_METRICS\" + value: { + string_value:\"yes\" + } +} +" >> models/vllm_opt/config.pbtxt run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/src/model.py b/src/model.py index ef130333..c3bd5dfa 100644 --- a/src/model.py +++ b/src/model.py @@ -161,7 +161,11 @@ def init_engine(self): self.llm_engine = AsyncLLMEngine.from_engine_args(aync_engine_args) # Create vLLM custom metrics - if not aync_engine_args.disable_log_stats: + if ( + "REPORT_METRICS" in self.model_config["parameters"] + and self.model_config["parameters"]["REPORT_METRICS"]["string_value"] + == "yes" + ): try: labels = { "model": self.args["model_name"], From 1158feeb00eeef719a9b872460ef59ef1fcb3a3b Mon Sep 17 00:00:00 2001 From: Yingge He Date: Wed, 14 Aug 2024 23:11:17 -0700 Subject: [PATCH 23/34] Test server option set --allow-metrics=false --- ci/L0_backend_vllm/metrics_test/test.sh | 31 ++++++++++++++++++- .../metrics_test/vllm_metrics_test.py | 10 ++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh index f019bdbd..cb856f0e 100755 --- a/ci/L0_backend_vllm/metrics_test/test.sh +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -114,7 +114,7 @@ set -e kill $SERVER_PID wait $SERVER_PID -# Test vLLM metrics reporting with parameter "REPORT_METRICS" set to "no" in config.pbtxt +# Test vLLM metrics reporting with parameter "REPORT_METRICS" set to "yes" in config.pbtxt cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt echo -e " parameters: { @@ -149,6 +149,35 @@ else fi set -e +kill $SERVER_PID +wait $SERVER_PID + +# Test enabling vLLM metrics reporting in config.pbtxt while disabling in server option +SERVER_ARGS="${SERVER_ARGS} --allow-metrics=false" +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +set +e +python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_refused -v > $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_refused FAILED. \n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification FAILED.\n***" + RET=1 + fi +fi +set -e + kill $SERVER_PID wait $SERVER_PID rm -rf "./models" "temp.json" diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index 48c5da5c..d2059057 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -137,6 +137,16 @@ def test_vllm_metrics_disabled(self): # No vLLM metric found self.assertEqual(len(metrics_dict), 0) + def test_vllm_metrics_refused(self): + # Test vLLM metrics + self.vllm_infer( + prompts=self.prompts, + sampling_parameters=self.sampling_parameters, + model_name=self.vllm_model_name, + ) + with self.assertRaises(requests.exceptions.ConnectionError): + self.get_vllm_metrics() + def tearDown(self): self.triton_client.close() From a99d38b8a4019f5c1b289682f63fd0d23bc57f08 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 15 Aug 2024 01:32:42 -0700 Subject: [PATCH 24/34] Add docs --- README.md | 35 ++++++++++++++++++- ci/L0_backend_vllm/metrics_test/test.sh | 10 +++--- .../model_repository/vllm_model/1/model.json | 3 +- src/model.py | 4 +-- 4 files changed, 42 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 6269fb58..2da0f36a 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,6 @@ starting from 23.10 release. You can use `pip install ...` within the container to upgrade vLLM version. - ## Running Multiple Instances of Triton Server If you are running multiple instances of Triton server with a Python-based backend, @@ -205,6 +204,40 @@ you need to specify a different `shm-region-prefix-name` for each server. See [here](https://github.com/triton-inference-server/python_backend#running-multiple-instances-of-triton-server) for more information. +## Triton Metrics +Starting with the 24.08 release of Triton, users can now obtain partial +vLLM metrics by querying the Triton metrics endpoint (see complete vLLM metrics +[here](https://docs.vllm.ai/en/latest/serving/metrics.html)). This can be +accomplished by launching a Triton server in any of the ways described above +(ensuring the build code / container is 24.08 or later) and querying the server. +Upon receiving a successful response, you can query the metrics endpoint by entering +the following: +```bash +curl localhost:8002/metrics +``` +VLLM stats are reported by the metrics endpoint in fields that +are prefixed with `vllm:`. Your output for these fields should look +similar to the following: +```bash +# HELP vllm:prompt_tokens_total Number of prefill tokens processed. +# TYPE vllm:prompt_tokens_total counter +vllm:prompt_tokens_total{model="vllm_model",version="1"} 10 +# HELP vllm:generation_tokens_total Number of generation tokens processed. +# TYPE vllm:generation_tokens_total counter +vllm:generation_tokens_total{model="vllm_model",version="1"} 16 +``` +*Note:* The vLLM metrics reporting is disabled by default due to potential +performance slowdowns. To enable vLLM model's metrics reporting, please add +following lines to its config.pbtxt. 
+```bash +parameters: { + key: "REPORT_CUSTOM_METRICS" + value: { + string_value:"yes" + } +} +``` + ## Referencing the Tutorial You can read further in the diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh index cb856f0e..c9a7810a 100755 --- a/ci/L0_backend_vllm/metrics_test/test.sh +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -49,7 +49,7 @@ sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/v RET=0 -# Test disabling vLLM metrics reporting without parameter "REPORT_METRICS" in config.pbtxt +# Test disabling vLLM metrics reporting without parameter "REPORT_CUSTOM_METRICS" in config.pbtxt run_server if [ "$SERVER_PID" == "0" ]; then cat $SERVER_LOG @@ -77,10 +77,10 @@ set -e kill $SERVER_PID wait $SERVER_PID -# Test disabling vLLM metrics reporting with parameter "REPORT_METRICS" set to "no" in config.pbtxt +# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "no" in config.pbtxt echo -e " parameters: { - key: \"REPORT_METRICS\" + key: \"REPORT_CUSTOM_METRICS\" value: { string_value:\"no\" } @@ -114,11 +114,11 @@ set -e kill $SERVER_PID wait $SERVER_PID -# Test vLLM metrics reporting with parameter "REPORT_METRICS" set to "yes" in config.pbtxt +# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "yes" in config.pbtxt cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt echo -e " parameters: { - key: \"REPORT_METRICS\" + key: \"REPORT_CUSTOM_METRICS\" value: { string_value:\"yes\" } diff --git a/samples/model_repository/vllm_model/1/model.json b/samples/model_repository/vllm_model/1/model.json index bd644613..8a32050d 100644 --- a/samples/model_repository/vllm_model/1/model.json +++ b/samples/model_repository/vllm_model/1/model.json @@ -2,6 +2,5 @@ "model":"facebook/opt-125m", "disable_log_requests": true, "gpu_memory_utilization": 0.5, - "enforce_eager": true, - "disable_log_stats": false + "enforce_eager": true } diff --git a/src/model.py b/src/model.py index c3bd5dfa..653ddb32 100644 --- a/src/model.py +++ b/src/model.py @@ -162,8 +162,8 @@ def init_engine(self): # Create vLLM custom metrics if ( - "REPORT_METRICS" in self.model_config["parameters"] - and self.model_config["parameters"]["REPORT_METRICS"]["string_value"] + "REPORT_CUSTOM_METRICS" in self.model_config["parameters"] + and self.model_config["parameters"]["REPORT_CUSTOM_METRICS"]["string_value"] == "yes" ): try: From de8f25b0bcc9ebdffac9932f68190da918dbebfb Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 15 Aug 2024 02:05:04 -0700 Subject: [PATCH 25/34] Minor update --- README.md | 8 +++----- src/utils/metrics.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2da0f36a..f00a3b1e 100644 --- a/README.md +++ b/README.md @@ -110,11 +110,9 @@ In this case, please install vLLM first. 
You can do so by running container with the following commands: ``` -TEMP_TRITON_VLLM_DIR=/tmp/vllm_backend -mkdir -p /opt/tritonserver/backends/vllm $TEMP_TRITON_VLLM_DIR -git clone https://github.com/triton-inference-server/vllm_backend.git $TEMP_TRITON_VLLM_DIR -cp -r $TEMP_TRITON_VLLM_DIR/src/* /opt/tritonserver/backends/vllm -rm -rf $TEMP_TRITON_VLLM_DIR +mkdir -p /opt/tritonserver/backends/vllm +git clone https://github.com/triton-inference-server/vllm_backend.git /tmp/vllm_backend +cp -r /tmp/vllm_backend/src/* /opt/tritonserver/backends/vllm ``` ## Using the vLLM Backend diff --git a/src/utils/metrics.py b/src/utils/metrics.py index 41604f7c..fc6e69bd 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -83,7 +83,7 @@ def _log_counter(self, counter, data: Union[int, float]) -> None: counter.increment(data) def log(self, stats: VllmStats) -> None: - """Logs to triton metrics server every iteration. + """Report stats to Triton metrics server. Args: stats: Created by LLMEngine for use by VllmStatLogger. From b1333cefee61123a653e16ade27914f40882a93d Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 15 Aug 2024 13:20:25 -0700 Subject: [PATCH 26/34] Both args checking --- README.md | 11 ++-- ci/L0_backend_vllm/metrics_test/test.sh | 68 ++++++++++++++++++++++--- src/model.py | 1 + 3 files changed, 70 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index f00a3b1e..4a7d2878 100644 --- a/README.md +++ b/README.md @@ -224,9 +224,14 @@ vllm:prompt_tokens_total{model="vllm_model",version="1"} 10 # TYPE vllm:generation_tokens_total counter vllm:generation_tokens_total{model="vllm_model",version="1"} 16 ``` -*Note:* The vLLM metrics reporting is disabled by default due to potential -performance slowdowns. To enable vLLM model's metrics reporting, please add -following lines to its config.pbtxt. +To enable vLLM engine colleting metrics, "disable_log_stats" option need to be either false +or left empty (false by default) in [model.json](https://github.com/triton-inference-server/vllm_backend/blob/main/samples/model_repository/vllm_model/1/model.json). +```bash +"disable_log_stats": false +``` +*Note:* vLLM metrics are not reported to Triton metrics server by default +due to potential performance slowdowns. To enable vLLM model's metrics +reporting, please add following lines to its config.pbtxt as well. ```bash parameters: { key: "REPORT_CUSTOM_METRICS" diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh index c9a7810a..0a8a96d6 100755 --- a/ci/L0_backend_vllm/metrics_test/test.sh +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -39,17 +39,20 @@ SAMPLE_MODELS_REPO="../../../samples/model_repository" EXPECTED_NUM_TESTS=1 # Helpers ======================================= -rm -rf models && mkdir -p models -cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt -# `vllm_opt` model will be loaded on server start and stay loaded throughout -# unittesting. To ensure that vllm's memory profiler will not error out -# on `vllm_load_test` load, we reduce "gpu_memory_utilization" for `vllm_opt`, -# so that at least 60% of GPU memory was available for other models. -sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json +function copy_model_repository { + rm -rf models && mkdir -p models + cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt + # `vllm_opt` model will be loaded on server start and stay loaded throughout + # unittesting. 
To ensure that vllm's memory profiler will not error out + # on `vllm_load_test` load, we reduce "gpu_memory_utilization" for `vllm_opt`, + # so that at least 60% of GPU memory was available for other models. + sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json +} RET=0 # Test disabling vLLM metrics reporting without parameter "REPORT_CUSTOM_METRICS" in config.pbtxt +copy_model_repository run_server if [ "$SERVER_PID" == "0" ]; then cat $SERVER_LOG @@ -78,6 +81,7 @@ kill $SERVER_PID wait $SERVER_PID # Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "no" in config.pbtxt +copy_model_repository echo -e " parameters: { key: \"REPORT_CUSTOM_METRICS\" @@ -115,6 +119,7 @@ kill $SERVER_PID wait $SERVER_PID # Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "yes" in config.pbtxt +copy_model_repository cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt echo -e " parameters: { @@ -152,7 +157,56 @@ set -e kill $SERVER_PID wait $SERVER_PID +# Test enabling vLLM metrics reporting in config.pbtxt but disabling in model.json +copy_model_repository +jq '. += {"disable_log_stats" : true}' models/vllm_opt/1/model.json > "temp.json" +mv temp.json models/vllm_opt/1/model.json +echo -e " +parameters: { + key: \"REPORT_CUSTOM_METRICS\" + value: { + string_value:\"yes\" + } +} +" >> models/vllm_opt/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +set +e +python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled FAILED. \n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification FAILED.\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + # Test enabling vLLM metrics reporting in config.pbtxt while disabling in server option +copy_model_repository +echo -e " +parameters: { + key: \"REPORT_CUSTOM_METRICS\" + value: { + string_value:\"yes\" + } +} +" >> models/vllm_opt/config.pbtxt SERVER_ARGS="${SERVER_ARGS} --allow-metrics=false" run_server if [ "$SERVER_PID" == "0" ]; then diff --git a/src/model.py b/src/model.py index 653ddb32..cc52c3b5 100644 --- a/src/model.py +++ b/src/model.py @@ -165,6 +165,7 @@ def init_engine(self): "REPORT_CUSTOM_METRICS" in self.model_config["parameters"] and self.model_config["parameters"]["REPORT_CUSTOM_METRICS"]["string_value"] == "yes" + and not aync_engine_args.disable_log_stats ): try: labels = { From de7ff8f6b1d7f54c234e263cdc80d8fd6aa49092 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Sun, 11 Aug 2024 17:53:44 -0700 Subject: [PATCH 27/34] Add histogram test --- .../metrics_test/vllm_metrics_test.py | 11 +++ src/utils/metrics.py | 75 ++++++++++++++++++- 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index d2059057..fbe6675f 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -125,6 +125,17 @@ def test_vllm_metrics(self): # vllm:generation_tokens_total self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48) + # vllm:time_to_first_token_seconds + self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3) + self.assertTrue( + 0 < metrics_dict["vllm:time_to_first_token_seconds_sum"] < 0.0005 + ) + # vllm:time_per_output_token_seconds + self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45) + self.assertTrue( + 0 <= metrics_dict["vllm:time_per_output_token_seconds_sum"] <= 0.005 + ) + def test_vllm_metrics_disabled(self): # Test vLLM metrics self.vllm_infer( diff --git a/src/utils/metrics.py b/src/utils/metrics.py index fc6e69bd..0374fa3b 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -24,7 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from typing import Dict, Union +from typing import Dict, List, Union import triton_python_backend_utils as pb_utils from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase @@ -46,6 +46,16 @@ def __init__(self, labels): description="Number of generation tokens processed.", kind=pb_utils.MetricFamily.COUNTER, ) + self.histogram_time_to_first_token_family = pb_utils.MetricFamily( + name="vllm:time_to_first_token_seconds", + description="Histogram of time to first token in seconds.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + self.histogram_time_per_output_token_family = pb_utils.MetricFamily( + name="vllm:time_per_output_token_seconds", + description="Histogram of time per output token in seconds.", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) # Initialize metrics # Iteration stats @@ -55,6 +65,49 @@ def __init__(self, labels): self.counter_generation_tokens = self.counter_generation_tokens_family.Metric( labels=labels ) + self.histogram_time_to_first_token = ( + self.histogram_time_to_first_token_family.Metric( + labels=labels, + buckets=[ + 0.001, + 0.005, + 0.01, + 0.02, + 0.04, + 0.06, + 0.08, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + ], + ) + ) + self.histogram_time_per_output_token = ( + self.histogram_time_per_output_token_family.Metric( + labels=labels, + buckets=[ + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.15, + 0.2, + 0.3, + 0.4, + 0.5, + 0.75, + 1.0, + 2.5, + ], + ) + ) class VllmStatLogger(VllmStatLoggerBase): @@ -82,6 +135,19 @@ def _log_counter(self, counter, data: Union[int, float]) -> None: if data != 0: counter.increment(data) + def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None: + """Convenience function for logging list to histogram. + + Args: + histogram: A histogram metric instance. + data: A list of int or float data to observe into the histogram metric. + + Returns: + None + """ + for datum in data: + histogram.observe(datum) + def log(self, stats: VllmStats) -> None: """Report stats to Triton metrics server. 
@@ -97,3 +163,10 @@ def log(self, stats: VllmStats) -> None: self._log_counter( self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter ) + self._log_histogram( + self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter + ) + self._log_histogram( + self.metrics.histogram_time_per_output_token, + stats.time_per_output_tokens_iter, + ) From 9534298678d8ef855e197fd48f35fea7045138ef Mon Sep 17 00:00:00 2001 From: Yingge He Date: Wed, 14 Aug 2024 10:59:48 -0700 Subject: [PATCH 28/34] Longer time for A100 --- ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index fbe6675f..bea63ede 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -127,14 +127,14 @@ def test_vllm_metrics(self): # vllm:time_to_first_token_seconds self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3) - self.assertTrue( - 0 < metrics_dict["vllm:time_to_first_token_seconds_sum"] < 0.0005 - ) + self.assertTrue(0 < metrics_dict["vllm:time_to_first_token_seconds_sum"] < 0.01) + self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_bucket"], 3) # vllm:time_per_output_token_seconds self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45) self.assertTrue( - 0 <= metrics_dict["vllm:time_per_output_token_seconds_sum"] <= 0.005 + 0 < metrics_dict["vllm:time_per_output_token_seconds_sum"] < 0.1 ) + self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_bucket"], 45) def test_vllm_metrics_disabled(self): # Test vLLM metrics From 38ac8d6435bd884d21f50b41f186c26c129bc6be Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 15 Aug 2024 14:13:48 -0700 Subject: [PATCH 29/34] Update comment --- src/utils/metrics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/utils/metrics.py b/src/utils/metrics.py index 0374fa3b..5f007b02 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -65,6 +65,8 @@ def __init__(self, labels): self.counter_generation_tokens = self.counter_generation_tokens_family.Metric( labels=labels ) + # Use the same bucket boundaries from vLLM sample metrics. + # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96 self.histogram_time_to_first_token = ( self.histogram_time_to_first_token_family.Metric( labels=labels, From ebdf14eefa07cd91b48f878299d149bab5e0e6de Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 15 Aug 2024 15:51:05 -0700 Subject: [PATCH 30/34] Add histogram metrics to doc --- README.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/README.md b/README.md index 4a7d2878..410205be 100644 --- a/README.md +++ b/README.md @@ -223,6 +223,45 @@ vllm:prompt_tokens_total{model="vllm_model",version="1"} 10 # HELP vllm:generation_tokens_total Number of generation tokens processed. # TYPE vllm:generation_tokens_total counter vllm:generation_tokens_total{model="vllm_model",version="1"} 16 +# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds. 
+# TYPE vllm:time_to_first_token_seconds histogram +vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1 +vllm:time_to_first_token_seconds_sum{model="vllm_model",version="1"} 0.03233122825622559 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.001"} 0 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.005"} 0 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 0 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.02"} 0 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.04"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.06"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.08"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.1"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.25"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.5"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.75"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="1"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="2.5"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="5"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="7.5"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="10"} 1 +vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 +# HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds. +# TYPE vllm:time_per_output_token_seconds histogram +vllm:time_per_output_token_seconds_count{model="vllm_model",version="1"} 15 +vllm:time_per_output_token_seconds_sum{model="vllm_model",version="1"} 0.04501533508300781 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 14 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.025"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.05"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.075"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.1"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.15"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.2"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.3"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.4"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.5"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.75"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="1"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="2.5"} 15 +vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 15 ``` To enable vLLM engine colleting metrics, "disable_log_stats" option need to be either false or left empty (false by default) in [model.json](https://github.com/triton-inference-server/vllm_backend/blob/main/samples/model_repository/vllm_model/1/model.json). 
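With the counter and histogram output documented above, a small client-side scrape of the endpoint can be sketched as follows. This is not part of the patches: it assumes the default Triton metrics port 8002 and the `requests` package, and it keeps only the last sample per metric name, which for `*_bucket` lines means the `+Inf` bucket, the same convention the CI helper relies on.
```python
# Minimal scraper for the vllm:* metrics documented above (illustrative only).
import requests


def scrape_vllm_metrics(url="http://localhost:8002/metrics"):
    """Return {metric_name: value} for every vllm:* sample Triton exposes."""
    metrics = {}
    for line in requests.get(url).text.splitlines():
        # Skip "# HELP" / "# TYPE" comments and non-vLLM families.
        if not line.startswith("vllm:"):
            continue
        name_and_labels, value = line.rsplit(" ", 1)
        name = name_and_labels.split("{")[0]
        # Histogram *_bucket samples repeat once per "le" label; the +Inf
        # bucket comes last, so this overwrite leaves the total observation
        # count under the "_bucket" key.
        metrics[name] = float(value)
    return metrics


if __name__ == "__main__":
    for name, value in sorted(scrape_vllm_metrics().items()):
        print(f"{name} = {value}")
```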
From 0d67322d3df4959c63109bcad6c9aefb0168b536 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 16 Aug 2024 11:33:09 -0700 Subject: [PATCH 31/34] Update docs --- README.md | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 410205be..a446cbd9 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ you need to specify a different `shm-region-prefix-name` for each server. See for more information. ## Triton Metrics -Starting with the 24.08 release of Triton, users can now obtain partial +Starting with the 24.08 release of Triton, users can now obtain specific vLLM metrics by querying the Triton metrics endpoint (see complete vLLM metrics [here](https://docs.vllm.ai/en/latest/serving/metrics.html)). This can be accomplished by launching a Triton server in any of the ways described above @@ -213,9 +213,19 @@ the following: ```bash curl localhost:8002/metrics ``` -VLLM stats are reported by the metrics endpoint in fields that -are prefixed with `vllm:`. Your output for these fields should look -similar to the following: +VLLM stats are reported by the metrics endpoint in fields that are prefixed with +`vllm:`. Triton currently supports reporting of the following metrics from vLLM. +```bash +# Number of prefill tokens processed. +counter_prompt_tokens +# Number of generation tokens processed. +counter_generation_tokens +# Histogram of time to first token in seconds. +histogram_time_to_first_token +# Histogram of time per output token in seconds. +histogram_time_per_output_token +``` +Your output for these fields should look similar to the following: ```bash # HELP vllm:prompt_tokens_total Number of prefill tokens processed. # TYPE vllm:prompt_tokens_total counter @@ -229,20 +239,7 @@ vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1 vllm:time_to_first_token_seconds_sum{model="vllm_model",version="1"} 0.03233122825622559 vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.001"} 0 vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.005"} 0 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 0 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.02"} 0 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.04"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.06"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.08"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.1"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.25"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.5"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.75"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="1"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="2.5"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="5"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="7.5"} 1 -vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="10"} 1 +... vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1 # HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds. 
# TYPE vllm:time_per_output_token_seconds histogram @@ -250,17 +247,7 @@ vllm:time_per_output_token_seconds_count{model="vllm_model",version="1"} 15 vllm:time_per_output_token_seconds_sum{model="vllm_model",version="1"} 0.04501533508300781 vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 14 vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.025"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.05"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.075"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.1"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.15"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.2"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.3"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.4"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.5"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.75"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="1"} 15 -vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="2.5"} 15 +... vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 15 ``` To enable vLLM engine colleting metrics, "disable_log_stats" option need to be either false From 10d8a695c539297d10c156c1818042caa95583b0 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 16 Aug 2024 14:07:07 -0700 Subject: [PATCH 32/34] Make metrics test more robust --- ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index bea63ede..dbb6124c 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -127,13 +127,11 @@ def test_vllm_metrics(self): # vllm:time_to_first_token_seconds self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3) - self.assertTrue(0 < metrics_dict["vllm:time_to_first_token_seconds_sum"] < 0.01) + self.assertTrue(metrics_dict["vllm:time_to_first_token_seconds_sum"] > 0) self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_bucket"], 3) # vllm:time_per_output_token_seconds self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45) - self.assertTrue( - 0 < metrics_dict["vllm:time_per_output_token_seconds_sum"] < 0.1 - ) + self.assertTrue(metrics_dict["vllm:time_per_output_token_seconds_sum"] > 0) self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_bucket"], 45) def test_vllm_metrics_disabled(self): From bba181e18fbe84affe41a0a5ca32e9f8110c3d94 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 16 Aug 2024 14:42:39 -0700 Subject: [PATCH 33/34] Remove empty newline --- src/utils/metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/utils/metrics.py b/src/utils/metrics.py index 14469f66..5f007b02 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -65,7 +65,6 @@ def __init__(self, labels): self.counter_generation_tokens = self.counter_generation_tokens_family.Metric( labels=labels ) - # Use the same bucket boundaries from vLLM sample metrics. 
# https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96 self.histogram_time_to_first_token = ( From 2d7488d71c5052f25678f48f3d127f176e10e1ba Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 16 Aug 2024 14:59:26 -0700 Subject: [PATCH 34/34] Change assertTrue(a>b) to assertGreater(a, b) --- ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index dbb6124c..db72a57a 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -127,11 +127,11 @@ def test_vllm_metrics(self): # vllm:time_to_first_token_seconds self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3) - self.assertTrue(metrics_dict["vllm:time_to_first_token_seconds_sum"] > 0) + self.assertGreater(metrics_dict["vllm:time_to_first_token_seconds_sum"], 0) self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_bucket"], 3) # vllm:time_per_output_token_seconds self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45) - self.assertTrue(metrics_dict["vllm:time_per_output_token_seconds_sum"] > 0) + self.assertGreater(metrics_dict["vllm:time_per_output_token_seconds_sum"], 0) self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_bucket"], 45) def test_vllm_metrics_disabled(self):
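As a recap of where the series lands, the sketch below condenses the logger into one counter family and one histogram family. Treat it as an illustration rather than the final `src/utils/metrics.py`: the bucket list is shortened, only two metric families are shown, and it assumes the `StatLoggerBase` constructor signature of the vLLM release these patches target.
```python
# Trimmed sketch of the final logger shape: a counter family, a histogram
# family, and a log() that observes per-iteration lists from vLLM Stats.
from typing import Dict, List, Union

import triton_python_backend_utils as pb_utils
from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
from vllm.engine.metrics import Stats as VllmStats
from vllm.engine.metrics import SupportsMetricsInfo


class TritonMetricsSketch:
    def __init__(self, labels: Dict):
        self.counter_prompt_tokens_family = pb_utils.MetricFamily(
            name="vllm:prompt_tokens_total",
            description="Number of prefill tokens processed.",
            kind=pb_utils.MetricFamily.COUNTER,
        )
        self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
            name="vllm:time_to_first_token_seconds",
            description="Histogram of time to first token in seconds.",
            kind=pb_utils.MetricFamily.HISTOGRAM,
        )
        self.counter_prompt_tokens = self.counter_prompt_tokens_family.Metric(
            labels=labels
        )
        # Bucket list shortened for the sketch; the real file reuses vLLM's
        # own bucket boundaries.
        self.histogram_time_to_first_token = (
            self.histogram_time_to_first_token_family.Metric(
                labels=labels, buckets=[0.001, 0.01, 0.1, 1.0, 10.0]
            )
        )


class VllmStatLoggerSketch(VllmStatLoggerBase):
    def __init__(self, labels: Dict, local_interval: float = 0) -> None:
        # Assumes StatLoggerBase accepts local_interval, as in the vLLM
        # release these patches were written against.
        super().__init__(local_interval)
        self.metrics = TritonMetricsSketch(labels=labels)

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        pass

    def _log_counter(self, counter, data: Union[int, float]) -> None:
        # Skip zero increments to avoid needless metric updates.
        if data != 0:
            counter.increment(data)

    def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
        # Each iteration reports a list of per-request latencies; observe each.
        for datum in data:
            histogram.observe(datum)

    def log(self, stats: VllmStats) -> None:
        self._log_counter(
            self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
        )
        self._log_histogram(
            self.metrics.histogram_time_to_first_token,
            stats.time_to_first_tokens_iter,
        )
```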