Collect telemetry metrics from Triton metrics endpoint

triton-inference-server · Aug 7, 2024 · 4ab531c · 4ab531c
1 parent f5cabf8
commit 4ab531c
Show file tree

Hide file tree

Showing 7 changed files with 303 additions and 8 deletions.
diff --git a/genai-perf/genai_perf/constants.py b/genai-perf/genai_perf/constants.py
@@ -26,7 +26,7 @@
 
 DEFAULT_HTTP_URL = "localhost:8000"
 DEFAULT_GRPC_URL = "localhost:8001"
-
+# Client-side default for the Triton metrics endpoint. Uses "localhost" like
+# the other default URLs: 0.0.0.0 is a bind-all address, not a portable
+# destination to connect to.
+DEFAULT_TRITON_METRICS_URL = "http://localhost:8002/metrics"
 
 OPEN_ORCA = "openorca"
 CNN_DAILY_MAIL = "cnn_dailymail"

diff --git a/genai-perf/genai_perf/metrics/__init__.py b/genai-perf/genai_perf/metrics/__init__.py
@@ -27,3 +27,4 @@
 from genai_perf.metrics.llm_metrics import LLMMetrics
 from genai_perf.metrics.metrics import MetricMetadata, Metrics
 from genai_perf.metrics.statistics import Statistics
+from genai_perf.metrics.telemetry_metrics import TelemetryMetrics
diff --git a/genai-perf/genai_perf/metrics/telemetry_metrics.py b/genai-perf/genai_perf/metrics/telemetry_metrics.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from typing import List, Optional
+
+from genai_perf.metrics.metrics import MetricMetadata
+
+
+class TelemetryMetrics:
+    """
+    A class that contains common telemetry level metrics.
+
+    Every metric is stored as a list of lists of numbers. Each constructor
+    argument that is omitted gets its own fresh empty list, so separate
+    instances never share storage.
+
+    NOTE(review): the inner lists are presumably one collected sample each
+    (one value per GPU), recorded roughly every 1000 ms by the collector that
+    fills this object -- confirm against the caller.
+    """
+
+    # Name and unit of every metric held by this class.
+    TELEMETRY_METRICS = [
+        MetricMetadata("gpu_power_usage", "watts"),
+        MetricMetadata("gpu_power_limit", "watts"),
+        MetricMetadata("energy_consumption", "joules"),
+        MetricMetadata("gpu_utilization", "percentage"),
+        MetricMetadata("total_gpu_memory", "bytes"),
+        MetricMetadata("gpu_memory_used", "bytes"),
+    ]
+
+    def __init__(
+        self,
+        gpu_power_usage: Optional[List[List[float]]] = None,
+        gpu_power_limit: Optional[List[List[float]]] = None,
+        energy_consumption: Optional[List[List[float]]] = None,
+        gpu_utilization: Optional[List[List[float]]] = None,
+        total_gpu_memory: Optional[List[List[int]]] = None,
+        gpu_memory_used: Optional[List[List[int]]] = None,
+    ) -> None:
+        """
+        Store the given metric lists.
+
+        A list passed by the caller is stored as-is (not copied). An omitted
+        argument (None) is replaced by a new empty list; a shared `[]` default
+        would leak data between instances once something appends to it.
+        """
+        self.gpu_power_usage = [] if gpu_power_usage is None else gpu_power_usage
+        self.gpu_power_limit = [] if gpu_power_limit is None else gpu_power_limit
+        self.energy_consumption = (
+            [] if energy_consumption is None else energy_consumption
+        )
+        self.gpu_utilization = [] if gpu_utilization is None else gpu_utilization
+        self.total_gpu_memory = [] if total_gpu_memory is None else total_gpu_memory
+        self.gpu_memory_used = [] if gpu_memory_used is None else gpu_memory_used
+
+    def __repr__(self) -> str:
+        """Return `TelemetryMetrics(name=value,...)` for all public attributes."""
+        attr_strs = [f"{k}={v}" for k, v in self.data.items()]
+        return f"TelemetryMetrics({','.join(attr_strs)})"
+
+    @property
+    def telemetry_metrics(self) -> List[MetricMetadata]:
+        """Return the metadata (name and unit) of the telemetry metrics."""
+        return self.TELEMETRY_METRICS
+
+    @property
+    def data(self) -> dict:
+        """Returns all the metrics."""
+        # Instance attributes whose name starts with "_" are treated as private.
+        return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}
diff --git a/genai-perf/genai_perf/telemetry_data/__init__.py b/genai-perf/genai_perf/telemetry_data/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from genai_perf.telemetry_data.telemetry_data_collector import TelemetryDataCollector
diff --git a/genai-perf/genai_perf/telemetry_data/telemetry_data_collector.py b/genai-perf/genai_perf/telemetry_data/telemetry_data_collector.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import threading
+import time
+from abc import ABC, abstractmethod
+from typing import Dict, List
+
+import requests
+from genai_perf.metrics.telemetry_metrics import TelemetryMetrics
+
+
+class TelemetryDataCollector(ABC):
+    """
+    Base class that polls a server metrics endpoint from a background thread.
+
+    Subclasses implement `_parse_metrics` to turn the raw endpoint response
+    into a mapping of metric name to the values of that sample. Every sample
+    is appended once to the list of the same name on the `TelemetryMetrics`
+    instance exposed through the `metrics` property.
+    """
+
+    # Upper bound for a single metrics request, so an unresponsive endpoint
+    # cannot block the collection thread (and therefore `stop()`) forever.
+    _REQUEST_TIMEOUT_SECONDS = 5.0
+
+    def __init__(
+        self, server_metrics_url: str, collection_interval: float = 1.0  # in seconds
+    ) -> None:
+        """
+        Args:
+            server_metrics_url: URL of the metrics endpoint to poll.
+            collection_interval: Pause between two polls, in seconds.
+        """
+        self._server_metrics_url = server_metrics_url
+        self._collection_interval = collection_interval
+        self._metrics = TelemetryMetrics()
+        self._stop_event = threading.Event()
+        self._thread = None
+
+    def _fetch_metrics(self) -> str:
+        """
+        Fetch the metrics from the metrics endpoint.
+
+        Returns:
+            The response body as text.
+
+        Raises:
+            requests.exceptions.RequestException: if the request fails, times
+                out, or the server answers with an error status.
+        """
+        response = requests.get(
+            self._server_metrics_url, timeout=self._REQUEST_TIMEOUT_SECONDS
+        )
+        response.raise_for_status()
+        return response.text
+
+    @abstractmethod
+    def _parse_metrics(self, data: str) -> Dict[str, List[float]]:
+        """
+        Parse metrics data. This method should be implemented by subclasses.
+
+        Args:
+            data: Raw text returned by `_fetch_metrics`.
+
+        Returns:
+            Mapping of metric name to the values read for this sample.
+        """
+        pass
+
+    def _update_metrics(self, parsed_data: Dict[str, List[float]]) -> None:
+        """Append the values of one sample to the matching metric lists."""
+        for metric_name, metric_values in parsed_data.items():
+            samples = getattr(self.metrics, metric_name, None)
+            if samples is None:
+                # No list of that name on the metrics object yet: start one.
+                samples = []
+                setattr(self.metrics, metric_name, samples)
+            # Exactly one entry per poll, even when empty, so that the sample
+            # index lines up across all metrics.
+            samples.append(metric_values)
+
+    def _collect_metrics(self) -> None:
+        """Poll, parse and record metrics until the stop event is set."""
+        while not self._stop_event.is_set():
+            metrics_data = self._fetch_metrics()
+            parsed_data = self._parse_metrics(metrics_data)
+            self._update_metrics(parsed_data)
+
+            # Wait on the event instead of sleeping so that stop() wakes the
+            # thread immediately rather than after a full interval.
+            self._stop_event.wait(self._collection_interval)
+
+    def start(self) -> None:
+        """Start the telemetry data collection thread."""
+        # No-op while a collection thread is already running.
+        if self._thread is None or not self._thread.is_alive():
+            self._stop_event.clear()
+            self._thread = threading.Thread(target=self._collect_metrics)
+            self._thread.start()
+
+    def stop(self) -> None:
+        """Stop the telemetry data collection thread."""
+        if self._thread is not None and self._thread.is_alive():
+            self._stop_event.set()
+            self._thread.join()
+
+    @property
+    def metrics(self) -> TelemetryMetrics:
+        """Return the collected metrics."""
+        return self._metrics
diff --git a/genai-perf/genai_perf/telemetry_data/triton_telemetry_data_collector.py b/genai-perf/genai_perf/telemetry_data/triton_telemetry_data_collector.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import re
+from typing import Dict, List
+
+from genai_perf.telemetry_data.telemetry_data_collector import TelemetryDataCollector
+
+
+class TritonTelemetryDataCollector(TelemetryDataCollector):
+    """Class to collect telemetry metrics from Triton server"""
+
+    # Prefix of a metric line in the endpoint output, mapped to the key the
+    # values are stored under. No prefix is a prefix of another one, so at
+    # most one entry matches a given line.
+    _METRIC_PREFIXES = {
+        "nv_gpu_power_usage": "gpu_power_usage",
+        "nv_gpu_power_limit": "gpu_power_limit",
+        "nv_energy_consumption": "energy_consumption",
+        "nv_gpu_utilization": "gpu_utilization",
+        "nv_gpu_memory_total_bytes": "total_gpu_memory",
+        "nv_gpu_memory_used_bytes": "gpu_memory_used",
+    }
+
+    def _parse_metrics(self, data: str) -> Dict[str, List[float]]:
+        """
+        Parse the text of the metrics endpoint (Prometheus metrics).
+
+        Args:
+            data: Raw response text, one metric sample per line.
+
+        Returns:
+            A dict with one key per known metric. Each value is the list of
+            numbers read from the matching lines, in order of appearance; the
+            list is empty when no line matched.
+        """
+        metrics: Dict[str, List[float]] = {
+            key: [] for key in self._METRIC_PREFIXES.values()
+        }
+
+        for line in data.splitlines():
+            for prefix, key in self._METRIC_PREFIXES.items():
+                if line.startswith(prefix):
+                    self._extract_metric(line, metrics[key])
+                    break
+        return metrics
+
+    def _extract_metric(self, metric_line: str, metric_list: List[float]) -> None:
+        """
+        Append the numeric value of one metric line to `metric_list`.
+
+        Args:
+            metric_line: A sample line such as `name{label="x"} 12.5`.
+            metric_list: List that receives the parsed value (mutated in place).
+
+        Raises:
+            IndexError: if the line carries no value.
+            ValueError: if the value is not a valid float.
+        """
+        _, closing_brace, after_labels = metric_line.rpartition("}")
+        if closing_brace:
+            # The value is the first field after the label set; looking after
+            # the last "}" keeps whitespace inside label values from shifting
+            # the field position.
+            fields = after_labels.split()
+        else:
+            # No label set: the value directly follows the metric name.
+            fields = metric_line.split()[1:]
+        metric_list.append(float(fields[0]))
diff --git a/genai-perf/genai_perf/wrapper.py b/genai-perf/genai_perf/wrapper.py
@@ -30,8 +30,15 @@
 
 import genai_perf.logging as logging
 import genai_perf.utils as utils
-from genai_perf.constants import DEFAULT_GRPC_URL, DEFAULT_INPUT_DATA_JSON
+from genai_perf.constants import (
+    DEFAULT_GRPC_URL,
+    DEFAULT_INPUT_DATA_JSON,
+    DEFAULT_TRITON_METRICS_URL,
+)
 from genai_perf.llm_inputs.llm_inputs import OutputFormat
+from genai_perf.telemetry_data.triton_telemetry_data_collector import (
+    TritonTelemetryDataCollector,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -139,9 +146,22 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[s
 
     @staticmethod
     def run(args: Namespace, extra_args: Optional[List[str]]) -> None:
+        """
+        Run the Perf Analyzer command built from `args` and `extra_args`.
+
+        When `args.service_kind` is "triton", a TritonTelemetryDataCollector
+        polling DEFAULT_TRITON_METRICS_URL is started before the command and
+        stopped afterwards; the metrics it collected are then printed.
+
+        Raises:
+            subprocess.CalledProcessError: if the command exits with a
+                non-zero status (`check=True`).
+        """
-        cmd = Profiler.build_cmd(args, extra_args)
-        logger.info(f"Running Perf Analyzer : '{' '.join(cmd)}'")
-        if args and args.verbose:
-            subprocess.run(cmd, check=True, stdout=None)
-        else:
-            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL)
+        telemetry_data_collector = None
+        try:
+            # NOTE(review): args.service_kind is read without the `args and`
+            # guard used for args.verbose below -- confirm args is never None.
+            if args.service_kind == "triton":
+                telemetry_data_collector = TritonTelemetryDataCollector(
+                    server_metrics_url=DEFAULT_TRITON_METRICS_URL
+                )
+                telemetry_data_collector.start()
+            cmd = Profiler.build_cmd(args, extra_args)
+            logger.info(f"Running Perf Analyzer : '{' '.join(cmd)}'")
+            # The command's stdout is only shown in verbose mode.
+            if args and args.verbose:
+                subprocess.run(cmd, check=True, stdout=None)
+            else:
+                subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL)
+        finally:
+            # Runs even if the command failed, so the collector thread is
+            # always stopped before the metrics are read.
+            if telemetry_data_collector is not None:
+                telemetry_data_collector.stop()
+                metrics = telemetry_data_collector.metrics
+                print("Collected Metrics:")
+                print(metrics)