triton-inference-server · yinggeh · Oct 23, 2024 · Oct 11, 2024 · Oct 15, 2024 · Oct 16, 2024
diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md
@@ -219,7 +219,7 @@ metrics are used for latencies:
 |--------------|----------------|------------|---------------------------|-----------|-------------|-------------|
 |Latency       |Request to First Response Time    |`nv_inference_first_response_histogram_ms` |Histogram of end-to-end inference request to the first response time |Per model  |Per request  | Decoupled |
 
-To disable these metrics specifically, you can set `--metrics-config histogram_latencies=false`
+To enable these metrics specifically, you can set `--metrics-config histogram_latencies=true`
 
 Each histogram above may be composed of several sub-metrics. For each
 metric, there is a set of `le` metrics tracking the counter for each

diff --git a/qa/L0_metrics/ensemble_decoupled/async_execute_decouple/1/model.py b/qa/L0_metrics/ensemble_decoupled/async_execute_decouple/1/model.py
@@ -0,0 +1,76 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import asyncio
+
+import numpy as np
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    """Decoupled test model that sleeps as long as requested, then replies.
+
+    Every request supplies a ``WAIT_SECONDS`` tensor. The sleeps for all
+    requests handed to one ``execute`` call are scheduled as asyncio tasks and
+    awaited together; afterwards each request receives a single ``DUMMY_OUT``
+    response carrying the COMPLETE_FINAL flag.
+    """
+
+    async def execute(self, requests):
+        """Sleep for each request's WAIT_SECONDS, then send one final response.
+
+        A negative wait value is rejected through ``raise_value_error``, which
+        finalizes every request and raises ``ValueError``.
+        """
+        sleep_tasks = []
+        # (wait_secs, response_sender) pairs, in request order.
+        pending = []
+        for request in requests:
+            delays = pb_utils.get_input_tensor_by_name(request, "WAIT_SECONDS")
+            for wait_secs in delays.as_numpy():
+                if wait_secs < 0:
+                    self.raise_value_error(requests)
+                sleep_tasks.append(asyncio.create_task(asyncio.sleep(wait_secs)))
+            # wait_secs is the last element seen by the inner loop.
+            pending.append((wait_secs, request.get_response_sender()))
+
+        # This decoupled execute should be scheduled to run in the background
+        # concurrently with other instances of decoupled execute, as long as the event
+        # loop is not blocked.
+        await asyncio.gather(*sleep_tasks)
+
+        for wait_secs, sender in pending:
+            dummy_out = pb_utils.Tensor(
+                "DUMMY_OUT", np.array([wait_secs], np.float32)
+            )
+            response = pb_utils.InferenceResponse(output_tensors=[dummy_out])
+            sender.send(response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+
+        return None
+
+    def raise_value_error(self, requests):
+        """Send a flag-only final message for every request, then raise ValueError."""
+        # TODO: Model may raise exception without sending complete final
+        for request in requests:
+            request.get_response_sender().send(
+                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
+            )
+        raise ValueError("wait_secs cannot be negative")
diff --git a/qa/L0_metrics/ensemble_decoupled/async_execute_decouple/config.pbtxt b/qa/L0_metrics/ensemble_decoupled/async_execute_decouple/config.pbtxt
@@ -0,0 +1,45 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Python-backend test model: takes one FP32 input WAIT_SECONDS and produces one
+# FP32 output DUMMY_OUT, each declared with dims [ 1 ].
+backend: "python"
+input [
+  {
+    name: "WAIT_SECONDS"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+  }
+]
+output [
+  {
+    name: "DUMMY_OUT"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+  }
+]
+
+# A single CPU instance group, with the decoupled transaction policy enabled.
+instance_group [{ kind: KIND_CPU }]
+model_transaction_policy { decoupled: True }
+
diff --git a/qa/L0_metrics/ensemble_decoupled/ensemble/config.pbtxt b/qa/L0_metrics/ensemble_decoupled/ensemble/config.pbtxt
@@ -0,0 +1,72 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Two-step ensemble: INPUT -> async_execute_decouple -> temp_output ->
+# async_execute -> OUTPUT. Both ensemble tensors are FP32 with dims [ 1 ].
+name: "ensemble"
+platform: "ensemble"
+input [
+ {
+  name: "INPUT"
+  data_type: TYPE_FP32
+  dims: [ 1 ]
+ }
+]
+output [
+ {
+  name: "OUTPUT"
+  data_type: TYPE_FP32
+  dims: [ 1 ]
+ }
+]
+ensemble_scheduling {
+ step [
+  {
+   # decoupled model
+   # The ensemble INPUT feeds WAIT_SECONDS; DUMMY_OUT is stored in the
+   # intermediate tensor "temp_output".
+   model_name: "async_execute_decouple"
+   model_version: 1
+   input_map {
+    key: "WAIT_SECONDS"
+    value: "INPUT"
+   }
+   output_map {
+    key: "DUMMY_OUT"
+    value: "temp_output"
+   }
+  },
+  {
+   # non-decoupled model
+   # Consumes "temp_output" as WAIT_SECONDS; its DUMMY_OUT becomes the
+   # ensemble OUTPUT.
+   model_name: "async_execute"
+   model_version: 1
+   input_map {
+    key: "WAIT_SECONDS"
+    value: "temp_output"
+   }
+   output_map {
+    key: "DUMMY_OUT"
+    value: "OUTPUT"
+   }
+  }
+ ]
+}
diff --git a/qa/L0_metrics/histogram_metrics_test.py b/qa/L0_metrics/histogram_metrics_test.py
@@ -0,0 +1,168 @@
+#!/usr/bin/python
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import re
+import sys
+import time
+import unittest
+from functools import partial
+
+import numpy as np
+import requests
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import InferenceServerException
+
+MILLIS_PER_SEC = 1000
+
+
+def get_histogram_metric_key(
+    metric_family, model_name, model_version, metric_type, le=""
+):
+    """Build the sample name (with labels) of one histogram sub-metric.
+
+    ``metric_type`` selects the sub-metric: "count" and "sum" are labelled with
+    model and version only, while "bucket" additionally carries the ``le``
+    label. Any other ``metric_type`` yields ``None``.
+    """
+    if metric_type == "bucket":
+        return f'{metric_family}_{metric_type}{{model="{model_name}",version="{model_version}",le="{le}"}}'
+    if metric_type in ("count", "sum"):
+        return f'{metric_family}_{metric_type}{{model="{model_name}",version="{model_version}"}}'
+    return None
+
+
+class TestHistogramMetrics(unittest.TestCase):
+    """End-to-end checks of the histogram latency metrics of a running server.
+
+    Inference is sent over gRPC on port 8001 and metrics are scraped over HTTP
+    on port 8002. Both use the host named by the TRITONSERVER_IPADDR
+    environment variable, falling back to "localhost".
+    """
+
+    def setUp(self):
+        self.tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
+
+    @staticmethod
+    def _parse_histogram_metrics(metrics_text, metric_family):
+        """Extract the samples of one metric family from a metrics payload.
+
+        Returns a dict mapping each sample name (including its label set) to
+        its integer value. Only lines that start with ``metric_family`` are
+        considered, so "# HELP" / "# TYPE" lines are skipped.
+        """
+        # Escape the family so it is matched literally, not as a regex.
+        pattern = f"^{re.escape(metric_family)}.*"
+        histogram_dict = {}
+        for line in re.findall(pattern, metrics_text, re.MULTILINE):
+            # Split on the last space only: label values may contain spaces.
+            key, value = line.rsplit(" ", 1)
+            histogram_dict[key] = int(value)
+        return histogram_dict
+
+    def get_histogram_metrics(self, metric_family: str):
+        """Scrape the metrics endpoint and return the samples of ``metric_family``.
+
+        Raises ``requests.HTTPError`` when the endpoint answers with an error
+        status.
+        """
+        r = requests.get(f"http://{self.tritonserver_ipaddr}:8002/metrics")
+        r.raise_for_status()
+        return self._parse_histogram_metrics(r.text, metric_family)
+
+    def async_stream_infer(self, model_name, inputs, outputs):
+        """Run one streaming inference and fail the test on any error.
+
+        The test fails when no response arrives within the wait window or when
+        any received item is an ``InferenceServerException``.
+        """
+        # Use the same host as the metrics scrape instead of a fixed localhost.
+        triton_client = grpcclient.InferenceServerClient(
+            url=f"{self.tritonserver_ipaddr}:8001"
+        )
+
+        # Define the callback function. Note the last two parameters should be
+        # result and error. InferenceServerClient would provide the results of an
+        # inference as grpcclient.InferResult in result. For successful
+        # inference, error will be None, otherwise it will be an object of
+        # tritonclientutils.InferenceServerException holding the error details
+        def callback(user_data, result, error):
+            if error:
+                user_data.append(error)
+            else:
+                user_data.append(result)
+
+        # list to hold the results of inference.
+        user_data = []
+
+        try:
+            # Inference call
+            triton_client.start_stream(callback=partial(callback, user_data))
+            triton_client.async_stream_infer(
+                model_name=model_name,
+                inputs=inputs,
+                outputs=outputs,
+            )
+            triton_client.stop_stream()
+
+            # Wait until the results are available in user_data
+            time_out = 10
+            while not user_data and time_out > 0:
+                time_out = time_out - 1
+                time.sleep(1)
+        finally:
+            # Release the gRPC channel even if the stream calls raised.
+            triton_client.close()
+
+        # A silent timeout would otherwise only surface later as a confusing
+        # metrics assertion failure.
+        self.assertTrue(user_data, "no response received from the inference stream")
+        for item in user_data:
+            self.assertNotIsInstance(item, InferenceServerException, msg=str(item))
+
+    def test_ensemble_decoupled(self):
+        """Check first-response histogram metrics for the ensemble and its decoupled step."""
+        ensemble_model_name = "ensemble"
+        wait_secs = 1
+
+        # Infer
+        inputs = []
+        outputs = []
+        inputs.append(grpcclient.InferInput("INPUT", [1], "FP32"))
+        outputs.append(grpcclient.InferRequestedOutput("OUTPUT"))
+
+        # Create the data for the input tensor: a single value of wait_secs.
+        input_data = np.ones(shape=(1), dtype=np.float32) * wait_secs
+        # Initialize the data
+        inputs[0].set_data_from_numpy(input_data)
+
+        self.async_stream_infer(ensemble_model_name, inputs, outputs)
+
+        # Checks metrics output
+        first_response_family = "nv_inference_first_response_histogram_ms"
+        decoupled_model_name = "async_execute_decouple"
+        histogram_dict = self.get_histogram_metrics(first_response_family)
+
+        ensemble_model_count = get_histogram_metric_key(
+            first_response_family, ensemble_model_name, "1", "count"
+        )
+        ensemble_model_sum = get_histogram_metric_key(
+            first_response_family, ensemble_model_name, "1", "sum"
+        )
+        self.assertIn(ensemble_model_count, histogram_dict)
+        self.assertGreaterEqual(histogram_dict[ensemble_model_count], 1)
+        self.assertIn(ensemble_model_sum, histogram_dict)
+        # Presumably both ensemble steps wait wait_secs each, hence the factor
+        # of two -- the second step's model is not part of this test file.
+        self.assertGreaterEqual(
+            histogram_dict[ensemble_model_sum], 2 * wait_secs * MILLIS_PER_SEC
+        )
+
+        decoupled_model_count = get_histogram_metric_key(
+            first_response_family, decoupled_model_name, "1", "count"
+        )
+        decoupled_model_sum = get_histogram_metric_key(
+            first_response_family, decoupled_model_name, "1", "sum"
+        )
+        self.assertIn(decoupled_model_count, histogram_dict)
+        self.assertGreaterEqual(histogram_dict[decoupled_model_count], 1)
+        self.assertIn(decoupled_model_sum, histogram_dict)
+        self.assertGreaterEqual(
+            histogram_dict[decoupled_model_sum], wait_secs * MILLIS_PER_SEC
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()