From 70a0eeeb58d1c67b247d6a77fd1c2c226c87ad3c Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale
Date: Thu, 11 Jul 2024 18:24:52 -0700
Subject: [PATCH] test: Tests for Metrics API enhancement to include error counters (#7423)

---
 docs/user_guide/metrics.md                   | 19 +++++++
 .../lifecycle/lifecycle_test.py              | 35 ++++++++++++-
 qa/L0_model_queue/model_queue_test.py        | 50 ++++++++++++++++++-
 qa/L0_request_cancellation/scheduler_test.py | 43 +++++++++++++++-
 4 files changed, 144 insertions(+), 3 deletions(-)

diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md
index 1e70bac86c..8eb26d0bf5 100644
--- a/docs/user_guide/metrics.md
+++ b/docs/user_guide/metrics.md
@@ -100,6 +100,25 @@ Count*. The count metrics are illustrated by the following examples:
 | |Execution Count |`nv_inference_exec_count` |Number of inference batch executions (see [Inference Request Metrics](#inference-request-metrics), does not include cached requests)|Per model|Per request|
 | |Pending Request Count |`nv_inference_pending_request_count` |Number of inference requests awaiting execution by a backend. This number is incremented when a request is enqueued to the server (`TRITONSERVER_ServerInferAsync`) and is decremented when a backend is about to start executing the request. More details can be found below. |Per model|Per request|
 
+#### Failure Count Categories
+
+| Failed Request Reason |Description |
+|------------|------------|
+| REJECTED | Number of inference failures due to request timeout in the scheduler. |
+| CANCELED | Number of inference failures due to request cancellation in the core. |
+| BACKEND | Number of inference failures during execution of requests in the backend/model. |
+| OTHER | Number of inference failures due to other uncategorized reasons in the core. |
+
+> **Note**
+>
+> Ensemble failure metrics will reflect the failure counts of their composing models as well as the parent model, but currently do not capture the same granularity for the "reason" label and will default to the "OTHER" reason.
+>
+> For example, if EnsembleA contains ModelA, and ModelA experiences a failed request due to a queue/backlog timeout in the scheduler, ModelA will have a failed request metric reflecting `reason=REJECTED` and `count=1`.
+> Additionally, EnsembleA will have a failed request metric reflecting `reason=OTHER` and `count=2`.
+> The `count=2` reflects 1 from the internally failed request captured by ModelA, as well as 1 from the failed top-level request sent to EnsembleA by the user/client.
+> The `reason=OTHER` reflects the fact that the ensemble doesn't currently capture the specific reason why
+> ModelA's request failed.
+
 #### Pending Request Count (Queue Size) Per-Model
 
 The *Pending Request Count* reflects the number of requests that have been
diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py
index cea94a1dad..883f6d20b6 100755
--- a/qa/L0_backend_python/lifecycle/lifecycle_test.py
+++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py
@@ -27,8 +27,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
+import re
 import sys
 
+import requests
+
 sys.path.append("../../common")
 
 import queue
@@ -63,6 +66,29 @@ class LifecycleTest(unittest.TestCase):
     def setUp(self):
         self._shm_leak_detector = shm_util.ShmLeakDetector()
 
+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     def test_error_code(self):
         model_name = "error_code"
         shape = [1, 1]
@@ -181,7 +207,7 @@ def test_batch_error(self):
     def test_infer_pymodel_error(self):
         model_name = "wrong_model"
         shape = [2, 2]
-
+        initial_metrics_value = self._metrics_before_test(model_name, "BACKEND")
         with self._shm_leak_detector.Probe() as shm_probe:
             with httpclient.InferenceServerClient(
                 f"{_tritonserver_ipaddr}:8000"
@@ -207,6 +233,13 @@ def test_infer_pymodel_error(self):
                    self.assertTrue(
                        False, "Wrong exception raised or did not raise an exception"
                    )
+        expected_count_increase = 1
+        self._assert_metrics(
+            model_name,
+            "BACKEND",
+            expected_count_increase,
+            initial_metrics_value,
+        )
 
 
 if __name__ == "__main__":
diff --git a/qa/L0_model_queue/model_queue_test.py b/qa/L0_model_queue/model_queue_test.py
index e7be471f79..025d126417 100755
--- a/qa/L0_model_queue/model_queue_test.py
+++ b/qa/L0_model_queue/model_queue_test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -30,6 +30,7 @@
 sys.path.append("../common")
 
+import re
 import threading
 import time
 import unittest
@@ -38,6 +39,7 @@
 import infer_util as iu
 import numpy as np
+import requests
 import test_util as tu
 from tritonclientutils import InferenceServerException
@@ -69,6 +71,29 @@ def check_deferred_exception(self):
                 _deferred_exceptions.pop(0)
                 raise first_exception
 
+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     def check_response(
         self,
         bs,
@@ -235,6 +260,12 @@ def test_policy_reject(self):
         # requests are sent after 'default_timeout_microseconds'.
         # Expect the first request is timed-out and rejected, which makes the
         # second and third request be batched together and executed.
+        initial_metrics_value_ensemble = self._metrics_before_test(
+            "ensemble_zero_1_float32", "OTHER"
+        )
+        initial_metrics_value_custom = self._metrics_before_test(
+            "custom_zero_1_float32", "REJECTED"
+        )
         dtype = np.float32
         shapes = ([16],)
         for trial in self.trials_:
@@ -283,6 +314,23 @@ def test_policy_reject(self):
             self.check_deferred_exception()
         except InferenceServerException as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
+        expected_count_increase = 4
+        # NOTE: Ensemble failure metrics will reflect the failure counts
+        # of their composing models as well as the parent model, but currently do not capture the same granularity
+        # for the "reason" label and will default to the "OTHER" reason.
+        self._assert_metrics(
+            "ensemble_zero_1_float32",
+            "OTHER",
+            expected_count_increase,
+            initial_metrics_value_ensemble,
+        )
+        expected_count_increase = 4
+        self._assert_metrics(
+            "custom_zero_1_float32",
+            "REJECTED",
+            expected_count_increase,
+            initial_metrics_value_custom,
+        )
 
     def test_timeout_override(self):
         # Send requests with batch sizes 1, 1, 3 where the first request
diff --git a/qa/L0_request_cancellation/scheduler_test.py b/qa/L0_request_cancellation/scheduler_test.py
index a6cd97efaa..900073ea7d 100755
--- a/qa/L0_request_cancellation/scheduler_test.py
+++ b/qa/L0_request_cancellation/scheduler_test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -27,10 +27,12 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import concurrent.futures
+import re
 import time
 import unittest
 
 import numpy as np
+import requests
 import tritonclient.grpc as grpcclient
 from tritonclient.utils import InferenceServerException
 
@@ -84,6 +86,29 @@ def _assert_streaming_response_is_cancelled(self, response):
                 cancelled_count += 1
         self.assertEqual(cancelled_count, 1)
 
+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     # Test queued requests on dynamic batch scheduler can be cancelled
     def test_dynamic_batch_scheduler_request_cancellation(self):
         model_name = "dynamic_batch"
@@ -114,6 +139,7 @@ def test_dynamic_batch_scheduler_request_cancellation(self):
     # Test backlogged requests on sequence batch scheduler can be cancelled
     def test_sequence_batch_scheduler_backlog_request_cancellation(self):
         model_name = "sequence_direct"
+        initial_metrics_value = self._metrics_before_test(model_name, "CANCELED")
         with concurrent.futures.ThreadPoolExecutor() as pool:
             # Saturate the single sequence slot
             saturate_thread = pool.submit(
@@ -149,11 +175,26 @@
             self._assert_response_is_cancelled(backlog_requests[1]["response"])
             # Join saturating thread
             saturate_thread.result()
+        expected_count_increase = 2
+        self._assert_metrics(
+            model_name,
+            "CANCELED",
+            expected_count_increase,
+            initial_metrics_value,
+        )
 
     # Test queued requests on direct sequence batch scheduler can be cancelled
     def test_direct_sequence_batch_scheduler_request_cancellation(self):
         model_name = "sequence_direct"
+        initial_metrics_value = self._metrics_before_test(model_name, "CANCELED")
        self._test_sequence_batch_scheduler_queued_request_cancellation(model_name)
+        expected_count_increase = 2
+        self._assert_metrics(
+            model_name,
+            "CANCELED",
+            expected_count_increase,
+            initial_metrics_value,
+        )
 
     # Test queued requests on oldest sequence batch scheduler can be cancelled
     def test_oldest_sequence_batch_scheduler_request_cancellation(self):
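
All three test files add the same three helpers: scrape Triton's Prometheus-format metrics endpoint and compare the `nv_inference_request_failure` counter for a given model/reason pair before and after the test. For reference, the same read-and-parse pattern can be tried by hand outside the test harness. The sketch below is illustrative only: it assumes a locally running Triton server with metrics enabled on the default port 8002, the helper name `get_failure_count` is made up for this example (it is not part of the patch), and the `wrong_model`/`BACKEND` pair matches the case exercised by `test_infer_pymodel_error` above.

import re

import requests

# Default Triton metrics endpoint, as used by the new test helpers (assumed running locally).
METRICS_URL = "http://localhost:8002/metrics"


def get_failure_count(model, reason):
    # Matches series such as:
    #   nv_inference_request_failure{model="wrong_model",reason="BACKEND",version="1"} 3
    pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
    response = requests.get(METRICS_URL)
    response.raise_for_status()
    match = re.search(pattern, response.text)
    # The test helpers raise when the series is missing; this sketch simply falls back to zero.
    return int(match.group(1)) if match else 0


if __name__ == "__main__":
    print(get_failure_count("wrong_model", "BACKEND"))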