From 5f10d61aeaaada99c93b31d179f5a69274af115a Mon Sep 17 00:00:00 2001
From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Date: Fri, 5 Jul 2024 14:50:14 -0700
Subject: [PATCH 01/10] [refactor]: Refactor Frontend Trace OpenTelemetry
 Implementation (#7390)

Co-authored-by: Iman Tabrizian <iman.tabrizian@gmail.com>
---
 src/tracer.cc | 150 +++++++++++++++++++-------------------------------
 src/tracer.h  |  77 ++++++++++++--------------
 2 files changed, 92 insertions(+), 135 deletions(-)

diff --git a/src/tracer.cc b/src/tracer.cc
index 560278e4a6..b17f5eb7e8 100644
--- a/src/tracer.cc
+++ b/src/tracer.cc
@@ -335,13 +335,23 @@ TraceManager::SampleTrace(const TraceStartOptions& start_options)
           std::chrono::duration_cast<std::chrono::nanoseconds>(
               std::chrono::steady_clock::now().time_since_epoch())
               .count();
-      ts->otel_context_ = start_options.propagated_context;
-      opentelemetry::nostd::shared_ptr<otel_trace_api::Span> root_span;
-      root_span = ts->StartSpan(
-          "InferRequest", steady_timestamp_ns, otel_trace_api::kSpanKey);
+      if (ts->span_stacks_.find(ts->trace_id_) == ts->span_stacks_.end()) {
+        std::unique_ptr<
+            std::stack<opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>>
+            st(new std::stack<
+                opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>());
+        ts->span_stacks_.emplace(ts->trace_id_, std::move(st));
+      }
+      auto active_span =
+          otel_trace_api::GetSpan(start_options.propagated_context);
+      if (active_span->GetContext().IsValid()) {
+        ts->span_stacks_[ts->trace_id_]->emplace(active_span);
+      }
       // Storing "InferRequest" span as a root span
       // to keep it alive for the duration of the request.
-      ts->otel_context_ = ts->otel_context_.SetValue(kRootSpan, root_span);
+      ts->root_span_ =
+          ts->StartSpan("InferRequest", steady_timestamp_ns, ts->trace_id_);
+      ts->span_stacks_[ts->trace_id_]->emplace(ts->root_span_);
 #else
       LOG_ERROR << "Unsupported trace mode: "
                 << TraceManager::InferenceTraceModeString(ts->setting_->mode_);
@@ -358,7 +368,7 @@ TraceManager::Trace::~Trace()
     setting_->WriteTrace(streams_);
   } else if (setting_->mode_ == TRACE_MODE_OPENTELEMETRY) {
 #ifndef _WIN32
-    EndSpan(kRootSpan);
+    EndSpan(trace_id_);
 #else
     LOG_ERROR << "Unsupported trace mode: "
               << TraceManager::InferenceTraceModeString(setting_->mode_);
@@ -390,7 +400,8 @@ TraceManager::Trace::CaptureTimestamp(
           << "{\"name\":\"" << name << "\",\"ns\":" << timestamp_ns << "}]}";
     } else if (setting_->mode_ == TRACE_MODE_OPENTELEMETRY) {
 #ifndef _WIN32
-      AddEvent(kRootSpan, name, timestamp_ns);
+      root_span_->AddEvent(
+          name, time_offset_ + std::chrono::nanoseconds{timestamp_ns});
 #else
       LOG_ERROR << "Unsupported trace mode: "
                 << TraceManager::InferenceTraceModeString(setting_->mode_);
@@ -501,7 +512,7 @@ TraceManager::ProcessOpenTelemetryParameters(
 
 void
 TraceManager::Trace::StartSpan(
-    std::string span_key, TRITONSERVER_InferenceTrace* trace,
+    TRITONSERVER_InferenceTrace* trace,
     TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
     uint64_t trace_id)
 {
@@ -509,7 +520,7 @@ TraceManager::Trace::StartSpan(
   LOG_TRITONSERVER_ERROR(
       TRITONSERVER_InferenceTraceParentId(trace, &parent_id),
       "getting trace parent id");
-  std::string parent_span_key = "";
+  auto span_parent_id = parent_id;
 
   // Currently, only 2 types of sub-spans are supported:
   // request span and compute span. Compute span is a leaf span
@@ -521,16 +532,9 @@ TraceManager::Trace::StartSpan(
   // If parent_id > 0, then this is a child trace, spawned from
   // the ensamble's main request. For this instance, the parent
   // span is the ensembles's request span.
-  if (parent_id == 0 && activity == TRITONSERVER_TRACE_REQUEST_START) {
-    parent_span_key = kRootSpan;
-  } else if (activity == TRITONSERVER_TRACE_REQUEST_START) {
-    // [FIXME] For BLS requests parent span for children's request spans
-    // should be parent model's compute span. Currently,
-    // this won't work, since parent's compute span will be created
-    // only after children's spans are created.
-    parent_span_key = kRequestSpan + std::to_string(parent_id);
-  } else if (activity == TRITONSERVER_TRACE_COMPUTE_START) {
-    parent_span_key = kRequestSpan + std::to_string(trace_id);
+  if ((parent_id == 0 && activity == TRITONSERVER_TRACE_REQUEST_START) ||
+      (activity == TRITONSERVER_TRACE_COMPUTE_START)) {
+    span_parent_id = trace_id;
   }
 
   std::string display_name = "compute";
@@ -542,7 +546,7 @@ TraceManager::Trace::StartSpan(
     display_name = model_name;
   }
 
-  auto span = StartSpan(display_name, timestamp_ns, parent_span_key);
+  auto span = StartSpan(display_name, timestamp_ns, span_parent_id);
 
   if (activity == TRITONSERVER_TRACE_REQUEST_START) {
     int64_t model_version;
@@ -564,14 +568,13 @@ TraceManager::Trace::StartSpan(
     PrepareTraceContext(span, &buffer);
     TRITONSERVER_InferenceTraceSetContext(trace, buffer.Contents().c_str());
   }
-
-  otel_context_ = otel_context_.SetValue(span_key, span);
+  span_stacks_[trace_id]->emplace(span);
 }
 
 opentelemetry::nostd::shared_ptr<otel_trace_api::Span>
 TraceManager::Trace::StartSpan(
     std::string display_name, const uint64_t& raw_timestamp_ns,
-    std::string parent_span_key)
+    uint64_t trace_id)
 {
   otel_trace_api::StartSpanOptions options;
   options.kind = otel_trace_api::SpanKind::kServer;
@@ -580,45 +583,37 @@ TraceManager::Trace::StartSpan(
   options.start_steady_time =
       otel_common::SteadyTimestamp{std::chrono::nanoseconds{raw_timestamp_ns}};
 
-  // If the new span is a child span, we need to retrieve its parent from
-  // the context and provide it through StartSpanOptions to the child span
-  if (!parent_span_key.empty() && otel_context_.HasKey(parent_span_key)) {
-    auto parent_span = opentelemetry::nostd::get<
-        opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>(
-        otel_context_.GetValue(parent_span_key));
-    options.parent = parent_span->GetContext();
+  // If the new span is a child span, we need to retrieve its parent and
+  // provide it through StartSpanOptions to the child span
+  if (span_stacks_.find(trace_id) != span_stacks_.end() &&
+      !span_stacks_[trace_id]->empty()) {
+    options.parent = span_stacks_[trace_id]->top()->GetContext();
   }
   auto provider = opentelemetry::trace::Provider::GetTracerProvider();
   return provider->GetTracer(kTritonTracer)->StartSpan(display_name, options);
 }
 
 void
-TraceManager::Trace::EndSpan(std::string span_key)
+TraceManager::Trace::EndSpan(uint64_t trace_id)
 {
   auto timestamp_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                           std::chrono::steady_clock::now().time_since_epoch())
                           .count();
-  EndSpan(span_key, timestamp_ns);
+  EndSpan(timestamp_ns, trace_id);
 }
 
 
 void
 TraceManager::Trace::EndSpan(
-    std::string span_key, const uint64_t& raw_timestamp_ns)
+    const uint64_t& raw_timestamp_ns, uint64_t trace_id)
 {
-  if (otel_context_.HasKey(span_key)) {
-    auto span = opentelemetry::nostd::get<
-        opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>(
-        otel_context_.GetValue(span_key));
-
-    if (span == nullptr) {
-      return;
-    }
-
+  if (span_stacks_.find(trace_id) != span_stacks_.end() &&
+      !span_stacks_[trace_id]->empty()) {
     otel_trace_api::EndSpanOptions end_options;
     end_options.end_steady_time = otel_common::SteadyTimestamp{
         std::chrono::nanoseconds{raw_timestamp_ns}};
-    span->End(end_options);
+    span_stacks_[trace_id]->top()->End(end_options);
+    span_stacks_[trace_id]->pop();
   }
 }
 
@@ -630,79 +625,46 @@ TraceManager::Trace::ReportToOpenTelemetry(
   uint64_t id;
   LOG_TRITONSERVER_ERROR(
       TRITONSERVER_InferenceTraceId(trace, &id), "getting trace id");
-
-  auto current_span_key = GetSpanKeyForActivity(activity, id);
-  if (current_span_key.empty()) {
-    return;
+  if (span_stacks_.find(id) == span_stacks_.end()) {
+    std::unique_ptr<
+        std::stack<opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>>
+        st(new std::stack<
+            opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>());
+    span_stacks_.emplace(id, std::move(st));
   }
 
-  AddEvent(current_span_key, trace, activity, timestamp_ns, id);
-}
-
-std::string
-TraceManager::Trace::GetSpanKeyForActivity(
-    TRITONSERVER_InferenceTraceActivity activity, uint64_t trace_id)
-{
-  std::string span_name;
-  switch (activity) {
-    case TRITONSERVER_TRACE_REQUEST_START:
-    case TRITONSERVER_TRACE_QUEUE_START:
-    case TRITONSERVER_TRACE_REQUEST_END: {
-      span_name = kRequestSpan + std::to_string(trace_id);
-      break;
-    }
-
-    case TRITONSERVER_TRACE_COMPUTE_START:
-    case TRITONSERVER_TRACE_COMPUTE_INPUT_END:
-    case TRITONSERVER_TRACE_COMPUTE_OUTPUT_START:
-    case TRITONSERVER_TRACE_COMPUTE_END: {
-      span_name = kComputeSpan + std::to_string(trace_id);
-      break;
-    }
-    case TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT:
-    case TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT:
-    case TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT:
-    default: {
-      LOG_ERROR << "Unsupported activity: "
-                << TRITONSERVER_InferenceTraceActivityString(activity);
-      span_name = "";
-      break;
-    }
-  }
-
-  return span_name;
+  AddEvent(trace, activity, timestamp_ns, id);
 }
 
 void
 TraceManager::Trace::AddEvent(
-    std::string span_key, TRITONSERVER_InferenceTrace* trace,
+    TRITONSERVER_InferenceTrace* trace,
     TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
-    uint64_t id)
+    uint64_t trace_id)
 {
   if (activity == TRITONSERVER_TRACE_REQUEST_START ||
       activity == TRITONSERVER_TRACE_COMPUTE_START) {
-    StartSpan(span_key, trace, activity, timestamp_ns, id);
+    StartSpan(trace, activity, timestamp_ns, trace_id);
   }
 
   AddEvent(
-      span_key, TRITONSERVER_InferenceTraceActivityString(activity),
-      timestamp_ns);
+      TRITONSERVER_InferenceTraceActivityString(activity), timestamp_ns,
+      trace_id);
 
   if (activity == TRITONSERVER_TRACE_REQUEST_END ||
       activity == TRITONSERVER_TRACE_COMPUTE_END) {
-    EndSpan(span_key, timestamp_ns);
+    EndSpan(timestamp_ns, trace_id);
   }
 }
 
 void
 TraceManager::Trace::AddEvent(
-    std::string span_key, std::string event, uint64_t timestamp)
+    const std::string& event, uint64_t timestamp, uint64_t trace_id)
 {
-  if (otel_context_.HasKey(span_key)) {
-    auto span = opentelemetry::nostd::get<
-        opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>(
-        otel_context_.GetValue(span_key));
-    span->AddEvent(event, time_offset_ + std::chrono::nanoseconds{timestamp});
+  if (span_stacks_.find(trace_id) != span_stacks_.end() &&
+      !span_stacks_[trace_id]->empty()) {
+    span_stacks_[trace_id]->top()->AddEvent(
+        event, time_offset_ + std::chrono::nanoseconds{timestamp});
   }
 }
 
diff --git a/src/tracer.h b/src/tracer.h
index d6e8ee65b0..e33b16dbcb 100644
--- a/src/tracer.h
+++ b/src/tracer.h
@@ -32,6 +32,7 @@
 #include <mutex>
 #include <set>
 #include <sstream>
+#include <stack>
 #include <string>
 #include <unordered_map>
 #include <variant>
@@ -237,8 +238,8 @@ class TraceManager {
     /// `TRITONSERVER_TRACE_COMPUTE_START`,
     /// it starts a new request or compute span. For the request span it
     /// adds some triton related attributes, and adds this span to
-    /// `otel_context_`. Alternatively, if activity is
-    /// `TRITONSERVER_TRACE_REQUEST_END` or
+    /// a span stack, corresponding to the current trace. Alternatively,
+    /// if activity is `TRITONSERVER_TRACE_REQUEST_END` or
     /// `TRITONSERVER_TRACE_COMPUTE_END`, it ends the corresponding span.
     ///
     /// \param trace TRITONSERVER_InferenceTrace instance.
@@ -258,16 +259,25 @@ class TraceManager {
     /// OpenTelemetry SystemTimestamp to display span on a timeline, and
     /// OpenTelemetry SteadyTimestamp to calculate the duration on the span
     /// with better precision.
-    /// \param parent_span_key A span key, to find a parent span in the
-    /// OpenTelemetry context. If empty, a root span will be started,
-    /// i.e. with no parent span specified.
+    /// \param trace_id Trace id.
     /// \return A shared pointer to a newly created OpenTelemetry span.
     opentelemetry::nostd::shared_ptr<otel_trace_api::Span> StartSpan(
         std::string display_name, const uint64_t& raw_timestamp_ns,
-        std::string parent_span_key = "");
+        uint64_t trace_id);
+
+    // A map to hold spans. Any trace can spawn any number of child traces,
+    // e.g. ensemble model and BLS. This map holds
+    // ( trace id, stack of started spans ) pairs and, for each trace, keeps
+    // started spans alive for the duration of the traced
+    // event and helps to preserve the parent-child relationship.
+    std::unordered_map<
+        uint64_t, std::unique_ptr<std::stack<
+                      opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>>>
+        span_stacks_;
 
-    // OTel context to store spans, created in the current trace
-    opentelemetry::context::Context otel_context_;
+    // Root span. Some events should be recorded in the root span while the
+    // request span is still alive and present on the stack.
+    opentelemetry::nostd::shared_ptr<otel_trace_api::Span> root_span_;
 
     /// Prepares trace context to propagate to TRITONSERVER_InferenceTrace.
     /// Trace context follows W3C Trace Context specification.
@@ -304,8 +314,6 @@ class TraceManager {
     /// For request spans, it will add the following attributes to the span:
     /// `model_name`, `model_version`, `trace_id`, `parent_id`.
     ///
-    /// \param span_key Span's key to retrieve the corresponding span from the
-    /// OpenTelemetry context.
     /// \param trace TRITONSERVER_InferenceTrace, used to request model's name,
     /// version, trace parent_id from the backend.
     /// \param activity Trace activity.
@@ -315,58 +323,45 @@ class TraceManager {
     /// with better precision.
     /// \param trace_id Trace id.
     void StartSpan(
-        std::string span_key, TRITONSERVER_InferenceTrace* trace,
+        TRITONSERVER_InferenceTrace* trace,
         TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
         uint64_t trace_id);
 
-    /// Ends the provided span.
+    /// Ends the span on the top of the stack, related to trace with `trace_id`.
     ///
-    /// \param span_key Span's key to retrieve the corresponding span from the
-    /// OpenTelemetry context.
-    void EndSpan(std::string span_key);
+    /// \param trace_id Trace id.
+    void EndSpan(uint64_t trace_id);
 
-    /// Ends the provided span at specified steady timestamp.
+    /// Ends the span on the top of the stack, related to trace with `trace_id`
+    /// at specified steady timestamp.
     ///
-    /// \param span_key Span's key to retrieve the corresponding span from the
-    /// OpenTelemetry context.
     /// \param raw_timestamp_ns Steady timestamp to use as
     /// `EndSpanOptions::end_steady_time`.
-    void EndSpan(std::string span_key, const uint64_t& raw_timestamp_ns);
-
-    /// Returns the span key, for which the activity belongs.
-    ///
-    /// \param activity reported activity.
     /// \param trace_id Trace id.
-    /// \return A key to identify span, stored in the OpenTelemetry context.
-    std::string GetSpanKeyForActivity(
-        TRITONSERVER_InferenceTraceActivity activity, uint64_t trace_id);
-
-    /// Adds event to the span, which is retrieved from OpenTelemetry context
-    /// with the provided `span_key`. If activity is
-    /// TRITONSERVER_TRACE_REQUEST_START, or TRITONSERVER_TRACE_COMPUTE_START,
-    /// starts a new span and adds it to `otel_context_`.
+    void EndSpan(const uint64_t& raw_timestamp_ns, uint64_t trace_id);
+
+    /// Adds an event to the span on the top of the stack, related to trace
+    /// with `trace_id`. If activity is TRITONSERVER_TRACE_REQUEST_START,
+    /// or TRITONSERVER_TRACE_COMPUTE_START, starts a new span and adds it
+    /// to the span's stack.
     ///
-    /// \param span_key Span's key to retrieve the corresponding span from the
-    /// OpenTelemetry context.
     /// \param trace TRITONSERVER_InferenceTrace, used to request model's name,
     /// version, trace parent_id from the backend.
     /// \param activity Trace activity.
     /// \param timestamp_ns Timestamp of the provided event.
-    /// \param id Trace id.
+    /// \param trace_id Trace id.
     void AddEvent(
-        std::string span_key, TRITONSERVER_InferenceTrace* trace,
+        TRITONSERVER_InferenceTrace* trace,
         TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
-        uint64_t id);
+        uint64_t trace_id);
 
-    /// Adds event to the OpenTelemetry span, retrieved from an OpenTelementry
-    /// context with the provided `span_key`.
+    /// Adds an event to the OpenTelemetry span.
     ///
-    /// \param span_key Span's key to retrieve the corresponding span from the
-    /// OpenTelemetry context.
     /// \param event An event to add to the span.
     /// \param timestamp_ns Timestamp of the provided event.
+    /// \param trace_id Trace id.
     void AddEvent(
-        std::string span_key, std::string event, uint64_t timestamp_ns);
+        const std::string& event, uint64_t timestamp_ns, uint64_t trace_id);
 #endif
   };
 

From 532ebe350150e6db07b584e0c0cfb4da1008a970 Mon Sep 17 00:00:00 2001
From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Date: Fri, 5 Jul 2024 15:34:01 -0700
Subject: [PATCH 02/10] [fix]: grpc state cleanup fix (#7409)

---
 qa/L0_grpc_state_cleanup/cleanup_test.py | 32 +++++++++++++++++-------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py
index 28d837e571..431eeb1720 100755
--- a/qa/L0_grpc_state_cleanup/cleanup_test.py
+++ b/qa/L0_grpc_state_cleanup/cleanup_test.py
@@ -437,10 +437,10 @@ def test_simple_infer_error_status(self):
 
     def test_simple_infer_shutdownserver(self):
         # This test case is used to check whether all the state objects are
-        # released when the server is interrupted to shutdown in middle of
-        # inference run with final parameters being returned.
+        # released when the server is interrupted to shut down at the beginning
+        # of the inference run, with final parameters being returned.
         with self.assertRaises(InferenceServerException) as cm:
-            self._simple_infer(request_count=10, kill_server=5)
+            self._simple_infer(request_count=20, kill_server=5)
 
     ###
     ### Streaming Tests
@@ -469,11 +469,18 @@ def test_streaming_timeout(self):
     def test_streaming_error_status(self):
         # This test case is used to check whether all the state objects are
         # released when RPC runs into error.
+        expected_exceptions = [
+            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
+            "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.",
+        ]
         with self.assertRaises(InferenceServerException) as cm:
             self._streaming_infer(request_count=10, should_error=True)
-        self.assertIn(
-            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
-            str(cm.exception),
+
+        exception_match = False
+        for expected_exception in expected_exceptions:
+            exception_match |= expected_exception in str(cm.exception)
+        self.assertTrue(
+            exception_match, "Raised unexpected exception {}".format(str(cm.exception))
         )
 
     def test_streaming_infer_shutdownserver(self):
@@ -520,11 +527,18 @@ def test_decoupled_timeout(self):
     def test_decoupled_error_status(self):
         # This test case is used to check whether all the state objects are
         # released when RPC runs into error.
+        expected_exceptions = [
+            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
+            "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.",
+        ]
         with self.assertRaises(InferenceServerException) as cm:
             self._decoupled_infer(request_count=10, repeat_count=10, should_error=True)
-        self.assertIn(
-            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
-            str(cm.exception),
+
+        exception_match = False
+        for expected_exception in expected_exceptions:
+            exception_match |= expected_exception in str(cm.exception)
+        self.assertTrue(
+            exception_match, "Raised unexpected exception {}".format(str(cm.exception))
         )
 
     def test_decoupled_infer_shutdownserver(self):

From 02723f8c9689da259f160331575b02c058ea110b Mon Sep 17 00:00:00 2001
From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Date: Fri, 5 Jul 2024 15:38:31 -0700
Subject: [PATCH 03/10] [build]: vllm version update (#7405)

---
 build.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.py b/build.py
index aeb2442763..b9e68d45cc 100755
--- a/build.py
+++ b/build.py
@@ -76,7 +76,7 @@
         "2024.0.0",  # ORT OpenVINO
         "2024.0.0",  # Standalone OpenVINO
         "3.2.6",  # DCGM version
-        "0.4.3",  # vLLM version
+        "0.5.0.post1",  # vLLM version
     )
 }
 

From f5273eeec87ef0bbb9f57f8ceb58f772bc6ac346 Mon Sep 17 00:00:00 2001
From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Date: Fri, 5 Jul 2024 16:29:36 -0700
Subject: [PATCH 04/10] [feat]:Custom Backend Tracing (#7403)

Co-authored-by: Iman Tabrizian <itabrizian@nvidia.com>
Co-authored-by: Kris Hung <krish@nvidia.com>
---
 docs/user_guide/trace.md              |  16 ++++
 qa/L0_trace/opentelemetry_unittest.py | 105 ++++++++++++++++++++++++++
 qa/L0_trace/test.sh                   |  79 ++++++++++++++++++-
 src/tracer.cc                         |  83 ++++++++++++++------
 src/tracer.h                          |  16 +++-
 5 files changed, 273 insertions(+), 26 deletions(-)

diff --git a/docs/user_guide/trace.md b/docs/user_guide/trace.md
index d359299499..8f7708665b 100644
--- a/docs/user_guide/trace.md
+++ b/docs/user_guide/trace.md
@@ -623,6 +623,22 @@ Then, you can specify headers in the `infer` method. For references, please
 look at our [tests](https://github.com/triton-inference-server/server/blob/main/qa/L0_trace/opentelemetry_unittest.py),
 e.g. [http context propagation test](https://github.com/triton-inference-server/server/blob/main/qa/L0_trace/opentelemetry_unittest.py#L494-L508).
 
+### Custom Backend Tracing
+
+When a custom activity needs to be traced in the backend, please use the
+`TRITONSERVER_InferenceTraceReportActivity` API. For an example, please
+refer to the [identity backend](https://github.com/triton-inference-server/identity_backend/blob/main/src/identity.cc).
+
+In OpenTelemetry trace mode, to start a new span, make sure that the name of
+your custom activity ends with `_START`. To end that span, make sure that the
+name of the corresponding activity ends with `_END`. For example, in the
+identity backend, we start a `CUSTOM_ACTIVITY` span by [reporting](https://github.com/triton-inference-server/identity_backend/blob/oandreeva-custom-trace-activity/src/identity.cc#L872-L876) the
+`CUSTOM_ACTIVITY_START` event, and we close this span by [reporting](https://github.com/triton-inference-server/identity_backend/blob/oandreeva-custom-trace-activity/src/identity.cc#L880-L883) the
+`CUSTOM_ACTIVITY_END` event.
+
+Please note that it is the user's responsibility to make sure that all custom
+spans that were started are properly ended.
+
 ### Limitations
 
 - OpenTelemetry trace mode is not supported on Windows systems.
diff --git a/qa/L0_trace/opentelemetry_unittest.py b/qa/L0_trace/opentelemetry_unittest.py
index 93056e613d..34dc0bfd88 100644
--- a/qa/L0_trace/opentelemetry_unittest.py
+++ b/qa/L0_trace/opentelemetry_unittest.py
@@ -115,12 +115,14 @@ def setUp(self):
         self.bls_model_name = "bls_simple"
         self.trace_context_model = "trace_context"
         self.non_decoupled_model_name_ = "repeat_int32"
+        self.identity_model = "custom_identity_int32"
         self.test_models = [
             self.simple_model_name,
             self.ensemble_model_name,
             self.bls_model_name,
             self.non_decoupled_model_name_,
             self.cancel_queue_model_name,
+            self.identity_model,
         ]
         self.root_span = "InferRequest"
         self._user_data = UserData()
@@ -219,6 +221,7 @@ def _check_events(self, span_name, events, is_cancelled):
             self.assertFalse(
                 all(entry in events for entry in root_events_http + root_events_grpc)
             )
+            self.assertEquals(len(events), len(compute_events))
 
         elif span_name == self.root_span:
             # Check that root span has INFER_RESPONSE_COMPLETE, _RECV/_WAITREAD
@@ -230,16 +233,20 @@ def _check_events(self, span_name, events, is_cancelled):
             if "HTTP" in events:
                 self.assertTrue(all(entry in events for entry in root_events_http))
                 self.assertFalse(all(entry in events for entry in root_events_grpc))
+                self.assertEquals(len(events), len(root_events_http))
 
             elif "GRPC" in events:
                 self.assertTrue(all(entry in events for entry in root_events_grpc))
                 self.assertFalse(all(entry in events for entry in root_events_http))
+                self.assertEquals(len(events), len(root_events_grpc))
 
             if is_cancelled == False:
                 self.assertFalse(all(entry in events for entry in request_events))
                 self.assertFalse(all(entry in events for entry in compute_events))
 
         elif span_name in self.test_models:
+            if span_name == self.identity_model:
+                request_events.append("CUSTOM_SINGLE_ACTIVITY")
             # Check that all request related events (and only them)
             # are recorded in request span
             self.assertTrue(all(entry in events for entry in request_events))
@@ -247,6 +254,31 @@ def _check_events(self, span_name, events, is_cancelled):
                 all(entry in events for entry in root_events_http + root_events_grpc)
             )
             self.assertFalse(all(entry in events for entry in compute_events))
+            self.assertEquals(len(events), len(request_events))
+
+        elif span_name.startswith("CUSTOM_ACTIVITY"):
+            custom_activity_events = []
+            if len(span_name) > len("CUSTOM_ACTIVITY"):
+                custom_activity_events.append(str(span_name + "_START"))
+                custom_activity_events.append(str(span_name + "_END"))
+                # Check `custom_identity_int32` config file,
+                # parameter `single_activity_frequency` identifies
+                # which custom spans contain "CUSTOM_SINGLE_ACTIVITY" event
+                if int(span_name[-1]) % 3 == 0:
+                    custom_activity_events.append("CUSTOM_SINGLE_ACTIVITY")
+            else:
+                custom_activity_events = [
+                    "CUSTOM_ACTIVITY_START",
+                    "CUSTOM_ACTIVITY_END",
+                ]
+
+            self.assertTrue(
+                all(entry in events for entry in custom_activity_events),
+                "Span " + span_name,
+            )
+            self.assertEquals(
+                len(events), len(custom_activity_events), "Span " + span_name
+            )
 
     def _test_resource_attributes(self, attributes):
         """
@@ -487,6 +519,52 @@ def _test_simple_trace(self, headers=None):
             expected_parent_span_dict=expected_parent_span_dict,
         )
 
+    def _test_custom_identity_trace(self, headers=None):
+        """
+        Helper function that specifies the expected parameters to evaluate the
+        trace collected from running 1 inference request for the
+        `custom_identity_int32` model.
+        The number of custom spans is defined by the identity backend:
+        the `CUSTOM_ACTIVITY` span will always be there, and the
+        `CUSTOM_ACTIVITY<N>` spans are defined by `config.pbtxt` parameters.
+        """
+        expected_number_of_spans = 10
+        expected_counts = dict(
+            {
+                "compute": 1,
+                self.identity_model: 1,
+                self.root_span: 1,
+                "CUSTOM_ACTIVITY": 1,
+                "CUSTOM_ACTIVITY0": 1,
+                "CUSTOM_ACTIVITY1": 1,
+                "CUSTOM_ACTIVITY2": 1,
+                "CUSTOM_ACTIVITY3": 1,
+                "CUSTOM_ACTIVITY4": 1,
+                "CUSTOM_ACTIVITY5": 1,
+            }
+        )
+        expected_parent_span_dict = dict(
+            {
+                "InferRequest": ["custom_identity_int32"],
+                "custom_identity_int32": [
+                    "CUSTOM_ACTIVITY",
+                    "CUSTOM_ACTIVITY0",
+                    "compute",
+                ],
+                "CUSTOM_ACTIVITY0": ["CUSTOM_ACTIVITY1"],
+                "CUSTOM_ACTIVITY1": ["CUSTOM_ACTIVITY2"],
+                "CUSTOM_ACTIVITY2": ["CUSTOM_ACTIVITY3"],
+                "CUSTOM_ACTIVITY3": ["CUSTOM_ACTIVITY4"],
+                "CUSTOM_ACTIVITY4": ["CUSTOM_ACTIVITY5"],
+            }
+        )
+        self._test_trace(
+            headers=headers,
+            expected_number_of_spans=expected_number_of_spans,
+            expected_counts=expected_counts,
+            expected_parent_span_dict=expected_parent_span_dict,
+        )
+
     def _test_non_decoupled_trace(self, headers=None):
         """
         Helper function, that collects trace for non decoupled model and verifies it.
@@ -944,6 +1022,33 @@ def test_trace_context_exposed_to_pbe(self):
         context_pattern = re.compile(r"\d{2}-[0-9a-f]{32}-[0-9a-f]{16}-\d{2}")
         self.assertIsNotNone(re.match(context_pattern, context["traceparent"]))
 
+    def test_custom_backend_tracing(self):
+        """
+        Tests custom activities reported from identity backend.
+        """
+        input0_ = np.array([[4]], dtype=np.int32)
+        with httpclient.InferenceServerClient("localhost:8000", verbose=True) as client:
+            inputs = []
+            inputs.append(httpclient.InferInput("INPUT0", [1, 1], "INT32"))
+            inputs[0].set_data_from_numpy(input0_)
+            client.infer(self.identity_model, inputs=inputs)
+        self._test_custom_identity_trace()
+
+    def test_custom_backend_tracing_context_propagation(self):
+        """
+        Tests custom activities from identity backend when client headers are sent.
+        """
+        input0_ = np.array([[4]], dtype=np.int32)
+        with httpclient.InferenceServerClient("localhost:8000", verbose=True) as client:
+            inputs = []
+            inputs.append(httpclient.InferInput("INPUT0", [1, 1], "INT32"))
+            inputs[0].set_data_from_numpy(input0_)
+            client.infer(
+                self.identity_model, inputs=inputs, headers=self.client_headers
+            )
+
+        self._test_custom_identity_trace(headers=self.client_headers)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_trace/test.sh b/qa/L0_trace/test.sh
index 7d67afb3ba..d2943c1996 100755
--- a/qa/L0_trace/test.sh
+++ b/qa/L0_trace/test.sh
@@ -97,6 +97,16 @@ cp -r $DATADIR/$MODELBASE $MODELSDIR/simple && \
 cp -r ../L0_decoupled/models/repeat_int32 $MODELSDIR
 sed -i "s/decoupled: True/decoupled: False/" $MODELSDIR/repeat_int32/config.pbtxt
 
+# set up identity model
+mkdir -p $MODELSDIR/custom_identity_int32/1 && (cd $MODELSDIR/custom_identity_int32 && \
+    echo 'name: "custom_identity_int32"' >> config.pbtxt && \
+    echo 'backend: "identity"' >> config.pbtxt && \
+    echo 'max_batch_size: 1024' >> config.pbtxt && \
+    echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \
+    echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \
+    echo 'instance_group [{ kind: KIND_CPU }]' >> config.pbtxt && \
+    echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "500" } }, { key: "enable_custom_tracing" \n value: { string_value: "true" } }]' >> config.pbtxt)
+
 RET=0
 
 # Helpers =======================================
@@ -742,6 +752,60 @@ wait $SERVER_PID
 
 set +e
 
+# Custom backend tracing
+SERVER_ARGS="--model-control-mode=explicit --model-repository=$MODELSDIR
+            --load-model=custom_identity_int32 --trace-config=level=TIMESTAMPS \
+            --trace-config=triton,file=custom_tracing_triton.log \
+            --trace-config=rate=1 --trace-config=mode=triton"
+SERVER_LOG="./custom_backend_tracing.log"
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+# Send 1 inference request, should expect 3 custom activities:
+# CUSTOM_SINGLE_ACTIVITY, CUSTOM_ACTIVITY_START, CUSTOM_ACTIVITY_END
+rm -f ./curl.out
+data='{"inputs":[{"name":"INPUT0","datatype":"INT32","shape":[1,1],"data":[4]}]}'
+set +e
+code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/models/custom_identity_int32/infer -d ${data}`
+set -e
+if [ "$code" != "200" ]; then
+    cat ./curl.out
+    echo -e "\n***\n*** Test Failed\n***"
+    RET=1
+fi
+
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
+set +e
+
+
+$TRACE_SUMMARY -t custom_tracing_triton.log > summary_custom_tracing_triton.log
+
+if [ `grep -c "CUSTOM_SINGLE_ACTIVITY" summary_custom_tracing_triton.log` != "1" ]; then
+    cat summary_custom_tracing_triton.log
+    echo -e "\n***\n*** Test Failed: Unexpected number of traced "CUSTOM_SINGLE_ACTIVITY" events.\n***"
+    RET=1
+fi
+
+if [ `grep -c "CUSTOM_ACTIVITY_START" summary_custom_tracing_triton.log` != "1" ]; then
+    cat summary_custom_tracing_triton.log
+    echo -e "\n***\n*** Test Failed: Unexpected number of traced "CUSTOM_ACTIVITY_START" events.\n***"
+    RET=1
+fi
+
+if [ `grep -c "CUSTOM_ACTIVITY_END" summary_custom_tracing_triton.log` != "1" ]; then
+    cat summary_custom_tracing_triton.log
+    echo -e "\n***\n*** Test Failed: Unexpected number of traced "CUSTOM_ACTIVITY_END" events.\n***"
+    RET=1
+fi
+
 # Check opentelemetry trace exporter sends proper info.
 # A helper python script starts listening on $OTLP_PORT, where
 # OTLP exporter sends traces.
@@ -758,7 +822,7 @@ rm collected_traces.json*
 # Unittests then check that produced spans have expected format and events
 OPENTELEMETRY_TEST=opentelemetry_unittest.py
 OPENTELEMETRY_LOG="opentelemetry_unittest.log"
-EXPECTED_NUM_TESTS="17"
+EXPECTED_NUM_TESTS="19"
 
 # Set up repo and args for SageMaker
 export SAGEMAKER_TRITON_DEFAULT_MODEL_NAME="simple"
@@ -772,10 +836,20 @@ cp -r $DATADIR/$MODELBASE/* ${MODEL_PATH} && \
 # Add model to test trace context exposed to python backend
 mkdir -p $MODELSDIR/trace_context/1 && cp ./trace_context.py $MODELSDIR/trace_context/1/model.py
 
+# set up identity model
+rm -r ${MODELSDIR}/custom_identity_int32
+mkdir -p $MODELSDIR/custom_identity_int32/1 && (cd $MODELSDIR/custom_identity_int32 && \
+    echo 'name: "custom_identity_int32"' >> config.pbtxt && \
+    echo 'backend: "identity"' >> config.pbtxt && \
+    echo 'max_batch_size: 1024' >> config.pbtxt && \
+    echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \
+    echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \
+    echo 'instance_group [{ kind: KIND_CPU }]' >> config.pbtxt && \
+    echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "500" } }, { key: "enable_custom_tracing" \n value: { string_value: "true" } }, { key: "nested_span_count" \n value: { string_value: "6" } }, { key: "single_activity_frequency" \n value: { string_value: "3" } }]' >> config.pbtxt)
 
 SERVER_ARGS="--allow-sagemaker=true --model-control-mode=explicit \
                 --load-model=simple --load-model=ensemble_add_sub_int32_int32_int32 \
-                --load-model=repeat_int32 \
+                --load-model=repeat_int32 --load-model=custom_identity_int32\
                 --load-model=input_all_required \
                 --load-model=dynamic_batch \
                 --load-model=bls_simple --trace-config=level=TIMESTAMPS \
@@ -1164,5 +1238,4 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 set +e
-
 exit $RET
diff --git a/src/tracer.cc b/src/tracer.cc
index b17f5eb7e8..5557106dfd 100644
--- a/src/tracer.cc
+++ b/src/tracer.cc
@@ -28,8 +28,6 @@
 
 #include <stdlib.h>
 
-#include <unordered_map>
-
 #include "common.h"
 #include "triton/common/logging.h"
 #ifdef TRITON_ENABLE_GPU
@@ -410,6 +408,32 @@ TraceManager::Trace::CaptureTimestamp(
   }
 }
 
+std::string
+TraceManager::Trace::RetrieveActivityName(
+    TRITONSERVER_InferenceTrace* trace,
+    TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns)
+{
+  std::string activity_name =
+      TRITONSERVER_InferenceTraceActivityString(activity);
+
+  if (activity == TRITONSERVER_TRACE_CUSTOM_ACTIVITY) {
+    const char* val = nullptr;
+    LOG_TRITONSERVER_ERROR(
+        TRITONSERVER_InferenceTraceContext(trace, &val),
+        "Failed to retrieve trace context");
+    std::string context_str = (val != nullptr) ? std::string(val) : "";
+    triton::common::TritonJson::Value context;
+    LOG_TRITONSERVER_ERROR(
+        context.Parse(context_str), "Failed to parse trace context");
+    std::string look_for_key = std::to_string(timestamp_ns);
+    if (context.Find(look_for_key.c_str())) {
+      context.MemberAsString(look_for_key.c_str(), &activity_name);
+    }
+  }
+
+  return activity_name;
+}
+
 void
 TraceManager::InitTracer(const triton::server::TraceConfigMap& config_map)
 {
@@ -514,7 +538,7 @@ void
 TraceManager::Trace::StartSpan(
     TRITONSERVER_InferenceTrace* trace,
     TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
-    uint64_t trace_id)
+    uint64_t trace_id, std::string display_name)
 {
   uint64_t parent_id;
   LOG_TRITONSERVER_ERROR(
@@ -533,19 +557,10 @@ TraceManager::Trace::StartSpan(
   // the ensamble's main request. For this instance, the parent
   // span is the ensembles's request span.
   if ((parent_id == 0 && activity == TRITONSERVER_TRACE_REQUEST_START) ||
-      (activity == TRITONSERVER_TRACE_COMPUTE_START)) {
+      (activity == TRITONSERVER_TRACE_COMPUTE_START) ||
+      (activity == TRITONSERVER_TRACE_CUSTOM_ACTIVITY)) {
     span_parent_id = trace_id;
   }
-
-  std::string display_name = "compute";
-  const char* model_name;
-  if (activity == TRITONSERVER_TRACE_REQUEST_START) {
-    LOG_TRITONSERVER_ERROR(
-        TRITONSERVER_InferenceTraceModelName(trace, &model_name),
-        "getting model name");
-    display_name = model_name;
-  }
-
   auto span = StartSpan(display_name, timestamp_ns, span_parent_id);
 
   if (activity == TRITONSERVER_TRACE_REQUEST_START) {
@@ -557,7 +572,7 @@ TraceManager::Trace::StartSpan(
     LOG_TRITONSERVER_ERROR(
         TRITONSERVER_InferenceTraceRequestId(trace, &request_id),
         "getting request id");
-    span->SetAttribute("triton.model_name", model_name);
+    span->SetAttribute("triton.model_name", display_name);
     span->SetAttribute("triton.model_version", model_version);
     span->SetAttribute("triton.trace_id", trace_id);
     span->SetAttribute("triton.trace_parent_id", parent_id);
@@ -642,17 +657,40 @@ TraceManager::Trace::AddEvent(
     TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
     uint64_t trace_id)
 {
+  std::string activity_name =
+      RetrieveActivityName(trace, activity, timestamp_ns);
+  static std::string start = "_START";
+  static std::string end = "_END";
   if (activity == TRITONSERVER_TRACE_REQUEST_START ||
-      activity == TRITONSERVER_TRACE_COMPUTE_START) {
-    StartSpan(trace, activity, timestamp_ns, trace_id);
+      activity == TRITONSERVER_TRACE_COMPUTE_START ||
+      (activity == TRITONSERVER_TRACE_CUSTOM_ACTIVITY &&
+       activity_name.length() > start.length() &&
+       std::equal(start.rbegin(), start.rend(), activity_name.rbegin()))) {
+    std::string span_name = activity_name;
+
+    if (activity == TRITONSERVER_TRACE_CUSTOM_ACTIVITY) {
+      span_name =
+          activity_name.substr(0, activity_name.length() - start.length());
+    } else if (activity == TRITONSERVER_TRACE_REQUEST_START) {
+      const char* model_name;
+      LOG_TRITONSERVER_ERROR(
+          TRITONSERVER_InferenceTraceModelName(trace, &model_name),
+          "getting model name");
+      span_name = model_name;
+    } else if (activity == TRITONSERVER_TRACE_COMPUTE_START) {
+      span_name = "compute";
+    }
+
+    StartSpan(trace, activity, timestamp_ns, trace_id, span_name);
   }
 
-  AddEvent(
-      TRITONSERVER_InferenceTraceActivityString(activity), timestamp_ns,
-      trace_id);
+  AddEvent(activity_name, timestamp_ns, trace_id);
 
   if (activity == TRITONSERVER_TRACE_REQUEST_END ||
-      activity == TRITONSERVER_TRACE_COMPUTE_END) {
+      activity == TRITONSERVER_TRACE_COMPUTE_END ||
+      (activity == TRITONSERVER_TRACE_CUSTOM_ACTIVITY &&
+       activity_name.length() > end.length() &&
+       std::equal(end.rbegin(), end.rend(), activity_name.rbegin()))) {
     EndSpan(timestamp_ns, trace_id);
   }
 }
@@ -805,7 +843,8 @@ TraceManager::TraceActivity(
   }
 
   *ss << "{\"id\":" << id << ",\"timestamps\":["
-      << "{\"name\":\"" << TRITONSERVER_InferenceTraceActivityString(activity)
+      << "{\"name\":\""
+      << ts->RetrieveActivityName(trace, activity, timestamp_ns)
       << "\",\"ns\":" << timestamp_ns << "}]}";
 }
 
diff --git a/src/tracer.h b/src/tracer.h
index e33b16dbcb..8cdeb15121 100644
--- a/src/tracer.h
+++ b/src/tracer.h
@@ -231,6 +231,19 @@ class TraceManager {
     // with this trace.
     void CaptureTimestamp(const std::string& name, uint64_t timestamp_ns);
 
+    /// Returns activity name. For custom activities, retrieves the name from
+    /// the trace context. For other activities, returns default name.
+    ///
+    /// \param trace TRITONSERVER_InferenceTrace instance.
+    /// \param activity  Trace activity.
+    /// \param timestamp_ns Timestamp of the activity. For custom
+    /// activities, its decimal string form is the key under which the
+    /// activity name is looked up in the trace context; it is not used
+    /// for other activities.
+    std::string RetrieveActivityName(
+        TRITONSERVER_InferenceTrace* trace,
+        TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns);
+
 #if !defined(_WIN32) && defined(TRITON_ENABLE_TRACING)
     /// Reports TRITONSERVER_InferenceTraceActivity as event to
     /// the currently active span. If activity is an instance of
@@ -322,10 +335,11 @@ class TraceManager {
     /// OpenTelemetry SteadyTimestamp to calculate the duration on the span
     /// with better precision.
     /// \param trace_id Trace id.
+    /// \param display_name Span name.
     void StartSpan(
         TRITONSERVER_InferenceTrace* trace,
         TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
-        uint64_t trace_id);
+        uint64_t trace_id, std::string display_name);
 
     /// Ends the span on the top of the stack, related to trace with `trace_id`.
     ///

From dfbe63efabf78fd60c5d33c0b7cef6ab41edb3bb Mon Sep 17 00:00:00 2001
From: Kris Hung <krish@nvidia.com>
Date: Mon, 8 Jul 2024 08:54:50 -0700
Subject: [PATCH 05/10] build: Reduce intermediate layers (#7408)

---
 build.py | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/build.py b/build.py
index b9e68d45cc..24bde0f3a4 100755
--- a/build.py
+++ b/build.py
@@ -1082,25 +1082,20 @@ def create_dockerfile_linux(
 """
     if "tensorrtllm" in backends:
         df += """
-
-RUN ldconfig
-# Remove contents that are not needed in runtime
-RUN ARCH="$(uname -i)" \\
-      && rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data \\
-      && rm -fr  ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python \\
-      && rm -fr ${TRT_ROOT}/samples  ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples
-
 # Install required packages for TRT-LLM models
-RUN python3 -m pip install --upgrade pip \\
-      && pip3 install transformers
-
-# ldconfig for TRT-LLM
-RUN find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf
-RUN find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf
-
+# Remove contents that are not needed in runtime
 # Setuptools has breaking changes in version 70.0.0, so fix it to 69.5.1
 # The generated code in grpc_service_pb2_grpc.py depends on grpcio>=1.64.0, so fix it to 1.64.0
-RUN pip3 install setuptools==69.5.1 grpcio-tools==1.64.0
+RUN ldconfig && \
+    ARCH="$(uname -i)" && \
+    rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \
+    rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \
+    rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples && \
+    python3 -m pip install --upgrade pip && \
+    pip3 install --no-cache-dir transformers && \
+    find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf && \
+    find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf && \
+    pip3 install --no-cache-dir setuptools==69.5.1 grpcio-tools==1.64.0
 
 ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH
 """

From e0d80d46966bdf83d5dedf5b76f08e0aa40607ec Mon Sep 17 00:00:00 2001
From: Jacky <18255193+kthui@users.noreply.github.com>
Date: Mon, 8 Jul 2024 16:10:58 -0700
Subject: [PATCH 06/10] test: Remove AWS bucket on test failure (#7342)

Co-authored-by: Kris Hung <krish@nvidia.com>
---
 qa/L0_backend_python/env/test.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/qa/L0_backend_python/env/test.sh b/qa/L0_backend_python/env/test.sh
index b6963be615..ff9e368e75 100755
--- a/qa/L0_backend_python/env/test.sh
+++ b/qa/L0_backend_python/env/test.sh
@@ -253,6 +253,7 @@ run_server
 if [ "$SERVER_PID" == "0" ]; then
     echo -e "\n***\n*** Failed to start $SERVER\n***"
     cat $SERVER_LOG
+    aws s3 rb "${BUCKET_URL}" --force || true
     exit 1
 fi
 
@@ -286,6 +287,7 @@ run_server
 if [ "$SERVER_PID" == "0" ]; then
     echo -e "\n***\n*** Failed to start $SERVER\n***"
     cat $SERVER_LOG
+    aws s3 rb "${BUCKET_URL}" --force || true
     exit 1
 fi
 

From ac0d4d6f2f75102143b6c2a64497c91479ed069f Mon Sep 17 00:00:00 2001
From: Kris Hung <krish@nvidia.com>
Date: Wed, 10 Jul 2024 10:37:49 -0700
Subject: [PATCH 07/10] fix: Fix error message for L0_trt_compat (#7432)

---
 qa/L0_trt_compat/test.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qa/L0_trt_compat/test.sh b/qa/L0_trt_compat/test.sh
index 6b4f83cbc8..a8161369df 100755
--- a/qa/L0_trt_compat/test.sh
+++ b/qa/L0_trt_compat/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -64,7 +64,7 @@ if [ "$SERVER_PID" != "0" ]; then
     exit 1
 fi
 
-EXPECTED_ERR="Internal Error (Cannot deserialize engine with lean runtime"
+EXPECTED_ERR="Cannot deserialize engine with lean runtime"
 if ! grep "$EXPECTED_ERR" $SERVER_LOG; then
     cat $SERVER_LOG
     echo -e "\n***\n*** Failed to find expected error: ${EXPECTED_ERR} \n***"

From d1780d1fb04d81f04e605383a99432b6f7b26ee4 Mon Sep 17 00:00:00 2001
From: Shreyas Jain <shreyas.jain@samsung.com>
Date: Thu, 11 Jul 2024 00:40:48 +0530
Subject: [PATCH 08/10] feat: Support for request id field in generate API
 (#7392)

---
 docs/protocol/extension_generate.md  | 12 ++++++--
 qa/L0_http/generate_endpoint_test.py | 43 ++++++++++++++++++++++++++++
 qa/L0_http/test.sh                   |  2 +-
 src/http_server.cc                   |  2 ++
 4 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/docs/protocol/extension_generate.md b/docs/protocol/extension_generate.md
index b54b0caffb..043339eb4a 100644
--- a/docs/protocol/extension_generate.md
+++ b/docs/protocol/extension_generate.md
@@ -1,5 +1,5 @@
 <!--
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -87,10 +87,12 @@ return an error.
 
     $generate_request =
     {
+      "id" : $string, #optional
       "text_input" : $string,
       "parameters" : $parameters #optional
     }
 
+* "id" : An identifier for this request. Optional, but if specified this identifier must be returned in the response.
 * "text_input" : The text input that the model should generate output from.
 * "parameters" : An optional object containing zero or more parameters for this
   generate request expressed as key/value pairs. See
@@ -121,14 +123,15 @@ specification to set the parameters.
 Below is an example to send generate request with additional model parameters `stream` and `temperature`.
 
 ```
-$ curl -X POST localhost:8000/v2/models/mymodel/generate -d '{"text_input": "client input", "parameters": {"stream": false, "temperature": 0}}'
+$ curl -X POST localhost:8000/v2/models/mymodel/generate -d '{"id": "42", "text_input": "client input", "parameters": {"stream": false, "temperature": 0}}'
 
 POST /v2/models/mymodel/generate HTTP/1.1
 Host: localhost:8000
 Content-Type: application/json
 Content-Length: <xx>
 {
-  "text_input":  "client input",
+  "id" : "42",
+  "text_input" :  "client input",
   "parameters" :
     {
       "stream": false,
@@ -145,11 +148,13 @@ the HTTP body.
 
     $generate_response =
     {
+      "id" : $string, #optional
       "model_name" : $string,
       "model_version" : $string,
       "text_output" : $string
     }
 
+* "id" : The "id" identifier given in the request, if any.
 * "model_name" : The name of the model used for inference.
 * "model_version" : The specific model version used for inference.
 * "text_output" : The output of the inference.
@@ -159,6 +164,7 @@ the HTTP body.
 ```
 200
 {
+  "id" : "42",
   "model_name" : "mymodel",
   "model_version" : "1",
   "text_output" : "model output"
diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py
index 8c44ad8419..a9a972e02a 100755
--- a/qa/L0_http/generate_endpoint_test.py
+++ b/qa/L0_http/generate_endpoint_test.py
@@ -142,6 +142,49 @@ def test_generate(self):
         self.assertIn("TEXT", data)
         self.assertEqual(text, data["TEXT"])
 
+    def test_request_id(self):
+        # Setup text based input
+        text = "hello world"
+        request_id = "42"
+
+        # Test when request id in request body
+        inputs = {"PROMPT": text, "id": request_id, "STREAM": False}
+        r = self.generate(self._model_name, inputs)
+        r.raise_for_status()
+
+        self.assertIn("Content-Type", r.headers)
+        self.assertEqual(r.headers["Content-Type"], "application/json")
+
+        data = r.json()
+        self.assertIn("id", data)
+        self.assertEqual(request_id, data["id"])
+        self.assertIn("TEXT", data)
+        self.assertEqual(text, data["TEXT"])
+
+        # Test when request id not in request body
+        inputs = {"PROMPT": text, "STREAM": False}
+        r = self.generate(self._model_name, inputs)
+        r.raise_for_status()
+
+        self.assertIn("Content-Type", r.headers)
+        self.assertEqual(r.headers["Content-Type"], "application/json")
+
+        data = r.json()
+        self.assertNotIn("id", data)
+
+        # Test when request id is empty
+        inputs = {"PROMPT": text, "id": "", "STREAM": False}
+        r = self.generate(self._model_name, inputs)
+        r.raise_for_status()
+
+        self.assertIn("Content-Type", r.headers)
+        self.assertEqual(r.headers["Content-Type"], "application/json")
+
+        data = r.json()
+        self.assertNotIn("id", data)
+        self.assertIn("TEXT", data)
+        self.assertEqual(text, data["TEXT"])
+
     def test_generate_stream(self):
         # Setup text-based input
         text = "hello world"
diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh
index 6025adf969..321c398995 100755
--- a/qa/L0_http/test.sh
+++ b/qa/L0_http/test.sh
@@ -662,7 +662,7 @@ fi
 ## Python Unit Tests
 TEST_RESULT_FILE='test_results.txt'
 PYTHON_TEST=generate_endpoint_test.py
-EXPECTED_NUM_TESTS=15
+EXPECTED_NUM_TESTS=16
 set +e
 python $PYTHON_TEST >$CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
diff --git a/src/http_server.cc b/src/http_server.cc
index cd3f5853ca..68b22ae649 100644
--- a/src/http_server.cc
+++ b/src/http_server.cc
@@ -3327,6 +3327,8 @@ HTTPAPIServer::HandleGenerate(
   //   thus the string must live as long as the JSON message).
   triton::common::TritonJson::Value request;
   RETURN_AND_CALLBACK_IF_ERR(EVRequestToJson(req, &request), error_callback);
+  RETURN_AND_CALLBACK_IF_ERR(
+      ParseJsonTritonRequestID(request, irequest), error_callback);
 
   RETURN_AND_CALLBACK_IF_ERR(
       generate_request->ConvertGenerateRequest(

From 3dbf09e19c51dbf4fea584b9dc3a5da4552da994 Mon Sep 17 00:00:00 2001
From: Jacky <18255193+kthui@users.noreply.github.com>
Date: Thu, 11 Jul 2024 17:24:56 -0700
Subject: [PATCH 09/10] perf: Improve response throughput of a single gRPC
 stream (#7404)

---
 src/grpc/infer_handler.h         |   5 +-
 src/grpc/stream_infer_handler.cc | 196 ++++++++++++++++++++-----------
 src/grpc/stream_infer_handler.h  |   3 +-
 3 files changed, 133 insertions(+), 71 deletions(-)

diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h
index 6ef03807a2..0e1091feb8 100644
--- a/src/grpc/infer_handler.h
+++ b/src/grpc/infer_handler.h
@@ -1,4 +1,4 @@
-// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -979,6 +979,9 @@ class InferHandlerState {
     // Tracks all the states that have been created on this context.
     std::set<InferHandlerStateType*> all_states_;
 
+    // Ready to write queue for decoupled
+    std::queue<InferHandlerStateType*> ready_to_write_states_;
+
     // The step of the entire context.
     Steps step_;
 
diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc
index 269808c78a..585f88d536 100644
--- a/src/grpc/stream_infer_handler.cc
+++ b/src/grpc/stream_infer_handler.cc
@@ -359,13 +359,38 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok)
       response->mutable_infer_response()->Clear();
       // repopulate the id so that client knows which request failed.
       response->mutable_infer_response()->set_id(request.id());
-      state->step_ = Steps::WRITEREADY;
       if (!state->is_decoupled_) {
+        state->step_ = Steps::WRITEREADY;
         state->context_->WriteResponseIfReady(state);
       } else {
-        state->response_queue_->MarkNextResponseComplete();
-        state->complete_ = true;
-        state->context_->PutTaskBackToQueue(state);
+        InferHandler::State* writing_state = nullptr;
+        std::lock_guard<std::recursive_mutex> lk1(state->context_->mu_);
+        {
+          std::lock_guard<std::recursive_mutex> lk2(state->step_mtx_);
+          state->response_queue_->MarkNextResponseComplete();
+          state->context_->ready_to_write_states_.push(state);
+          if (!state->context_->ongoing_write_) {
+            // Only one write is allowed per gRPC stream / context at any time.
+            // If the stream is not currently writing, start writing the next
+            // ready to write response from the next ready to write state from
+            // 'ready_to_write_states_'. If there are other responses on the
+            // state ready to be written after starting the write, the state
+            // will be placed at the back of the 'ready_to_write_states_'. If
+            // there are no other responses, the state will be marked as
+            // 'ISSUED' if the complete final flag is not yet received from the
+            // backend, or completed if the complete final flag is received.
+            // The 'ongoing_write_' will reset once the completion queue returns
+            // a written state and no additional response on the stream is ready
+            // to be written.
+            state->context_->ongoing_write_ = true;
+            writing_state = state->context_->ready_to_write_states_.front();
+            state->context_->ready_to_write_states_.pop();
+          }
+          state->complete_ = true;
+        }
+        if (writing_state != nullptr) {
+          StateWriteResponse(writing_state);
+        }
       }
     }
 
@@ -451,7 +476,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok)
     //  Decoupled state transitions
     //
     if (state->step_ == Steps::WRITTEN) {
-      state->context_->ongoing_write_ = false;
 #ifdef TRITON_ENABLE_TRACING
       state->trace_timestamps_.emplace_back(
           std::make_pair("GRPC_SEND_END", TraceManager::CaptureTimestamp()));
@@ -469,54 +493,44 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok)
         state->context_->finish_ok_ = false;
       }
 
-      // Finish the state if all the transactions associated with
-      // the state have completed.
-      if (state->IsComplete()) {
-        state->context_->DecrementRequestCounter();
-        finished = Finish(state);
-      } else {
-        std::lock_guard<std::recursive_mutex> lock(state->step_mtx_);
-
-        // If there is an available response to be written
-        // to the stream, then transition directly to WRITEREADY
-        // state and enqueue itself to the completion queue to be
-        // taken up later. Otherwise, go to ISSUED state and wait
-        // for the callback to make a response available.
-        if (state->response_queue_->HasReadyResponse()) {
-          state->step_ = Steps::WRITEREADY;
-          state->context_->PutTaskBackToQueue(state);
-        } else {
-          state->step_ = Steps::ISSUED;
+      {
+        InferHandler::State* writing_state = nullptr;
+        std::lock_guard<std::recursive_mutex> lk1(state->context_->mu_);
+        {
+          std::lock_guard<std::recursive_mutex> lk2(state->step_mtx_);
+          if (!state->context_->ready_to_write_states_.empty()) {
+            writing_state = state->context_->ready_to_write_states_.front();
+            state->context_->ready_to_write_states_.pop();
+          } else {
+            state->context_->ongoing_write_ = false;
+          }
+          // Finish the state if all the transactions associated with
+          // the state have completed.
+          if (state != writing_state) {
+            if (state->IsComplete()) {
+              state->context_->DecrementRequestCounter();
+              finished = Finish(state);
+            } else {
+              state->step_ = Steps::ISSUED;
+            }
+          }
+        }
+        if (writing_state != nullptr) {
+          StateWriteResponse(writing_state);
         }
       }
     } else if (state->step_ == Steps::WRITEREADY) {
-      if (state->delay_response_ms_ != 0) {
-        // Will delay the write of the response by the specified time.
-        // This can be used to test the flow where there are other
-        // responses available to be written.
-        LOG_INFO << "Delaying the write of the response by "
-                 << state->delay_response_ms_ << " ms...";
-        std::this_thread::sleep_for(
-            std::chrono::milliseconds(state->delay_response_ms_));
-      }
-
       // Finish the state if all the transactions associated with
       // the state have completed.
       if (state->IsComplete()) {
         state->context_->DecrementRequestCounter();
         finished = Finish(state);
       } else {
-        // GRPC doesn't allow to issue another write till
-        // the notification from previous write has been
-        // delivered. If there is an ongoing write then
-        // defer writing and place the task at the back
-        // of the completion queue to be taken up later.
-        if (!state->context_->ongoing_write_) {
-          state->context_->ongoing_write_ = true;
-          state->context_->DecoupledWriteResponse(state);
-        } else {
-          state->context_->PutTaskBackToQueue(state);
-        }
+        LOG_ERROR << "Should not print this! Decoupled should NOT write via "
+                     "WRITEREADY!";
+        // Remove the state from the completion queue
+        std::lock_guard<std::recursive_mutex> lock(state->step_mtx_);
+        state->step_ = Steps::ISSUED;
       }
     }
   }
@@ -524,6 +538,31 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok)
   return !finished;
 }
 
+// For decoupled only. Caller must ensure exclusive write.
+void
+ModelStreamInferHandler::StateWriteResponse(InferHandler::State* state)
+{
+  if (state->delay_response_ms_ != 0) {
+    // Will delay the write of the response by the specified time.
+    // This can be used to test the flow where there are other
+    // responses available to be written.
+    LOG_INFO << "Delaying the write of the response by "
+             << state->delay_response_ms_ << " ms...";
+    std::this_thread::sleep_for(
+        std::chrono::milliseconds(state->delay_response_ms_));
+  }
+  {
+    std::lock_guard<std::recursive_mutex> lock(state->step_mtx_);
+    state->step_ = Steps::WRITTEN;
+    // gRPC doesn't allow to issue another write till the notification from
+    // previous write has been delivered.
+    state->context_->DecoupledWriteResponse(state);
+    if (state->response_queue_->HasReadyResponse()) {
+      state->context_->ready_to_write_states_.push(state);
+    }
+  }
+}
+
 bool
 ModelStreamInferHandler::Finish(InferHandler::State* state)
 {
@@ -701,45 +740,64 @@ ModelStreamInferHandler::StreamInferResponseComplete(
     }
   }
 
-  // Update states to signal that response/error is ready to write to stream
-  {
+  if (state->IsGrpcContextCancelled()) {
     // Need to hold lock because the handler thread processing context
     // cancellation might have cancelled or marked the state for cancellation.
     std::lock_guard<std::recursive_mutex> lock(state->step_mtx_);
 
-    if (state->IsGrpcContextCancelled()) {
-      LOG_VERBOSE(1)
-          << "ModelStreamInferHandler::StreamInferResponseComplete, "
-          << state->unique_id_
-          << ", skipping writing response because of transaction was cancelled";
-
-      // If this was the final callback for the state
-      // then cycle through the completion queue so
-      // that state object can be released.
-      if (is_complete) {
-        state->step_ = Steps::CANCELLED;
-        state->context_->PutTaskBackToQueue(state);
-      }
+    LOG_VERBOSE(1)
+        << "ModelStreamInferHandler::StreamInferResponseComplete, "
+        << state->unique_id_
+        << ", skipping writing response because of transaction was cancelled";
 
-      state->complete_ = is_complete;
-      return;
+    // If this was the final callback for the state
+    // then cycle through the completion queue so
+    // that state object can be released.
+    if (is_complete) {
+      state->step_ = Steps::CANCELLED;
+      state->context_->PutTaskBackToQueue(state);
     }
 
-    if (state->is_decoupled_) {
+    state->complete_ = is_complete;
+    return;
+  }
+
+  if (state->is_decoupled_) {
+    InferHandler::State* writing_state = nullptr;
+    std::lock_guard<std::recursive_mutex> lk1(state->context_->mu_);
+    {
+      std::lock_guard<std::recursive_mutex> lk2(state->step_mtx_);
+      bool has_prev_ready_response = state->response_queue_->HasReadyResponse();
       if (response) {
         state->response_queue_->MarkNextResponseComplete();
       }
-      if (state->step_ == Steps::ISSUED) {
+      if (!has_prev_ready_response && response) {
+        state->context_->ready_to_write_states_.push(state);
+      }
+      if (!state->context_->ongoing_write_ &&
+          !state->context_->ready_to_write_states_.empty()) {
+        state->context_->ongoing_write_ = true;
+        writing_state = state->context_->ready_to_write_states_.front();
+        state->context_->ready_to_write_states_.pop();
+      }
+      if (is_complete && state->response_queue_->IsEmpty() &&
+          state->step_ == Steps::ISSUED) {
+        // The response queue is empty and complete final flag is received, so
+        // mark the state as 'WRITEREADY' so it can be cleaned up later.
         state->step_ = Steps::WRITEREADY;
         state->context_->PutTaskBackToQueue(state);
       }
-    } else {
-      state->step_ = Steps::WRITEREADY;
-      if (is_complete) {
-        state->context_->WriteResponseIfReady(state);
-      }
+      state->complete_ = is_complete;
+    }
+    if (writing_state != nullptr) {
+      StateWriteResponse(writing_state);
+    }
+  } else {  // non-decoupled
+    std::lock_guard<std::recursive_mutex> lock(state->step_mtx_);
+    state->step_ = Steps::WRITEREADY;
+    if (is_complete) {
+      state->context_->WriteResponseIfReady(state);
     }
-
     state->complete_ = is_complete;
   }
 }
diff --git a/src/grpc/stream_infer_handler.h b/src/grpc/stream_infer_handler.h
index 60c4530227..e5163eac59 100644
--- a/src/grpc/stream_infer_handler.h
+++ b/src/grpc/stream_infer_handler.h
@@ -1,4 +1,4 @@
-// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -112,6 +112,7 @@ class ModelStreamInferHandler
   static void StreamInferResponseComplete(
       TRITONSERVER_InferenceResponse* response, const uint32_t flags,
       void* userp);
+  static void StateWriteResponse(InferHandler::State* state);
   bool Finish(State* state);
 
   TraceManager* trace_manager_;

From 70a0eeeb58d1c67b247d6a77fd1c2c226c87ad3c Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Thu, 11 Jul 2024 18:24:52 -0700
Subject: [PATCH 10/10] test: Tests for Metrics API enhancement to include
 error counters (#7423)

---
 docs/user_guide/metrics.md                    | 19 +++++++
 .../lifecycle/lifecycle_test.py               | 35 ++++++++++++-
 qa/L0_model_queue/model_queue_test.py         | 50 ++++++++++++++++++-
 qa/L0_request_cancellation/scheduler_test.py  | 43 +++++++++++++++-
 4 files changed, 144 insertions(+), 3 deletions(-)

diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md
index 1e70bac86c..8eb26d0bf5 100644
--- a/docs/user_guide/metrics.md
+++ b/docs/user_guide/metrics.md
@@ -100,6 +100,25 @@ Count*. The count metrics are illustrated by the following examples:
 |              |Execution Count |`nv_inference_exec_count` |Number of inference batch executions (see [Inference Request Metrics](#inference-request-metrics), does not include cached requests)|Per model|Per request|
 |              |Pending Request Count |`nv_inference_pending_request_count` |Number of inference requests awaiting execution by a backend. This number is incremented when a request is enqueued to the server (`TRITONSERVER_ServerInferAsync`) and is decremented when a backend is about to start executing the request. More details can be found below. |Per model|Per request|
 
+#### Failure Count Categories
+
+| Failed Request Reason |Description |
+|------------|------------|
+| REJECTED  | Number of inference failures due to request timeout in the scheduler. |
+| CANCELED  |  Number of inference failures due to request cancellation in the core. |
+| BACKEND |  Number of inference failures during execution of requests in the backend/model. |
+| OTHER  | Number of inference failures due to other uncategorized reasons in the core. |
+
+> **Note**
+>
+> Ensemble failure metrics will reflect the failure counts of their composing models as well as the parent model, but currently do not capture the same granularity for the "reason" label and will default to the "OTHER" reason.
+>
+> For example, if EnsembleA contains ModelA, and ModelA experiences a failed request due to a queue/backlog timeout in the scheduler, ModelA will have a failed request metric reflecting `reason=REJECTED` and `count=1`.
+> Additionally, EnsembleA will have a failed request metric reflecting `reason=OTHER` and `count=2`.
+> The `count=2` reflects 1 from the internally failed request captured by ModelA, as well as 1 from the failed top-level request sent to EnsembleA by the user/client.
+> The `reason=OTHER` reflects the fact that the ensemble doesn't currently capture the specific reason why
+> ModelA's request failed at this time.
+
 #### Pending Request Count (Queue Size) Per-Model
 
 The *Pending Request Count* reflects the number of requests that have been
diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py
index cea94a1dad..883f6d20b6 100755
--- a/qa/L0_backend_python/lifecycle/lifecycle_test.py
+++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py
@@ -27,8 +27,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
+import re
 import sys
 
+import requests
+
 sys.path.append("../../common")
 
 import queue
@@ -63,6 +66,29 @@ class LifecycleTest(unittest.TestCase):
     def setUp(self):
         self._shm_leak_detector = shm_util.ShmLeakDetector()
 
+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     def test_error_code(self):
         model_name = "error_code"
         shape = [1, 1]
@@ -181,7 +207,7 @@ def test_batch_error(self):
     def test_infer_pymodel_error(self):
         model_name = "wrong_model"
         shape = [2, 2]
-
+        initial_metrics_value = self._metrics_before_test(model_name, "BACKEND")
         with self._shm_leak_detector.Probe() as shm_probe:
             with httpclient.InferenceServerClient(
                 f"{_tritonserver_ipaddr}:8000"
@@ -207,6 +233,13 @@ def test_infer_pymodel_error(self):
                     self.assertTrue(
                         False, "Wrong exception raised or did not raise an exception"
                     )
+        expected_count_increase = 1
+        self._assert_metrics(
+            model_name,
+            "BACKEND",
+            expected_count_increase,
+            initial_metrics_value,
+        )
 
 
 if __name__ == "__main__":
diff --git a/qa/L0_model_queue/model_queue_test.py b/qa/L0_model_queue/model_queue_test.py
index e7be471f79..025d126417 100755
--- a/qa/L0_model_queue/model_queue_test.py
+++ b/qa/L0_model_queue/model_queue_test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -30,6 +30,7 @@
 
 sys.path.append("../common")
 
+import re
 import threading
 import time
 import unittest
@@ -38,6 +39,7 @@
 
 import infer_util as iu
 import numpy as np
+import requests
 import test_util as tu
 from tritonclientutils import InferenceServerException
 
@@ -69,6 +71,29 @@ def check_deferred_exception(self):
                 _deferred_exceptions.pop(0)
                 raise first_exception
 
+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     def check_response(
         self,
         bs,
@@ -235,6 +260,12 @@ def test_policy_reject(self):
         # requests are sent after 'default_timeout_microseconds'.
         # Expect the first request is timed-out and rejected, which makes the
         # second and third request be batched together and executed.
+        initial_metrics_value_ensemble = self._metrics_before_test(
+            "ensemble_zero_1_float32", "OTHER"
+        )
+        initial_metrics_value_custom = self._metrics_before_test(
+            "custom_zero_1_float32", "REJECTED"
+        )
         dtype = np.float32
         shapes = ([16],)
         for trial in self.trials_:
@@ -283,6 +314,23 @@ def test_policy_reject(self):
                 self.check_deferred_exception()
             except InferenceServerException as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
+        expected_count_increase = 4
+        # NOTE: Ensemble failure metrics will reflect the failure counts
+        # of their composing models as well as the parent model, but currently do not capture the same granularity
+        # for the "reason" label and will default to the "OTHER" reason.
+        self._assert_metrics(
+            "ensemble_zero_1_float32",
+            "OTHER",
+            expected_count_increase,
+            initial_metrics_value_ensemble,
+        )
+        expected_count_increase = 4
+        self._assert_metrics(
+            "custom_zero_1_float32",
+            "REJECTED",
+            expected_count_increase,
+            initial_metrics_value_custom,
+        )
 
     def test_timeout_override(self):
         # Send requests with batch sizes 1, 1, 3 where the first request
diff --git a/qa/L0_request_cancellation/scheduler_test.py b/qa/L0_request_cancellation/scheduler_test.py
index a6cd97efaa..900073ea7d 100755
--- a/qa/L0_request_cancellation/scheduler_test.py
+++ b/qa/L0_request_cancellation/scheduler_test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -27,10 +27,12 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import concurrent.futures
+import re
 import time
 import unittest
 
 import numpy as np
+import requests
 import tritonclient.grpc as grpcclient
 from tritonclient.utils import InferenceServerException
 
@@ -84,6 +86,29 @@ def _assert_streaming_response_is_cancelled(self, response):
                     cancelled_count += 1
         self.assertEqual(cancelled_count, 1)
 
+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     # Test queued requests on dynamic batch scheduler can be cancelled
     def test_dynamic_batch_scheduler_request_cancellation(self):
         model_name = "dynamic_batch"
@@ -114,6 +139,7 @@ def test_dynamic_batch_scheduler_request_cancellation(self):
     # Test backlogged requests on sequence batch scheduler can be cancelled
     def test_sequence_batch_scheduler_backlog_request_cancellation(self):
         model_name = "sequence_direct"
+        initial_metrics_value = self._metrics_before_test(model_name, "CANCELED")
         with concurrent.futures.ThreadPoolExecutor() as pool:
             # Saturate the single sequence slot
             saturate_thread = pool.submit(
@@ -149,11 +175,26 @@ def test_sequence_batch_scheduler_backlog_request_cancellation(self):
             self._assert_response_is_cancelled(backlog_requests[1]["response"])
             # Join saturating thread
             saturate_thread.result()
+        expected_count_increase = 2
+        self._assert_metrics(
+            model_name,
+            "CANCELED",
+            expected_count_increase,
+            initial_metrics_value,
+        )
 
     # Test queued requests on direct sequence batch scheduler can be cancelled
     def test_direct_sequence_batch_scheduler_request_cancellation(self):
         model_name = "sequence_direct"
+        initial_metrics_value = self._metrics_before_test(model_name, "CANCELED")
         self._test_sequence_batch_scheduler_queued_request_cancellation(model_name)
+        expected_count_increase = 2
+        self._assert_metrics(
+            model_name,
+            "CANCELED",
+            expected_count_increase,
+            initial_metrics_value,
+        )
 
     # Test queued requests on oldest sequence batch scheduler can be cancelled
     def test_oldest_sequence_batch_scheduler_request_cancellation(self):