diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index ad04d57f8..42154cdca 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1722,9 +1722,9 @@ TRITONBACKEND_BackendAttributeSetParallelModelInstanceLoading( /// /// \param batcher User-defined placeholder for backend to store and /// retrieve information about the batching strategy for this -/// model.RITONBACKEND_ISPEC return a TRITONSERVER_Error indicating success or -/// failure. \param model The backend model for which Triton is forming a batch. -/// \return a TRITONSERVER_Error indicating success or failure. +/// model. Returns a TRITONSERVER_Error indicating success +/// or failure. \param model The backend model for which Triton is forming a +/// batch. \return a TRITONSERVER_Error indicating success or failure. TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelBatcherInitialize( TRITONBACKEND_Batcher** batcher, TRITONBACKEND_Model* model); diff --git a/include/triton/core/tritonserver.h b/include/triton/core/tritonserver.h index b8e25df72..ef5a45d6a 100644 --- a/include/triton/core/tritonserver.h +++ b/include/triton/core/tritonserver.h @@ -91,7 +91,7 @@ struct TRITONSERVER_MetricFamily; /// } /// #define TRITONSERVER_API_VERSION_MAJOR 1 -#define TRITONSERVER_API_VERSION_MINOR 32 +#define TRITONSERVER_API_VERSION_MINOR 33 /// Get the TRITONBACKEND API version supported by the Triton shared /// library. This value can be compared against the @@ -732,7 +732,8 @@ typedef enum tritonserver_traceactivity_enum { TRITONSERVER_TRACE_REQUEST_END = 6, TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT = 7, TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT = 8, - TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT = 9 + TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT = 9, + TRITONSERVER_TRACE_CUSTOM_ACTIVITY = 10 } TRITONSERVER_InferenceTraceActivity; /// Get the string representation of a trace activity. The returned @@ -838,6 +839,18 @@ TRITONSERVER_InferenceTraceTensorNew( TRITONSERVER_InferenceTraceTensorActivityFn_t tensor_activity_fn, TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp); +/// Report a trace activity. All the traces reported using this API will be +/// using TRITONSERVER_TRACE_CUSTOM_ACTIVITY type. +/// +/// \param trace The trace object. +/// \param timestamp The timestamp associated with the trace activity. +/// \param name The trace activity name. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONSERVER_DECLSPEC TRITONSERVER_Error* +TRITONSERVER_InferenceTraceReportActivity( + TRITONSERVER_InferenceTrace* trace, uint64_t timestamp, + const char* activity_name); + /// Delete a trace object. /// /// \param trace The trace object. @@ -921,7 +934,6 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_InferenceTraceSetContext( struct TRITONSERVER_InferenceTrace* trace, const char* trace_context); - /// Get TRITONSERVER_InferenceTrace context. /// /// \param trace The trace. diff --git a/python/test/test_api.py b/python/test/test_api.py index 75ca9391e..93124c470 100644 --- a/python/test/test_api.py +++ b/python/test/test_api.py @@ -345,11 +345,6 @@ def test_ready(self): server = tritonserver.Server(self._server_options).start() self.assertTrue(server.ready()) - @pytest.mark.xfail( - tritonserver.__version__ <= "2.48.0", - reason="Known issue on stop: Exit timeout expired. 
Exiting immediately", - raises=tritonserver.InternalError, - ) def test_stop(self): server = tritonserver.Server(self._server_options).start(wait_until_ready=True) diff --git a/python/tritonserver/_c/tritonserver_pybind.cc b/python/tritonserver/_c/tritonserver_pybind.cc index 127bb15b8..6017b3d7e 100644 --- a/python/tritonserver/_c/tritonserver_pybind.cc +++ b/python/tritonserver/_c/tritonserver_pybind.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -1434,7 +1434,18 @@ class PyServer : public PyWrapper { owned_ = true; } - void Stop() const { ThrowIfError(TRITONSERVER_ServerStop(triton_object_)); } + void Stop() const + { + // ServerStop is blocking for the duration of the server exit timeout, so + // ensure to release the GIL. This can allow request release callbacks + // to be interleaved while server is waiting for live requests/models + // to complete. Without releasing GIL, this function may acquire the GIL + // first and block the Triton request from being released/freed, thus + // blocking the server's shutdown in a circular manner thinking a model is + // still alive. + py::gil_scoped_release release; + ThrowIfError(TRITONSERVER_ServerStop(triton_object_)); + } void RegisterModelRepository( const std::string& repository_path, diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index c6832a3a1..1aa8a9c48 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -36,6 +36,7 @@ #include "backend_config.h" #include "backend_model.h" #include "cuda_utils.h" +#include "infer_stats.h" #include "metrics.h" #include "model_config.pb.h" #include "numa_utils.h" @@ -558,7 +559,8 @@ TritonModelInstance::PrepareRequestsOrRespond( // If any errors occurred, respond with error for each request. if (!status.IsOk()) { for (auto& r : requests) { - InferenceRequest::RespondIfError(r, status, true /* release_requests */); + InferenceRequest::RespondIfError( + r, status, true /* release_requests */, FailureReason::OTHER); } // Log a single error for batch of requests for better visibility LOG_STATUS_ERROR(status, "Requests failed pre-execution checks"); @@ -685,7 +687,16 @@ TritonModelInstance::Execute( for (TRITONBACKEND_Request* tr : triton_requests) { std::unique_ptr ur( reinterpret_cast(tr)); - InferenceRequest::RespondIfError(ur, status, true /* release_requests */); + // NOTE: If a backend both returns an error in + // TRITONBACKEND_ModelInstanceExecute and reports an error with + // TRITONBACKEND_ModelInstanceReportStatistics, this can result in double + // counting of the failure metric for the same request. However, it is + // currently not expected for this to be a common case, as the return + // value of TRITONBACKEND_ModelInstanceExecute is used to express + // ownership of the request rather than success of an inference request. + // See tritonbackend.h for more details on this. 
+ InferenceRequest::RespondIfError( + ur, status, true /* release_requests */, FailureReason::BACKEND); } TRITONSERVER_ErrorDelete(err); diff --git a/src/dynamic_batch_scheduler.cc b/src/dynamic_batch_scheduler.cc index ac7aa1276..b5f8ac825 100644 --- a/src/dynamic_batch_scheduler.cc +++ b/src/dynamic_batch_scheduler.cc @@ -50,11 +50,12 @@ IsStaleState(Payload::State payload_state) void FinishSkippedRequests( std::vector>>&& requests, - const Status& response_status) + const Status& response_status, FailureReason reason) { for (auto& queue : requests) { for (auto& request : queue) { - InferenceRequest::RespondIfError(request, response_status, true); + InferenceRequest::RespondIfError( + request, response_status, true /* release_requests */, reason); } } } @@ -69,8 +70,10 @@ FinishRejectedCancelledRequests( const static Status rejected_status = Status(Status::Code::UNAVAILABLE, "Request timeout expired"); const static Status cancelled_status = Status(Status::Code::CANCELLED); - FinishSkippedRequests(std::move(rejected_requests), rejected_status); - FinishSkippedRequests(std::move(cancelled_requests), cancelled_status); + FinishSkippedRequests( + std::move(rejected_requests), rejected_status, FailureReason::REJECTED); + FinishSkippedRequests( + std::move(cancelled_requests), cancelled_status, FailureReason::CANCELED); } DynamicBatchScheduler::DynamicBatchScheduler( diff --git a/src/ensemble_scheduler/ensemble_scheduler.cc b/src/ensemble_scheduler/ensemble_scheduler.cc index a16044062..b16567dd7 100644 --- a/src/ensemble_scheduler/ensemble_scheduler.cc +++ b/src/ensemble_scheduler/ensemble_scheduler.cc @@ -81,23 +81,26 @@ class RequestTracker { std::lock_guard lk(mtx_); inflight_request_counter_--; if (inflight_request_counter_ == 0) { + if (request_ != nullptr) { #ifdef TRITON_ENABLE_STATS - const auto& infer_stats = context_stats_aggregator_.ImmutableInferStats(); - request_->ReportStatisticsWithDuration( - metric_reporter_, status_.IsOk(), compute_start_ns_, - infer_stats.compute_input_duration_ns_, - infer_stats.compute_infer_duration_ns_, - infer_stats.compute_output_duration_ns_); - if (status_.IsOk()) { - stats_aggregator_->UpdateInferBatchStatsWithDuration( - metric_reporter_, std::max(1U, request_->BatchSize()), + const auto& infer_stats = + context_stats_aggregator_.ImmutableInferStats(); + request_->ReportStatisticsWithDuration( + metric_reporter_, status_.IsOk(), compute_start_ns_, infer_stats.compute_input_duration_ns_, infer_stats.compute_infer_duration_ns_, infer_stats.compute_output_duration_ns_); - } + if (status_.IsOk()) { + stats_aggregator_->UpdateInferBatchStatsWithDuration( + metric_reporter_, std::max(1U, request_->BatchSize()), + infer_stats.compute_input_duration_ns_, + infer_stats.compute_infer_duration_ns_, + infer_stats.compute_output_duration_ns_); + } #endif - InferenceRequest::Release( - std::move(request_), TRITONSERVER_REQUEST_RELEASE_ALL); + InferenceRequest::Release( + std::move(request_), TRITONSERVER_REQUEST_RELEASE_ALL); + } } return (inflight_request_counter_ == 0); } @@ -1136,7 +1139,8 @@ EnsembleContext::FinishEnsemble(std::unique_ptr&& response) "more " "ensemble steps can be made"); InferenceRequest::RespondIfError( - request_tracker_->Request(), ensemble_status_); + request_tracker_->Request(), ensemble_status_, + false /* release_requests */, FailureReason::OTHER); } else { request_tracker_->Request()->ResponseFactory()->SendFlags( TRITONSERVER_RESPONSE_COMPLETE_FINAL); @@ -1149,7 +1153,8 @@ EnsembleContext::FinishEnsemble(std::unique_ptr&& 
response) ensemble_status_); } else { InferenceRequest::RespondIfError( - request_tracker_->Request(), ensemble_status_); + request_tracker_->Request(), ensemble_status_, + false /* release_requests */, FailureReason::OTHER); } } diff --git a/src/infer_request.cc b/src/infer_request.cc index 1ad567075..3d93de31d 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -421,10 +421,25 @@ InferenceRequest::Run(std::unique_ptr& request) return status; } +FailureReason +stringToFailureReason(const std::string& error_type) +{ + if (error_type == "REJECTED") { + return FailureReason::REJECTED; + } + if (error_type == "CANCELED") { + return FailureReason::CANCELED; + } + if (error_type == "BACKEND") { + return FailureReason::BACKEND; + } + return FailureReason::OTHER; +} + void InferenceRequest::RespondIfError( std::unique_ptr& request, const Status& status, - const bool release_request) + const bool release_request, FailureReason reason) { if (status.IsOk()) { return; @@ -442,7 +457,10 @@ InferenceRequest::RespondIfError( InferenceResponse::SendWithStatus( std::move(response), TRITONSERVER_RESPONSE_COMPLETE_FINAL, status), (request->LogRequest() + "failed to send error response").c_str()); - +#ifdef TRITON_ENABLE_STATS + request->ReportErrorStatistics( + request->model_raw_->MetricReporter().get(), reason); +#endif // If releasing the request then invoke the release callback which // gives ownership to the callback. So can't access 'request' after // this point. @@ -452,20 +470,6 @@ InferenceRequest::RespondIfError( } } -void -InferenceRequest::RespondIfError( - std::vector>& requests, - const Status& status, const bool release_requests) -{ - if (status.IsOk()) { - return; - } - - for (auto& request : requests) { - RespondIfError(request, status, release_requests); - } -} - Status InferenceRequest::Release( std::unique_ptr&& request, const uint32_t release_flags) @@ -1389,6 +1393,21 @@ InferenceRequest::ValidateBytesInputs( } #ifdef TRITON_ENABLE_STATS + +void +InferenceRequest::ReportErrorStatistics( + MetricModelReporter* metric_reporter, FailureReason reason) +{ + INFER_STATS_DECL_TIMESTAMP(request_end_ns); + model_raw_->MutableStatsAggregator()->UpdateFailure( + metric_reporter, request_start_ns_, request_end_ns, reason); + if (secondary_stats_aggregator_ != nullptr) { + secondary_stats_aggregator_->UpdateFailure( + nullptr /* metric_reporter */, request_start_ns_, request_end_ns, + reason); + } +} + void InferenceRequest::ReportStatistics( MetricModelReporter* metric_reporter, bool success, @@ -1425,10 +1444,12 @@ InferenceRequest::ReportStatistics( } } else { model_raw_->MutableStatsAggregator()->UpdateFailure( - metric_reporter, request_start_ns_, request_end_ns); + metric_reporter, request_start_ns_, request_end_ns, + FailureReason::BACKEND); if (secondary_stats_aggregator_ != nullptr) { secondary_stats_aggregator_->UpdateFailure( - nullptr /* metric_reporter */, request_start_ns_, request_end_ns); + nullptr /* metric_reporter */, request_start_ns_, request_end_ns, + FailureReason::BACKEND); } } } @@ -1461,10 +1482,12 @@ InferenceRequest::ReportStatisticsWithDuration( } } else { model_raw_->MutableStatsAggregator()->UpdateFailure( - metric_reporter, request_start_ns_, request_end_ns); + metric_reporter, request_start_ns_, request_end_ns, + FailureReason::OTHER); if (secondary_stats_aggregator_ != nullptr) { secondary_stats_aggregator_->UpdateFailure( - nullptr /* metric_reporter */, request_start_ns_, request_end_ns); + nullptr /* metric_reporter */, request_start_ns_, 
request_end_ns, + FailureReason::OTHER); } } } @@ -1868,5 +1891,4 @@ operator!=( { return !(lhs == rhs); } - }} // namespace triton::core diff --git a/src/infer_request.h b/src/infer_request.h index ec30d9ca3..a38b141af 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -590,7 +590,8 @@ class InferenceRequest { // 'release_request' is true 'request' is returned as nullptr. static void RespondIfError( std::unique_ptr& request, const Status& status, - const bool release_request = false); + const bool release_request = false, + FailureReason reason = FailureReason::OTHER); // Send an error response to a set of 'requests'. If 'status' is // Success then no responses are sent and the requests are not @@ -603,7 +604,8 @@ class InferenceRequest { // returned with all nullptrs. static void RespondIfError( std::vector>& requests, - const Status& status, const bool release_requests = false); + const Status& status, const bool release_requests = false, + FailureReason reason = FailureReason::OTHER); // Release the request. Call the release callback and transfer // ownership of the request to the callback. On return 'request' is @@ -673,6 +675,16 @@ class InferenceRequest { const uint64_t compute_start_ns, const uint64_t compute_input_end_ns, const uint64_t compute_output_start_ns, const uint64_t compute_end_ns); + // Report the error statistics to stats collectors associated with the + // request. + // FIXME: A separate function may not be necessary here, but is being used + // cautiously in case of unforeseen issues such as possibly capturing a trace + // twice. This should be revisited and better tested to see if the + // ReportStatistics function can be used as-is for the newly captured failure + // cases. + void ReportErrorStatistics( + MetricModelReporter* metric_reporter, FailureReason reason); + // Report the statistics to stats collectors associated with the request. // Duration and timestamps provide two granularities for stats collectors. 
void ReportStatisticsWithDuration( diff --git a/src/infer_stats.cc b/src/infer_stats.cc index 68cf70a0c..47ab309cb 100644 --- a/src/infer_stats.cc +++ b/src/infer_stats.cc @@ -36,10 +36,28 @@ namespace triton { namespace core { #ifdef TRITON_ENABLE_STATS +// This function converts FailureReason enum values to std::string +std::string +failureReasonToString(FailureReason reason) +{ + switch (reason) { + case FailureReason::REJECTED: + return "REJECTED"; + case FailureReason::CANCELED: + return "CANCELED"; + case FailureReason::BACKEND: + return "BACKEND"; + case FailureReason::OTHER: + return "OTHER"; + default: + return "OTHER"; + } +} + void InferenceStatsAggregator::UpdateFailure( MetricModelReporter* metric_reporter, const uint64_t request_start_ns, - const uint64_t request_end_ns) + const uint64_t request_end_ns, FailureReason reason) { std::lock_guard lock(mu_); @@ -48,7 +66,8 @@ InferenceStatsAggregator::UpdateFailure( #ifdef TRITON_ENABLE_METRICS if (metric_reporter != nullptr) { - metric_reporter->IncrementCounter("inf_failure", 1); + std::string reason_str = failureReasonToString(reason); + metric_reporter->IncrementCounter("inf_failure_" + reason_str, 1); } #endif // TRITON_ENABLE_METRICS } diff --git a/src/infer_stats.h b/src/infer_stats.h index 66b3659bd..15b173f45 100644 --- a/src/infer_stats.h +++ b/src/infer_stats.h @@ -39,6 +39,9 @@ namespace triton { namespace core { +// Define the FailureReason enum within the triton::core namespace +enum class FailureReason { REJECTED, CANCELED, BACKEND, OTHER }; + class MetricModelReporter; @@ -136,7 +139,7 @@ class InferenceStatsAggregator { // Add durations to Infer stats for a failed inference request. void UpdateFailure( MetricModelReporter* metric_reporter, const uint64_t request_start_ns, - const uint64_t request_end_ns); + const uint64_t request_end_ns, FailureReason reason); // Add durations to infer stats for a successful inference request. 
void UpdateSuccess( diff --git a/src/infer_trace.cc b/src/infer_trace.cc index cce46e262..4301e2c5b 100644 --- a/src/infer_trace.cc +++ b/src/infer_trace.cc @@ -26,6 +26,13 @@ #include "infer_trace.h" +#define TRITONJSON_STATUSTYPE triton::core::Status +#define TRITONJSON_STATUSRETURN(M) \ + return triton::core::Status(triton::core::Status::Code::INTERNAL, (M)) +#define TRITONJSON_STATUSSUCCESS triton::core::Status::Success +#include "triton/common/logging.h" +#include "triton/common/triton_json.h" + namespace triton { namespace core { #ifdef TRITON_ENABLE_TRACING @@ -48,6 +55,26 @@ InferenceTrace::Release() release_fn_(reinterpret_cast(this), userp_); } +void +InferenceTrace::RecordActivityName( + uint64_t timestamp_ns, std::string activity_name) +{ + std::lock_guard lock(mu_); + triton::common::TritonJson::Value context_json( + triton::common::TritonJson::ValueType::OBJECT); + if (!context_.empty()) { + Status status = context_json.Parse(context_); + if (!status.IsOk()) { + LOG_ERROR << "Error parsing trace context"; + } + } + std::string key = std::to_string(timestamp_ns); + context_json.SetStringObject(key.c_str(), activity_name); + triton::common::TritonJson::WriteBuffer buffer; + context_json.Write(&buffer); + context_ = buffer.Contents(); +} + std::shared_ptr InferenceTraceProxy::SpawnChildTrace() { diff --git a/src/infer_trace.h b/src/infer_trace.h index 4de6df788..4f16cf380 100644 --- a/src/infer_trace.h +++ b/src/infer_trace.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "constants.h" #include "status.h" @@ -69,12 +70,17 @@ class InferenceTrace { void SetModelVersion(int64_t v) { model_version_ = v; } void SetRequestId(const std::string& request_id) { request_id_ = request_id; } void SetContext(const std::string& context) { context_ = context; } + void RecordActivityName(uint64_t timestamp_ns, std::string activity_name); // Report trace activity. void Report( - const TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns) + const TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns, + std::string activity_name = "") { if ((level_ & TRITONSERVER_TRACE_LEVEL_TIMESTAMPS) > 0) { + if (activity == TRITONSERVER_TRACE_CUSTOM_ACTIVITY) { + RecordActivityName(timestamp_ns, activity_name); + } activity_fn_( reinterpret_cast(this), activity, timestamp_ns, userp_); @@ -82,13 +88,15 @@ class InferenceTrace { } // Report trace activity at the current time. 
- void ReportNow(const TRITONSERVER_InferenceTraceActivity activity) + void ReportNow( + const TRITONSERVER_InferenceTraceActivity activity, + std::string activity_name = "") { if ((level_ & TRITONSERVER_TRACE_LEVEL_TIMESTAMPS) > 0) { - Report( - activity, std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()) - .count()); + auto now = std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); + Report(activity, now, activity_name); } } @@ -128,6 +136,7 @@ class InferenceTrace { // across traces static std::atomic next_id_; std::string context_; + std::mutex mu_; }; // @@ -152,6 +161,10 @@ class InferenceTraceProxy { void SetRequestId(const std::string& n) { trace_->SetRequestId(n); } void SetModelVersion(int64_t v) { trace_->SetModelVersion(v); } void SetContext(const std::string& context) { trace_->SetContext(context); } + void RecordActivityName(uint64_t timestamp_ns, std::string activity_name) + { + trace_->RecordActivityName(timestamp_ns, activity_name); + } void Report( const TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns) diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc index be43da844..9dd9122be 100644 --- a/src/metric_model_reporter.cc +++ b/src/metric_model_reporter.cc @@ -29,6 +29,7 @@ #ifdef TRITON_ENABLE_METRICS #include "constants.h" +#include "infer_stats.h" #include "triton/common/logging.h" // Global config group has 'name' of empty string. @@ -101,6 +102,13 @@ MetricReporterConfig::ParseQuantiles(std::string options) // // MetricModelReporter // +const std::map + MetricModelReporter::failure_reasons_map = { + {FailureReason::REJECTED, "REJECTED"}, + {FailureReason::CANCELED, "CANCELED"}, + {FailureReason::BACKEND, "BACKEND"}, + {FailureReason::OTHER, "OTHER"}}; + Status MetricModelReporter::Create( const ModelIdentifier& model_id, const int64_t model_version, @@ -189,7 +197,6 @@ MetricModelReporter::InitializeCounters( { // Always setup these counters, regardless of config counter_families_["inf_success"] = &Metrics::FamilyInferenceSuccess(); - counter_families_["inf_failure"] = &Metrics::FamilyInferenceFailure(); counter_families_["inf_count"] = &Metrics::FamilyInferenceCount(); counter_families_["inf_exec_count"] = &Metrics::FamilyInferenceExecutionCount(); @@ -227,6 +234,15 @@ MetricModelReporter::InitializeCounters( counters_[name] = CreateMetric(*family_ptr, labels); } } + + // Initialize failure metrics with reasons + for (const auto& reason_pair : failure_reasons_map) { + std::map extended_labels = labels; + extended_labels["reason"] = reason_pair.second; + counters_["inf_failure_" + reason_pair.second] = + CreateMetric( + Metrics::FamilyInferenceFailure(), extended_labels); + } } void diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index 5e2e073cf..9378905ae 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -94,6 +94,8 @@ class MetricModelReporter { // Lookup summary metric by name, and observe the value if it exists. 
void ObserveSummary(const std::string& name, double value); + static const std::map failure_reasons_map; + private: MetricModelReporter( const ModelIdentifier& model_id, const int64_t model_version, diff --git a/src/sequence_batch_scheduler/sequence_batch_scheduler.cc b/src/sequence_batch_scheduler/sequence_batch_scheduler.cc index cf3691959..74314e7ab 100644 --- a/src/sequence_batch_scheduler/sequence_batch_scheduler.cc +++ b/src/sequence_batch_scheduler/sequence_batch_scheduler.cc @@ -1,4 +1,4 @@ -// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -91,7 +91,9 @@ CancelRequests(std::vector>&& requests) LOG_ERROR << status.Message(); } // Respond the request as cancelled. - InferenceRequest::RespondIfError(req, cancelled_status, true); + InferenceRequest::RespondIfError( + req, cancelled_status, true /* release_requests */, + FailureReason::CANCELED); } } @@ -1173,7 +1175,9 @@ SequenceBatchScheduler::ReaperThread(const int nice) "timeout of the corresponding sequence has been expired"); for (auto& backlog : expired_backlogs) { for (auto& req : *backlog->queue_) { - InferenceRequest::RespondIfError(req, rejected_status, true); + InferenceRequest::RespondIfError( + req, rejected_status, true /* release_requests */, + FailureReason::REJECTED); } } } diff --git a/src/test/async_work_queue_test.cc b/src/test/async_work_queue_test.cc index 1d7b3be90..735086363 100644 --- a/src/test/async_work_queue_test.cc +++ b/src/test/async_work_queue_test.cc @@ -91,7 +91,6 @@ TEST_F(AsyncWorkQueueTest, WorkerCountInitialized) << "Expect 4 worker count for initialized queue"; } - TEST_F(AsyncWorkQueueTest, RunTasksInParallel) { auto AddTwoFn = [](const std::vector& lhs, const std::vector& rhs, @@ -181,10 +180,11 @@ TEST_F(AsyncWorkQueueTest, RunTasksInParallel) .count(); parallelized_duration = end_ts - start_ts; - // FIXME manual testing shows parallelized time is between 30% to 33.3% for - // 128 M total elements - EXPECT_LT(parallelized_duration, serialized_duration / 3) - << "Expected parallelized work was completed within 1/3 of serialized " + // FIXME manual testing shows parallelized time is between 30% to 33.3% + // for 128 M total elements, but is flaky in CI so relax the comparison + // for parallel to simply be faster than serial by any amount. 
+ EXPECT_LT(parallelized_duration, serialized_duration) + << "Expected parallelized work was completed faster than serialized " "time"; for (size_t count = 0; count < task_count; count++) { auto res = std::move(fs[count].get()); diff --git a/src/tritonserver.cc b/src/tritonserver.cc index eae83ef2f..82642d5dc 100644 --- a/src/tritonserver.cc +++ b/src/tritonserver.cc @@ -950,6 +950,8 @@ TRITONSERVER_InferenceTraceActivityString( return "TENSOR_BACKEND_INPUT"; case TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT: return "TENSOR_BACKEND_OUTPUT"; + case TRITONSERVER_TRACE_CUSTOM_ACTIVITY: + return "CUSTOM_ACTIVITY"; } return ""; @@ -1115,6 +1117,23 @@ TRITONSERVER_InferenceTraceSpawnChildTrace( #endif // TRITON_ENABLE_TRACING } +TRITONSERVER_DECLSPEC TRITONSERVER_Error* +TRITONSERVER_InferenceTraceReportActivity( + TRITONSERVER_InferenceTrace* trace, uint64_t timestamp, + const char* activity_name) +{ +#ifdef TRITON_ENABLE_TRACING + tc::InferenceTrace* ltrace = reinterpret_cast(trace); + if (trace != nullptr) { + ltrace->Report( + TRITONSERVER_TRACE_CUSTOM_ACTIVITY, timestamp, activity_name); + } + return nullptr; // Success +#else + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, "inference tracing not supported"); +#endif // TRITON_ENABLE_TRACING +} TRITONAPI_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceSetContext( diff --git a/src/tritonserver_stub.cc b/src/tritonserver_stub.cc index b8449d1bc..cd1e03e15 100644 --- a/src/tritonserver_stub.cc +++ b/src/tritonserver_stub.cc @@ -1115,6 +1115,11 @@ TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup() { } +TRITONAPI_DECLSPEC void +TRITONSERVER_InferenceTraceReportActivity() +{ +} + TRITONAPI_DECLSPEC void TRITONBACKEND_BackendAttributeSetParallelModelInstanceLoading() {
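Usage sketch for the new custom-trace entry point (not part of the patch): TRITONSERVER_InferenceTraceReportActivity lets a caller attach a named, timestamped activity to an existing trace. The sketch below assumes a valid TRITONSERVER_InferenceTrace* obtained elsewhere (for example, the trace associated with an in-flight request) and a build with TRITON_ENABLE_TRACING; it uses the same steady_clock nanosecond timestamps as the existing Report()/ReportNow() helpers in infer_trace.h.

// Minimal sketch; `trace` and `name` are supplied by the caller.
#include <chrono>
#include <cstdint>

#include "triton/core/tritonserver.h"

void
ReportCustomActivity(TRITONSERVER_InferenceTrace* trace, const char* name)
{
  // Match the nanosecond steady_clock timestamps used elsewhere in the
  // trace API.
  const uint64_t now_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          std::chrono::steady_clock::now().time_since_epoch())
          .count();

  // Reported with the new TRITONSERVER_TRACE_CUSTOM_ACTIVITY type; the
  // activity is only emitted when the trace level includes TIMESTAMPS.
  TRITONSERVER_Error* err =
      TRITONSERVER_InferenceTraceReportActivity(trace, now_ns, name);
  if (err != nullptr) {
    // Log or propagate as appropriate, then free the error object.
    TRITONSERVER_ErrorDelete(err);
  }
}

Each reported activity is also stored in the trace's context as a JSON member keyed by the timestamp string (see InferenceTrace::RecordActivityName above), so the activity names can later be recovered through the trace context getter.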
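The failure-metric changes above split the per-model inference failure counter by a new "reason" label (REJECTED, CANCELED, BACKEND, OTHER) and thread a FailureReason through RespondIfError and UpdateFailure. A minimal sketch of how a core scheduler path now reports rejected requests, using only the signatures added in this diff; the queue variable, function name, and namespace alias are illustrative.

// Sketch mirroring FinishRejectedCancelledRequests and the sequence-batcher
// reaper changes above; assumes the core headers touched by this diff.
#include <memory>
#include <vector>

#include "infer_request.h"
#include "infer_stats.h"
#include "status.h"

namespace tc = triton::core;

void
RejectExpiredRequests(
    std::vector<std::unique_ptr<tc::InferenceRequest>>& expired)
{
  static const tc::Status rejected_status(
      tc::Status::Code::UNAVAILABLE, "Request timeout expired");
  for (auto& request : expired) {
    // Sends the error response, releases the request, and updates the
    // failure statistics under FailureReason::REJECTED, which the metric
    // reporter maps to the counter labeled reason="REJECTED".
    tc::InferenceRequest::RespondIfError(
        request, rejected_status, true /* release_requests */,
        tc::FailureReason::REJECTED);
  }
}

Internally the reporter keys each counter as "inf_failure_" plus the string from failureReasonToString(), and MetricModelReporter::InitializeCounters() registers one counter per entry in failure_reasons_map from the existing inference-failure family with the extra "reason" label.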