From b9231d8dd2d96a8bd9a56984b5a0a06e7c507844 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 11 Oct 2024 17:25:55 -0700 Subject: [PATCH 01/10] Add histogram support and new TTFT metric --- src/backend_model_instance.cc | 2 +- src/ensemble_scheduler/ensemble_scheduler.cc | 5 +- src/infer_response.cc | 53 ++++++++++++++-- src/infer_response.h | 35 ++++++++++- src/metric_model_reporter.cc | 65 ++++++++++++++++++-- src/metric_model_reporter.h | 17 ++++- src/metrics.cc | 8 ++- src/metrics.h | 8 +++ src/model.cc | 2 +- src/model.h | 5 ++ src/test/response_cache_test.cc | 25 ++++++-- 11 files changed, 198 insertions(+), 27 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index 1aa8a9c48..ba35e2fbd 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -192,7 +192,7 @@ TritonModelInstance::TritonModelInstance( model_->Server()->ResponseCacheEnabled(); MetricModelReporter::Create( model_->ModelId(), model_->Version(), id, response_cache_enabled, - model_->Config().metric_tags(), &reporter_); + model_->isDecoupled(), model_->Config().metric_tags(), &reporter_); } #endif // TRITON_ENABLE_METRICS } diff --git a/src/ensemble_scheduler/ensemble_scheduler.cc b/src/ensemble_scheduler/ensemble_scheduler.cc index b16567dd7..8cd8be5a6 100644 --- a/src/ensemble_scheduler/ensemble_scheduler.cc +++ b/src/ensemble_scheduler/ensemble_scheduler.cc @@ -1469,12 +1469,13 @@ EnsembleScheduler::EnsembleScheduler( } #endif // TRITON_ENABLE_GPU + const bool is_decoupled = config.model_transaction_policy().decoupled(); #ifdef TRITON_ENABLE_METRICS if (Metrics::Enabled()) { // Ensemble scheduler doesn't currently support response cache at top level. MetricModelReporter::Create( model_id, 1 /* model_version */, METRIC_REPORTER_ID_CPU, - false /* response_cache_enabled */, config.metric_tags(), + false /* response_cache_enabled */, is_decoupled, config.metric_tags(), &metric_reporter_); } #endif // TRITON_ENABLE_METRICS @@ -1485,7 +1486,7 @@ EnsembleScheduler::EnsembleScheduler( info_->ensemble_name_ = config.name(); // This config field is filled internally for ensemble models - info_->is_decoupled_ = config.model_transaction_policy().decoupled(); + info_->is_decoupled_ = is_decoupled; // field to check if response cache enabled in the ensemble model config. info_->is_cache_enabled_ = diff --git a/src/infer_response.cc b/src/infer_response.cc index 6eec9d1a3..3c2b39e17 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -38,14 +38,20 @@ namespace triton { namespace core { // Status InferenceResponseFactory::CreateResponse( - std::unique_ptr* response) const + std::unique_ptr* response) { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_)); + response_delegator_, response_cnt_ +#ifdef TRITON_ENABLE_METRICS + , + infer_start_ns_ +#endif // TRITON_ENABLE_METRICS + )); #ifdef TRITON_ENABLE_TRACING (*response)->SetTrace(trace_); #endif // TRITON_ENABLE_TRACING + response_cnt_++; return Status::Success; } @@ -72,10 +78,20 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator) + void(std::unique_ptr&&, const uint32_t)>& delegator, + uint64_t seq_num +#ifdef TRITON_ENABLE_METRICS + , + uint64_t infer_start_ns +#endif // TRITON_ENABLE_METRICS + ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), null_response_(false) + response_delegator_(delegator), seq_num_(seq_num), +#ifdef TRITON_ENABLE_METRICS + infer_start_ns_(infer_start_ns), +#endif // TRITON_ENABLE_METRICS + null_response_(false) { // If the allocator has a start_fn then invoke it. TRITONSERVER_ResponseAllocatorStartFn_t start_fn = allocator_->StartFn(); @@ -92,7 +108,10 @@ InferenceResponse::InferenceResponse( InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp) - : response_fn_(response_fn), response_userp_(response_userp), + : response_fn_(response_fn), response_userp_(response_userp), seq_num_(0), +#ifdef TRITON_ENABLE_METRICS + infer_start_ns_(0), +#endif // TRITON_ENABLE_METRICS null_response_(true) { } @@ -214,6 +233,10 @@ InferenceResponse::Send( TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT, "InferenceResponse Send"); #endif // TRITON_ENABLE_TRACING +#ifdef TRITON_ENABLE_METRICS + response->UpdateResponseMetrics(); +#endif // TRITON_ENABLE_METRICS + if (response->response_delegator_ != nullptr) { auto ldelegator = std::move(response->response_delegator_); ldelegator(std::move(response), flags); @@ -282,6 +305,24 @@ InferenceResponse::TraceOutputTensors( } #endif // TRITON_ENABLE_TRACING +#ifdef TRITON_ENABLE_METRICS +void +InferenceResponse::UpdateResponseMetrics() const +{ + if (model_ != nullptr && seq_num_ == 0) { + auto first_response_ns = + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); + if (auto reporter = model_->MetricReporter()) { + reporter->ObserveHistogram( + "first_response_histogram", + (first_response_ns - infer_start_ns_) / 1000000); + } + } +} +#endif // TRITON_ENABLE_METRICS + // // InferenceResponse::Output // diff --git a/src/infer_response.h b/src/infer_response.h index 612f8c1fe..88b158ab1 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -60,12 +60,17 @@ class InferenceResponseFactory { : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), response_delegator_(delegator), - is_cancelled_(false) + is_cancelled_(false), response_cnt_(0) #ifdef TRITON_ENABLE_STATS , response_stats_index_(0) #endif // TRITON_ENABLE_STATS { +#ifdef TRITON_ENABLE_METRICS 
+ infer_start_ns_ = std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +#endif // TRITON_ENABLE_METRICS } void Cancel() { is_cancelled_ = true; } @@ -84,7 +89,7 @@ class InferenceResponseFactory { } // Create a new response. - Status CreateResponse(std::unique_ptr* response) const; + Status CreateResponse(std::unique_ptr* response); // Send a "null" response with 'flags'. Status SendFlags(const uint32_t flags) const; @@ -134,6 +139,14 @@ class InferenceResponseFactory { std::atomic is_cancelled_; + // The number of responses created by this factory. + std::atomic response_cnt_; + +#ifdef TRITON_ENABLE_METRICS + // The start time of associate request in ns. + uint64_t infer_start_ns_; +#endif // TRITON_ENABLE_METRICS + #ifdef TRITON_ENABLE_TRACING // Inference trace associated with this response. std::shared_ptr trace_; @@ -247,7 +260,13 @@ class InferenceResponse { TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function&&, const uint32_t)>& delegator); + std::unique_ptr&&, const uint32_t)>& delegator, + uint64_t seq_num +#ifdef TRITON_ENABLE_METRICS + , + uint64_t infer_start_ns +#endif // TRITON_ENABLE_METRICS + ); // "null" InferenceResponse is a special instance of InferenceResponse which // contains minimal information for calling InferenceResponse::Send, @@ -324,6 +343,11 @@ class InferenceResponse { TRITONSERVER_InferenceTraceActivity activity, const std::string& msg); #endif // TRITON_ENABLE_TRACING + +#ifdef TRITON_ENABLE_METRICS + void UpdateResponseMetrics() const; +#endif // TRITON_ENABLE_METRICS + // The model associated with this factory. For normal // requests/responses this will always be defined and acts to keep // the model loaded as long as this factory is live. 
It may be @@ -358,6 +382,11 @@ class InferenceResponse { std::function&&, const uint32_t)> response_delegator_; + const uint64_t seq_num_; +#ifdef TRITON_ENABLE_METRICS + const uint64_t infer_start_ns_; +#endif // TRITON_ENABLE_METRICS + bool null_response_; #ifdef TRITON_ENABLE_TRACING diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc index 9dd9122be..75d2e87a5 100644 --- a/src/metric_model_reporter.cc +++ b/src/metric_model_reporter.cc @@ -41,7 +41,8 @@ namespace triton { namespace core { // MetricReporterConfig // void -MetricReporterConfig::ParseConfig(bool response_cache_enabled) +MetricReporterConfig::ParseConfig( + bool response_cache_enabled, bool is_decoupled) { // Global config only for now in config map auto metrics_config_map = Metrics::ConfigMap(); @@ -53,6 +54,10 @@ MetricReporterConfig::ParseConfig(bool response_cache_enabled) latency_counters_enabled_ = false; } + if (pair.first == "histogram_latencies" && pair.second == "false") { + latency_histograms_enabled_ = false; + } + if (pair.first == "summary_latencies" && pair.second == "true") { latency_summaries_enabled_ = true; } @@ -68,6 +73,7 @@ MetricReporterConfig::ParseConfig(bool response_cache_enabled) // Set flag to signal to stats aggregator if caching is enabled or not cache_enabled_ = response_cache_enabled; + is_decoupled_ = is_decoupled; } prometheus::Summary::Quantiles @@ -112,7 +118,7 @@ const std::map Status MetricModelReporter::Create( const ModelIdentifier& model_id, const int64_t model_version, - const int device, bool response_cache_enabled, + const int device, bool response_cache_enabled, bool is_decoupled, const triton::common::MetricTagsMap& model_tags, std::shared_ptr* metric_model_reporter) { @@ -141,25 +147,27 @@ MetricModelReporter::Create( } metric_model_reporter->reset(new MetricModelReporter( - model_id, model_version, device, response_cache_enabled, model_tags)); + model_id, model_version, device, response_cache_enabled, is_decoupled, + model_tags)); reporter_map.insert({hash_labels, *metric_model_reporter}); return Status::Success; } MetricModelReporter::MetricModelReporter( const ModelIdentifier& model_id, const int64_t model_version, - const int device, bool response_cache_enabled, + const int device, bool response_cache_enabled, bool is_decoupled, const triton::common::MetricTagsMap& model_tags) { std::map labels; GetMetricLabels(&labels, model_id, model_version, device, model_tags); // Parse metrics config to control metric setup and behavior - config_.ParseConfig(response_cache_enabled); + config_.ParseConfig(response_cache_enabled, is_decoupled); // Initialize families and metrics InitializeCounters(labels); InitializeGauges(labels); + InitializeHistograms(labels); InitializeSummaries(labels); } @@ -182,6 +190,14 @@ MetricModelReporter::~MetricModelReporter() } } + for (auto& iter : histogram_families_) { + const auto& name = iter.first; + auto family_ptr = iter.second; + if (family_ptr) { + family_ptr->Remove(histograms_[name]); + } + } + for (auto& iter : summary_families_) { const auto& name = iter.first; auto family_ptr = iter.second; @@ -261,6 +277,28 @@ MetricModelReporter::InitializeGauges( } } +void +MetricModelReporter::InitializeHistograms( + const std::map& labels) +{ + // Only create response metrics if decoupled model to reduce metric output + if (config_.latency_histograms_enabled_) { + if (config_.is_decoupled_) { + histogram_families_["first_response_histogram"] = + &Metrics::FamilyFirstResponseDuration(); + } + } + + for (auto& iter : 
histogram_families_) { + const auto& name = iter.first; + auto family_ptr = iter.second; + if (family_ptr) { + histograms_[name] = CreateMetric( + *family_ptr, labels, config_.buckets_); + } + } +} + void MetricModelReporter::InitializeSummaries( const std::map& labels) @@ -398,6 +436,23 @@ MetricModelReporter::DecrementGauge(const std::string& name, double value) IncrementGauge(name, -1 * value); } +void +MetricModelReporter::ObserveHistogram(const std::string& name, double value) +{ + auto iter = histograms_.find(name); + if (iter == histograms_.end()) { + // No histogram metric exists with this name + return; + } + + auto histogram = iter->second; + if (!histogram) { + // histogram is uninitialized/nullptr + return; + } + histogram->Observe(value); +} + void MetricModelReporter::ObserveSummary(const std::string& name, double value) { diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index 9378905ae..5ab9f0201 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -46,25 +46,30 @@ struct ModelIdentifier; struct MetricReporterConfig { #ifdef TRITON_ENABLE_METRICS // Parses Metrics::ConfigMap and sets fields if specified - void ParseConfig(bool response_cache_enabled); + void ParseConfig(bool response_cache_enabled, bool is_decoupled); // Parses pairs of quantiles "quantile1:error1, quantile2:error2, ..." // and overwrites quantiles_ field if successful. prometheus::Summary::Quantiles ParseQuantiles(std::string options); // Create and use Counters for per-model latency related metrics bool latency_counters_enabled_ = true; + // Create and use Histograms for per-model latency related metrics + bool latency_histograms_enabled_ = true; // Create and use Summaries for per-model latency related metrics bool latency_summaries_enabled_ = false; // Quantiles used for any summary metrics. Each pair of values represents // { quantile, error }. For example, {0.90, 0.01} means to compute the // 90th percentile with 1% error on either side, so the approximate 90th // percentile value will be between the 89th and 91st percentiles. + prometheus::Histogram::BucketBoundaries buckets_ = {10, 100, 500, 1000}; prometheus::Summary::Quantiles quantiles_ = { {0.5, 0.05}, {0.9, 0.01}, {0.95, 0.001}, {0.99, 0.001}, {0.999, 0.001}}; // Whether this reporter's model has caching enabled or not. // This helps handle infer_stats aggregation for summaries on cache misses. bool cache_enabled_ = false; + + bool is_decoupled_ = false; #endif // TRITON_ENABLE_METRICS }; @@ -77,7 +82,7 @@ class MetricModelReporter { static Status Create( const triton::core::ModelIdentifier& model_id, const int64_t model_version, const int device, - bool response_cache_enabled, + bool response_cache_enabled, bool is_decoupled, const triton::common::MetricTagsMap& model_tags, std::shared_ptr* metric_model_reporter); @@ -91,6 +96,8 @@ class MetricModelReporter { void IncrementGauge(const std::string& name, double value); // Decrease gauge by value. void DecrementGauge(const std::string& name, double value); + // Lookup histogram metric by name, and observe the value if it exists. + void ObserveHistogram(const std::string& name, double value); // Lookup summary metric by name, and observe the value if it exists. 
void ObserveSummary(const std::string& name, double value); @@ -99,7 +106,7 @@ class MetricModelReporter { private: MetricModelReporter( const ModelIdentifier& model_id, const int64_t model_version, - const int device, bool response_cache_enabled, + const int device, bool response_cache_enabled, bool is_decoupled, const triton::common::MetricTagsMap& model_tags); static void GetMetricLabels( @@ -114,6 +121,7 @@ class MetricModelReporter { void InitializeCounters(const std::map& labels); void InitializeGauges(const std::map& labels); + void InitializeHistograms(const std::map& labels); void InitializeSummaries(const std::map& labels); // Lookup gauge metric by name. Return gauge if found, nullptr otherwise. @@ -125,12 +133,15 @@ class MetricModelReporter { counter_families_; std::unordered_map*> gauge_families_; + std::unordered_map*> + histogram_families_; std::unordered_map*> summary_families_; // Metrics std::unordered_map counters_; std::unordered_map gauges_; + std::unordered_map histograms_; std::unordered_map summaries_; // Config diff --git a/src/metrics.cc b/src/metrics.cc index 28ab921ca..3922dbccb 100644 --- a/src/metrics.cc +++ b/src/metrics.cc @@ -1,4 +1,4 @@ -// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -109,6 +109,12 @@ Metrics::Metrics() "execution per-model.") .Register(*registry_)), + inf_first_response_histogram_ms_family_( + prometheus::BuildHistogram() + .Name("nv_inference_first_response_histogram_ms") + .Help("Duration from request to first response in milliseconds.") + .Register(*registry_)), + pinned_memory_pool_total_family_( prometheus::BuildGauge() .Name("nv_pinned_memory_pool_total_bytes") diff --git a/src/metrics.h b/src/metrics.h index 6d08ad168..833bab290 100644 --- a/src/metrics.h +++ b/src/metrics.h @@ -215,6 +215,12 @@ class Metrics { return GetSingleton()->inf_pending_request_count_family_; } + static prometheus::Family& + FamilyFirstResponseDuration() + { + return GetSingleton()->inf_first_response_histogram_ms_family_; + } + // Metric families of per-model response cache metrics // NOTE: These are used in infer_stats for perf_analyzer static prometheus::Family& FamilyCacheHitCount() @@ -300,6 +306,8 @@ class Metrics { prometheus::Family& inf_compute_output_duration_us_family_; prometheus::Family& inf_pending_request_count_family_; + prometheus::Family& + inf_first_response_histogram_ms_family_; prometheus::Family& pinned_memory_pool_total_family_; prometheus::Family& pinned_memory_pool_used_family_; diff --git a/src/model.cc b/src/model.cc index 9da281ed5..d15ca42d6 100644 --- a/src/model.cc +++ b/src/model.cc @@ -135,7 +135,7 @@ Model::Init(const bool is_config_provided) #ifdef TRITON_ENABLE_METRICS MetricModelReporter::Create( ModelId(), Version(), METRIC_REPORTER_ID_UTILITY, ResponseCacheEnabled(), - Config().metric_tags(), &reporter_); + isDecoupled(), Config().metric_tags(), &reporter_); #endif // TRITON_ENABLE_METRICS return Status::Success; diff --git a/src/model.h b/src/model.h index 286ca8db1..2374a81b3 100644 --- a/src/model.h +++ b/src/model.h @@ -148,6 +148,11 @@ class Model { return config_.response_cache().enable(); } + bool isDecoupled() const + { + return config_.model_transaction_policy().decoupled(); + } + // Get the number of required inputs size_t RequiredInputCount() const { 
return required_input_count_; } diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 8ffb85bd6..00a1826d8 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -42,11 +42,16 @@ namespace triton { namespace core { // Status InferenceResponseFactory::CreateResponse( - std::unique_ptr* response) const + std::unique_ptr* response) { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_)); + response_delegator_, response_cnt_ +#ifdef TRITON_ENABLE_METRICS + , + infer_start_ns_ +#endif // TRITON_ENABLE_METRICS + )); return Status::Success; } @@ -181,10 +186,20 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator) + void(std::unique_ptr&&, const uint32_t)>& delegator, + uint64_t seq_num +#ifdef TRITON_ENABLE_METRICS + , + uint64_t infer_start_ns +#endif // TRITON_ENABLE_METRICS + ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), null_response_(false) + response_delegator_(delegator), seq_num_(seq_num), +#ifdef TRITON_ENABLE_METRICS + infer_start_ns_(infer_start_ns), +#endif // TRITON_ENABLE_METRICS + null_response_(false) { // Skip allocator logic / references in unit test } From 8c9fe0c9d9eac48114ca5055eeb05eb3041e087f Mon Sep 17 00:00:00 2001 From: Yingge He Date: Mon, 14 Oct 2024 14:40:47 -0700 Subject: [PATCH 02/10] Minor update --- src/backend_model_instance.cc | 2 +- src/metrics.cc | 2 +- src/model.cc | 2 +- src/model.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index ba35e2fbd..87b602c82 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -192,7 +192,7 @@ TritonModelInstance::TritonModelInstance( model_->Server()->ResponseCacheEnabled(); MetricModelReporter::Create( model_->ModelId(), model_->Version(), id, response_cache_enabled, - model_->isDecoupled(), model_->Config().metric_tags(), &reporter_); + model_->IsDecoupled(), model_->Config().metric_tags(), &reporter_); } #endif // TRITON_ENABLE_METRICS } diff --git a/src/metrics.cc b/src/metrics.cc index 3922dbccb..eb0d17280 100644 --- a/src/metrics.cc +++ b/src/metrics.cc @@ -112,7 +112,7 @@ Metrics::Metrics() inf_first_response_histogram_ms_family_( prometheus::BuildHistogram() .Name("nv_inference_first_response_histogram_ms") - .Help("Duration from request to first response in milliseconds.") + .Help("Duration from request to first response in milliseconds") .Register(*registry_)), pinned_memory_pool_total_family_( diff --git a/src/model.cc b/src/model.cc index d15ca42d6..37fb87de2 100644 --- a/src/model.cc +++ b/src/model.cc @@ -135,7 +135,7 @@ Model::Init(const bool is_config_provided) #ifdef TRITON_ENABLE_METRICS MetricModelReporter::Create( ModelId(), Version(), METRIC_REPORTER_ID_UTILITY, ResponseCacheEnabled(), - isDecoupled(), Config().metric_tags(), &reporter_); + IsDecoupled(), Config().metric_tags(), 
&reporter_); #endif // TRITON_ENABLE_METRICS return Status::Success; diff --git a/src/model.h b/src/model.h index 2374a81b3..4781020af 100644 --- a/src/model.h +++ b/src/model.h @@ -148,7 +148,7 @@ class Model { return config_.response_cache().enable(); } - bool isDecoupled() const + bool IsDecoupled() const { return config_.model_transaction_policy().decoupled(); } From db990d308ccfbbd06d10c9e4a6eca73a1becfa47 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Mon, 14 Oct 2024 19:01:03 -0700 Subject: [PATCH 03/10] Reuse and rename response_stats_index_ --- src/backend_model_instance.cc | 3 +-- src/infer_response.cc | 26 +++++++++++++++++++------- src/infer_response.h | 27 +++++++++++++++------------ src/test/response_cache_test.cc | 18 ++++++++++++++---- 4 files changed, 49 insertions(+), 25 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index 87b602c82..feb20115e 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -1103,8 +1103,7 @@ TRITONBACKEND_ModelInstanceReportResponseStatistics( InferenceStatsAggregator* sa = rs->model_instance->Model()->MutableStatsAggregator(); - std::string key = - std::to_string((*rs->response_factory)->GetAndIncrementResponseIndex()); + std::string key = std::to_string((*rs->response_factory)->GetResponseIndex()); if (rs->error == nullptr) { if (rs->compute_output_start > 0) { diff --git a/src/infer_response.cc b/src/infer_response.cc index 3c2b39e17..5d5603bd5 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -42,7 +42,11 @@ InferenceResponseFactory::CreateResponse( { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_, response_cnt_ + response_delegator_ +#ifdef TRITON_ENABLE_STATS + , + response_index_++ +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS , infer_start_ns_ @@ -51,7 +55,6 @@ InferenceResponseFactory::CreateResponse( #ifdef TRITON_ENABLE_TRACING (*response)->SetTrace(trace_); #endif // TRITON_ENABLE_TRACING - response_cnt_++; return Status::Success; } @@ -78,8 +81,11 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_num + void(std::unique_ptr&&, const uint32_t)>& delegator +#ifdef TRITON_ENABLE_STATS + , + uint64_t index +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -87,7 +93,10 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), seq_num_(seq_num), + response_delegator_(delegator), +#ifdef TRITON_ENABLE_STATS + index_(index), +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS @@ -108,7 +117,10 @@ InferenceResponse::InferenceResponse( InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp) - : response_fn_(response_fn), response_userp_(response_userp), seq_num_(0), + : response_fn_(response_fn), response_userp_(response_userp), +#ifdef TRITON_ENABLE_STATS + index_(0), +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS infer_start_ns_(0), #endif // TRITON_ENABLE_METRICS @@ -309,7 +321,7 @@ InferenceResponse::TraceOutputTensors( void InferenceResponse::UpdateResponseMetrics() const 
{ - if (model_ != nullptr && seq_num_ == 0) { + if (model_ != nullptr && index_ == 0) { auto first_response_ns = std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) diff --git a/src/infer_response.h b/src/infer_response.h index 88b158ab1..ca2a9d4db 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -60,10 +60,10 @@ class InferenceResponseFactory { : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), response_delegator_(delegator), - is_cancelled_(false), response_cnt_(0) + is_cancelled_(false) #ifdef TRITON_ENABLE_STATS , - response_stats_index_(0) + response_index_(0) #endif // TRITON_ENABLE_STATS { #ifdef TRITON_ENABLE_METRICS @@ -104,8 +104,8 @@ class InferenceResponseFactory { #endif // TRITON_ENABLE_TRACING #ifdef TRITON_ENABLE_STATS - // Return the current response statistics index and increment it. - uint64_t GetAndIncrementResponseIndex() { return response_stats_index_++; }; + // Return the current response index. + uint64_t GetResponseIndex() { return response_index_; }; #endif // TRITON_ENABLE_STATS private: @@ -139,9 +139,6 @@ class InferenceResponseFactory { std::atomic is_cancelled_; - // The number of responses created by this factory. - std::atomic response_cnt_; - #ifdef TRITON_ENABLE_METRICS // The start time of associate request in ns. uint64_t infer_start_ns_; @@ -154,7 +151,7 @@ class InferenceResponseFactory { #ifdef TRITON_ENABLE_STATS // Number of response statistics reported. - std::atomic response_stats_index_; + std::atomic response_index_; #endif // TRITON_ENABLE_STATS }; @@ -259,9 +256,12 @@ class InferenceResponse { const ResponseAllocator* allocator, void* alloc_userp, TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, - const std::function&&, const uint32_t)>& delegator, - uint64_t seq_num + const std::function< + void(std::unique_ptr&&, const uint32_t)>& delegator +#ifdef TRITON_ENABLE_STATS + , + uint64_t index +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -382,7 +382,10 @@ class InferenceResponse { std::function&&, const uint32_t)> response_delegator_; - const uint64_t seq_num_; +#ifdef TRITON_ENABLE_STATS + const uint64_t index_; +#endif // TRITON_ENABLE_STATS + #ifdef TRITON_ENABLE_METRICS const uint64_t infer_start_ns_; #endif // TRITON_ENABLE_METRICS diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 00a1826d8..801bcef32 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -46,7 +46,11 @@ InferenceResponseFactory::CreateResponse( { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_, response_cnt_ + response_delegator_ +#ifdef TRITON_ENABLE_STATS + , + response_index_ +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS , infer_start_ns_ @@ -186,8 +190,11 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_num + void(std::unique_ptr&&, const uint32_t)>& delegator +#ifdef TRITON_ENABLE_STATS + , + uint64_t index +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -195,7 +202,10 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), 
response_userp_(response_userp), - response_delegator_(delegator), seq_num_(seq_num), + response_delegator_(delegator), +#ifdef TRITON_ENABLE_STATS + index_(index), +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS From 189cf6445f593348daeb323451d2bf5fe72fc870 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Tue, 15 Oct 2024 10:35:25 -0700 Subject: [PATCH 04/10] Revert "Reuse and rename response_stats_index_" This reverts commit db990d308ccfbbd06d10c9e4a6eca73a1becfa47. --- src/backend_model_instance.cc | 3 ++- src/infer_response.cc | 26 +++++++------------------- src/infer_response.h | 27 ++++++++++++--------------- src/test/response_cache_test.cc | 18 ++++-------------- 4 files changed, 25 insertions(+), 49 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index feb20115e..87b602c82 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -1103,7 +1103,8 @@ TRITONBACKEND_ModelInstanceReportResponseStatistics( InferenceStatsAggregator* sa = rs->model_instance->Model()->MutableStatsAggregator(); - std::string key = std::to_string((*rs->response_factory)->GetResponseIndex()); + std::string key = + std::to_string((*rs->response_factory)->GetAndIncrementResponseIndex()); if (rs->error == nullptr) { if (rs->compute_output_start > 0) { diff --git a/src/infer_response.cc b/src/infer_response.cc index 5d5603bd5..3c2b39e17 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -42,11 +42,7 @@ InferenceResponseFactory::CreateResponse( { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_ -#ifdef TRITON_ENABLE_STATS - , - response_index_++ -#endif // TRITON_ENABLE_STATS + response_delegator_, response_cnt_ #ifdef TRITON_ENABLE_METRICS , infer_start_ns_ @@ -55,6 +51,7 @@ InferenceResponseFactory::CreateResponse( #ifdef TRITON_ENABLE_TRACING (*response)->SetTrace(trace_); #endif // TRITON_ENABLE_TRACING + response_cnt_++; return Status::Success; } @@ -81,11 +78,8 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator -#ifdef TRITON_ENABLE_STATS - , - uint64_t index -#endif // TRITON_ENABLE_STATS + void(std::unique_ptr&&, const uint32_t)>& delegator, + uint64_t seq_num #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -93,10 +87,7 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), -#ifdef TRITON_ENABLE_STATS - index_(index), -#endif // TRITON_ENABLE_STATS + response_delegator_(delegator), seq_num_(seq_num), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS @@ -117,10 +108,7 @@ InferenceResponse::InferenceResponse( InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp) - : response_fn_(response_fn), response_userp_(response_userp), -#ifdef TRITON_ENABLE_STATS - index_(0), -#endif // TRITON_ENABLE_STATS + : response_fn_(response_fn), response_userp_(response_userp), seq_num_(0), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(0), #endif // TRITON_ENABLE_METRICS @@ -321,7 +309,7 @@ InferenceResponse::TraceOutputTensors( void InferenceResponse::UpdateResponseMetrics() const { - if 
(model_ != nullptr && index_ == 0) { + if (model_ != nullptr && seq_num_ == 0) { auto first_response_ns = std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) diff --git a/src/infer_response.h b/src/infer_response.h index ca2a9d4db..88b158ab1 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -60,10 +60,10 @@ class InferenceResponseFactory { : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), response_delegator_(delegator), - is_cancelled_(false) + is_cancelled_(false), response_cnt_(0) #ifdef TRITON_ENABLE_STATS , - response_index_(0) + response_stats_index_(0) #endif // TRITON_ENABLE_STATS { #ifdef TRITON_ENABLE_METRICS @@ -104,8 +104,8 @@ class InferenceResponseFactory { #endif // TRITON_ENABLE_TRACING #ifdef TRITON_ENABLE_STATS - // Return the current response index. - uint64_t GetResponseIndex() { return response_index_; }; + // Return the current response statistics index and increment it. + uint64_t GetAndIncrementResponseIndex() { return response_stats_index_++; }; #endif // TRITON_ENABLE_STATS private: @@ -139,6 +139,9 @@ class InferenceResponseFactory { std::atomic is_cancelled_; + // The number of responses created by this factory. + std::atomic response_cnt_; + #ifdef TRITON_ENABLE_METRICS // The start time of associate request in ns. uint64_t infer_start_ns_; @@ -151,7 +154,7 @@ class InferenceResponseFactory { #ifdef TRITON_ENABLE_STATS // Number of response statistics reported. - std::atomic response_index_; + std::atomic response_stats_index_; #endif // TRITON_ENABLE_STATS }; @@ -256,12 +259,9 @@ class InferenceResponse { const ResponseAllocator* allocator, void* alloc_userp, TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, - const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator -#ifdef TRITON_ENABLE_STATS - , - uint64_t index -#endif // TRITON_ENABLE_STATS + const std::function&&, const uint32_t)>& delegator, + uint64_t seq_num #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -382,10 +382,7 @@ class InferenceResponse { std::function&&, const uint32_t)> response_delegator_; -#ifdef TRITON_ENABLE_STATS - const uint64_t index_; -#endif // TRITON_ENABLE_STATS - + const uint64_t seq_num_; #ifdef TRITON_ENABLE_METRICS const uint64_t infer_start_ns_; #endif // TRITON_ENABLE_METRICS diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 801bcef32..00a1826d8 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -46,11 +46,7 @@ InferenceResponseFactory::CreateResponse( { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_ -#ifdef TRITON_ENABLE_STATS - , - response_index_ -#endif // TRITON_ENABLE_STATS + response_delegator_, response_cnt_ #ifdef TRITON_ENABLE_METRICS , infer_start_ns_ @@ -190,11 +186,8 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator -#ifdef TRITON_ENABLE_STATS - , - uint64_t index -#endif // TRITON_ENABLE_STATS + void(std::unique_ptr&&, const uint32_t)>& delegator, + uint64_t seq_num #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -202,10 +195,7 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), 
response_userp_(response_userp), - response_delegator_(delegator), -#ifdef TRITON_ENABLE_STATS - index_(index), -#endif // TRITON_ENABLE_STATS + response_delegator_(delegator), seq_num_(seq_num), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS From 51518d66961d708b4d51a3efed8458a03cbc7e34 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Tue, 15 Oct 2024 14:22:51 -0700 Subject: [PATCH 05/10] Update variable namings --- src/infer_response.cc | 8 ++++---- src/infer_response.h | 4 ++-- src/test/response_cache_test.cc | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index 3c2b39e17..71c70b26d 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -79,7 +79,7 @@ InferenceResponse::InferenceResponse( void* response_userp, const std::function< void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_num + uint64_t seq_idx #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -87,7 +87,7 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), seq_num_(seq_num), + response_delegator_(delegator), seq_idx_(seq_idx), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS @@ -108,7 +108,7 @@ InferenceResponse::InferenceResponse( InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp) - : response_fn_(response_fn), response_userp_(response_userp), seq_num_(0), + : response_fn_(response_fn), response_userp_(response_userp), seq_idx_(0), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(0), #endif // TRITON_ENABLE_METRICS @@ -309,7 +309,7 @@ InferenceResponse::TraceOutputTensors( void InferenceResponse::UpdateResponseMetrics() const { - if (model_ != nullptr && seq_num_ == 0) { + if (model_ != nullptr && seq_idx_ == 0) { auto first_response_ns = std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) diff --git a/src/infer_response.h b/src/infer_response.h index 88b158ab1..b19eb9a8f 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -261,7 +261,7 @@ class InferenceResponse { void* response_userp, const std::function&&, const uint32_t)>& delegator, - uint64_t seq_num + uint64_t seq_idx #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -382,7 +382,7 @@ class InferenceResponse { std::function&&, const uint32_t)> response_delegator_; - const uint64_t seq_num_; + const uint64_t seq_idx_; #ifdef TRITON_ENABLE_METRICS const uint64_t infer_start_ns_; #endif // TRITON_ENABLE_METRICS diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 00a1826d8..166989d46 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -187,7 +187,7 @@ InferenceResponse::InferenceResponse( void* response_userp, const std::function< void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_num + uint64_t seq_idx #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -195,7 +195,7 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), seq_num_(seq_num), + response_delegator_(delegator), seq_idx_(seq_idx), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // 
TRITON_ENABLE_METRICS From da632d1918c41ac4384010be462c325a1d69b519 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Wed, 16 Oct 2024 13:54:30 -0700 Subject: [PATCH 06/10] Fix the logic that finds the first response. --- src/infer_response.cc | 33 ++++++++++++++++----------------- src/infer_response.h | 26 ++++++++++++++++---------- src/test/response_cache_test.cc | 15 +++++++-------- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index 71c70b26d..092338d97 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -38,20 +38,19 @@ namespace triton { namespace core { // Status InferenceResponseFactory::CreateResponse( - std::unique_ptr* response) + std::unique_ptr* response) const { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_, response_cnt_ + response_delegator_ #ifdef TRITON_ENABLE_METRICS , - infer_start_ns_ + responses_sent_, infer_start_ns_ #endif // TRITON_ENABLE_METRICS )); #ifdef TRITON_ENABLE_TRACING (*response)->SetTrace(trace_); #endif // TRITON_ENABLE_TRACING - response_cnt_++; return Status::Success; } @@ -78,18 +77,18 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_idx + void(std::unique_ptr&&, const uint32_t)>& delegator #ifdef TRITON_ENABLE_METRICS , + std::shared_ptr> responses_sent, uint64_t infer_start_ns #endif // TRITON_ENABLE_METRICS ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), seq_idx_(seq_idx), + response_delegator_(delegator), #ifdef TRITON_ENABLE_METRICS - infer_start_ns_(infer_start_ns), + responses_sent_(responses_sent), infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS null_response_(false) { @@ -108,9 +107,9 @@ InferenceResponse::InferenceResponse( InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp) - : response_fn_(response_fn), response_userp_(response_userp), seq_idx_(0), + : response_fn_(response_fn), response_userp_(response_userp), #ifdef TRITON_ENABLE_METRICS - infer_start_ns_(0), + responses_sent_(nullptr), infer_start_ns_(0), #endif // TRITON_ENABLE_METRICS null_response_(true) { @@ -309,15 +308,15 @@ InferenceResponse::TraceOutputTensors( void InferenceResponse::UpdateResponseMetrics() const { - if (model_ != nullptr && seq_idx_ == 0) { - auto first_response_ns = - std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()) - .count(); + // Report inference to first response duration. 
+ if (model_ != nullptr && responses_sent_ != nullptr && + responses_sent_->fetch_add(1, std::memory_order_relaxed) == 0) { + auto now_ns = std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); if (auto reporter = model_->MetricReporter()) { reporter->ObserveHistogram( - "first_response_histogram", - (first_response_ns - infer_start_ns_) / 1000000); + "first_response_histogram", (now_ns - infer_start_ns_) / 1000000); } } } diff --git a/src/infer_response.h b/src/infer_response.h index b19eb9a8f..281af4740 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -60,7 +60,11 @@ class InferenceResponseFactory { : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), response_delegator_(delegator), - is_cancelled_(false), response_cnt_(0) + is_cancelled_(false) +#ifdef TRITON_ENABLE_METRICS + , + responses_sent_(0) +#endif // TRITON_ENABLE_METRICS #ifdef TRITON_ENABLE_STATS , response_stats_index_(0) @@ -89,7 +93,7 @@ class InferenceResponseFactory { } // Create a new response. - Status CreateResponse(std::unique_ptr* response); + Status CreateResponse(std::unique_ptr* response) const; // Send a "null" response with 'flags'. Status SendFlags(const uint32_t flags) const; @@ -139,10 +143,10 @@ class InferenceResponseFactory { std::atomic is_cancelled_; - // The number of responses created by this factory. - std::atomic response_cnt_; - #ifdef TRITON_ENABLE_METRICS + // Total number of responses sent created by this response factory. + std::shared_ptr> responses_sent_; + // The start time of associate request in ns. uint64_t infer_start_ns_; #endif // TRITON_ENABLE_METRICS @@ -259,11 +263,11 @@ class InferenceResponse { const ResponseAllocator* allocator, void* alloc_userp, TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, - const std::function&&, const uint32_t)>& delegator, - uint64_t seq_idx + const std::function< + void(std::unique_ptr&&, const uint32_t)>& delegator #ifdef TRITON_ENABLE_METRICS , + std::shared_ptr> responses_sent_, uint64_t infer_start_ns #endif // TRITON_ENABLE_METRICS ); @@ -343,7 +347,6 @@ class InferenceResponse { TRITONSERVER_InferenceTraceActivity activity, const std::string& msg); #endif // TRITON_ENABLE_TRACING - #ifdef TRITON_ENABLE_METRICS void UpdateResponseMetrics() const; #endif // TRITON_ENABLE_METRICS @@ -382,8 +385,11 @@ class InferenceResponse { std::function&&, const uint32_t)> response_delegator_; - const uint64_t seq_idx_; #ifdef TRITON_ENABLE_METRICS + // Total number of responses sent created by its response factory. + std::shared_ptr> responses_sent_; + + // The start time of associate request in ns. 
const uint64_t infer_start_ns_; #endif // TRITON_ENABLE_METRICS diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 166989d46..6d7d35db7 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -42,14 +42,14 @@ namespace triton { namespace core { // Status InferenceResponseFactory::CreateResponse( - std::unique_ptr* response) + std::unique_ptr* response) const { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_, response_cnt_ + response_delegator_ #ifdef TRITON_ENABLE_METRICS , - infer_start_ns_ + responses_sent_, infer_start_ns_ #endif // TRITON_ENABLE_METRICS )); @@ -186,18 +186,17 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_idx + void(std::unique_ptr&&, const uint32_t)>& delegator #ifdef TRITON_ENABLE_METRICS , - uint64_t infer_start_ns + uint64_t responses_sent, uint64_t infer_start_ns #endif // TRITON_ENABLE_METRICS ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), seq_idx_(seq_idx), + response_delegator_(delegator), #ifdef TRITON_ENABLE_METRICS - infer_start_ns_(infer_start_ns), + responses_sent_(responses_sent), infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS null_response_(false) { From 7f0612c8984c58459394acdc33a03bc07b4431cc Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 17 Oct 2024 11:01:21 -0700 Subject: [PATCH 07/10] Fix incorrect initialization of shared_ptr --- src/infer_response.cc | 3 ++- src/infer_response.h | 4 ++-- src/metric_model_reporter.h | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index 092338d97..1a3f85175 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -88,7 +88,8 @@ InferenceResponse::InferenceResponse( response_fn_(response_fn), response_userp_(response_userp), response_delegator_(delegator), #ifdef TRITON_ENABLE_METRICS - responses_sent_(responses_sent), infer_start_ns_(infer_start_ns), + responses_sent_(std::move(responses_sent)), + infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS null_response_(false) { diff --git a/src/infer_response.h b/src/infer_response.h index 281af4740..8d09285ee 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -63,7 +63,7 @@ class InferenceResponseFactory { is_cancelled_(false) #ifdef TRITON_ENABLE_METRICS , - responses_sent_(0) + responses_sent_(std::make_shared>(0)) #endif // TRITON_ENABLE_METRICS #ifdef TRITON_ENABLE_STATS , @@ -387,7 +387,7 @@ class InferenceResponse { #ifdef TRITON_ENABLE_METRICS // Total number of responses sent created by its response factory. - std::shared_ptr> responses_sent_; + const std::shared_ptr> responses_sent_; // The start time of associate request in ns. const uint64_t infer_start_ns_; diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index 5ab9f0201..faeb5f399 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -57,11 +57,13 @@ struct MetricReporterConfig { bool latency_histograms_enabled_ = true; // Create and use Summaries for per-model latency related metrics bool latency_summaries_enabled_ = false; + // Buckets used for any histogram metrics. 
Each value represents + // a bucket boundary. + prometheus::Histogram::BucketBoundaries buckets_ = {100, 500, 2000, 5000}; // Quantiles used for any summary metrics. Each pair of values represents // { quantile, error }. For example, {0.90, 0.01} means to compute the // 90th percentile with 1% error on either side, so the approximate 90th // percentile value will be between the 89th and 91st percentiles. - prometheus::Histogram::BucketBoundaries buckets_ = {10, 100, 500, 1000}; prometheus::Summary::Quantiles quantiles_ = { {0.5, 0.05}, {0.9, 0.01}, {0.95, 0.001}, {0.99, 0.001}, {0.999, 0.001}}; From 72d99d7b9901df342cef1edaf3e4389bd67209ad Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 17 Oct 2024 11:06:31 -0700 Subject: [PATCH 08/10] Disable histograms by default --- src/metric_model_reporter.cc | 4 ++-- src/metric_model_reporter.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc index 75d2e87a5..93de30796 100644 --- a/src/metric_model_reporter.cc +++ b/src/metric_model_reporter.cc @@ -54,8 +54,8 @@ MetricReporterConfig::ParseConfig( latency_counters_enabled_ = false; } - if (pair.first == "histogram_latencies" && pair.second == "false") { - latency_histograms_enabled_ = false; + if (pair.first == "histogram_latencies" && pair.second == "true") { + latency_histograms_enabled_ = true; } if (pair.first == "summary_latencies" && pair.second == "true") { diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index faeb5f399..6a1c148ce 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -54,7 +54,7 @@ struct MetricReporterConfig { // Create and use Counters for per-model latency related metrics bool latency_counters_enabled_ = true; // Create and use Histograms for per-model latency related metrics - bool latency_histograms_enabled_ = true; + bool latency_histograms_enabled_ = false; // Create and use Summaries for per-model latency related metrics bool latency_summaries_enabled_ = false; // Buckets used for any histogram metrics. Each value represents From b6b5af994d970f57e22c4db6d7834c15030499f2 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 17 Oct 2024 11:49:18 -0700 Subject: [PATCH 09/10] Minor fixes --- src/infer_response.h | 2 +- src/metric_model_reporter.h | 3 ++- src/model.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/infer_response.h b/src/infer_response.h index 8d09285ee..700d14a3c 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -267,7 +267,7 @@ class InferenceResponse { void(std::unique_ptr&&, const uint32_t)>& delegator #ifdef TRITON_ENABLE_METRICS , - std::shared_ptr> responses_sent_, + std::shared_ptr> responses_sent, uint64_t infer_start_ns #endif // TRITON_ENABLE_METRICS ); diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index 6a1c148ce..0a3018125 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -58,7 +58,8 @@ struct MetricReporterConfig { // Create and use Summaries for per-model latency related metrics bool latency_summaries_enabled_ = false; // Buckets used for any histogram metrics. Each value represents - // a bucket boundary. + // a bucket boundary. For example, {100, 500, 2000, 5000} are latencies + // in milliseconds in first_response_histogram. prometheus::Histogram::BucketBoundaries buckets_ = {100, 500, 2000, 5000}; // Quantiles used for any summary metrics. Each pair of values represents // { quantile, error }. 
For example, {0.90, 0.01} means to compute the diff --git a/src/model.h b/src/model.h index 4781020af..2acf63b11 100644 --- a/src/model.h +++ b/src/model.h @@ -148,6 +148,7 @@ class Model { return config_.response_cache().enable(); } + // Get whether the model is decoupled. bool IsDecoupled() const { return config_.model_transaction_policy().decoupled(); From fb87d2a80e42cb2881f8162c0e98da09cdf51c47 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 17 Oct 2024 16:14:04 -0700 Subject: [PATCH 10/10] Minor update --- src/infer_response.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index 1a3f85175..498036cde 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -317,7 +317,8 @@ InferenceResponse::UpdateResponseMetrics() const .count(); if (auto reporter = model_->MetricReporter()) { reporter->ObserveHistogram( - "first_response_histogram", (now_ns - infer_start_ns_) / 1000000); + "first_response_histogram", + (now_ns - infer_start_ns_) / NANOS_PER_MILLIS); } } }
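Illustrative usage sketch (not part of the patches above): the series registers a per-model nv_inference_first_response_histogram_ms family, creates the histogram only for decoupled models when the metrics config key "histogram_latencies" is set to "true" (patch 08), and observes the request-to-first-response gap in milliseconds from InferenceResponse::Send via UpdateResponseMetrics. The minimal, standalone prometheus-cpp program below mirrors that flow outside of Triton; the local registry, the "model"/"version" label values, and the timing placeholders are assumptions for illustration only, while the bucket boundaries {100, 500, 2000, 5000} are the defaults from MetricReporterConfig in patch 07.

    #include <chrono>
    #include <cstdint>
    #include <memory>
    #include <prometheus/histogram.h>
    #include <prometheus/registry.h>

    int main() {
      // Stand-in for Triton's singleton metrics registry (assumption for this sketch).
      auto registry = std::make_shared<prometheus::Registry>();

      // Family declaration analogous to inf_first_response_histogram_ms_family_.
      auto& family =
          prometheus::BuildHistogram()
              .Name("nv_inference_first_response_histogram_ms")
              .Help("Duration from request to first response in milliseconds")
              .Register(*registry);

      // Per-model metric analogous to histograms_["first_response_histogram"],
      // using the default bucket boundaries from MetricReporterConfig.
      auto& first_response_hist = family.Add(
          {{"model", "example_model"}, {"version", "1"}},
          prometheus::Histogram::BucketBoundaries{100, 500, 2000, 5000});

      // Analogous to the response factory constructor: capture the request
      // start time in nanoseconds when inference begins.
      const uint64_t infer_start_ns =
          std::chrono::duration_cast<std::chrono::nanoseconds>(
              std::chrono::steady_clock::now().time_since_epoch())
              .count();

      // ... model executes and produces its first response here ...

      // Analogous to UpdateResponseMetrics(): on the first response only,
      // observe the request-to-first-response duration converted to ms.
      const uint64_t first_response_ns =
          std::chrono::duration_cast<std::chrono::nanoseconds>(
              std::chrono::steady_clock::now().time_since_epoch())
              .count();
      first_response_hist.Observe((first_response_ns - infer_start_ns) / 1e6);
      return 0;
    }

On a server built with these patches, the histogram would presumably be enabled through the parsed metrics configuration (a histogram_latencies=true setting) and exported for each decoupled model as standard Prometheus _bucket/_sum/_count series.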