feat: Add histogram support and TTFT histogram metric #396
For an initial merge with histograms disabled by default, I think this is fine to go ahead with if we need to cherry-pick. However, please take note of the following:
I think this is relatively on the hot path and may impact latency, unlike our other inference metrics (`TRITONBACKEND_ReportStatistics`), which backends generally report after response sending (impacting throughput but not response latency). You can find perf numbers for each prometheus-cpp metric type at the bottom of the README here: https://github.com/jupp0r/prometheus-cpp
One individual observation on a single metric with a small number of buckets may not be impactful for one request, but as we scale up to high concurrency, more metrics, more buckets, etc., this could present a noticeable latency impact.
It would be good to do some light validation of overall latency before/after the feature via genai-perf, especially at high concurrency and when streaming many responses/tokens, as there can also be some synchronization in the interaction with the prometheus registry when many responses are in flight.

It would probably be advantageous to do the actual prometheus registry interaction after sending the response if possible: on the hot path, only do the bare minimum of determining whether we should report metrics (check whether this is the first response and record the latency), then use that information to report the metric after initiating the response send.