From 1d18c2beb8360196ca526f6f05130107c7fb9dcf Mon Sep 17 00:00:00 2001
From: Yingge He <157551214+yinggeh@users.noreply.github.com>
Date: Tue, 5 Nov 2024 17:21:20 -0800
Subject: [PATCH] feat: Per-model customization of histogram metric buckets
 (#405)

---
 src/backend_model_instance.cc                |  3 +-
 src/constants.h                              |  1 +
 src/ensemble_scheduler/ensemble_scheduler.cc |  2 +-
 src/infer_response.cc                        |  2 +-
 src/metric_model_reporter.cc                 | 39 ++++++++++++++----
 src/metric_model_reporter.h                  | 25 +++++++++---
 src/metrics.cc                               | 14 ++++---
 src/metrics.h                                |  6 ++-
 src/model.cc                                 |  3 +-
 src/model_config_utils.cc                    | 43 +++++++++++++++++++-
 10 files changed, 113 insertions(+), 25 deletions(-)

diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc
index 87b602c82..080091708 100644
--- a/src/backend_model_instance.cc
+++ b/src/backend_model_instance.cc
@@ -192,7 +192,8 @@ TritonModelInstance::TritonModelInstance(
         model_->Server()->ResponseCacheEnabled();
     MetricModelReporter::Create(
         model_->ModelId(), model_->Version(), id, response_cache_enabled,
-        model_->IsDecoupled(), model_->Config().metric_tags(), &reporter_);
+        model_->IsDecoupled(), model_->Config().metric_tags(),
+        model_->Config().model_metrics(), &reporter_);
   }
 #endif  // TRITON_ENABLE_METRICS
 }
diff --git a/src/constants.h b/src/constants.h
index 21943f5a2..8415f8ee9 100644
--- a/src/constants.h
+++ b/src/constants.h
@@ -86,6 +86,7 @@ constexpr char kInitialStateFolder[] = "initial_state";
 // Metric names
 constexpr char kPendingRequestMetric[] = "inf_pending_request_count";
 constexpr char kModelLoadTimeMetric[] = "model_load_time";
+constexpr char kFirstResponseHistogram[] = "first_response_histogram";
 
 constexpr uint64_t NANOS_PER_SECOND = 1000000000;
 constexpr uint64_t NANOS_PER_MILLIS = 1000000;
diff --git a/src/ensemble_scheduler/ensemble_scheduler.cc b/src/ensemble_scheduler/ensemble_scheduler.cc
index 64f118144..90f1e0be8 100644
--- a/src/ensemble_scheduler/ensemble_scheduler.cc
+++ b/src/ensemble_scheduler/ensemble_scheduler.cc
@@ -1477,7 +1477,7 @@ EnsembleScheduler::EnsembleScheduler(
     MetricModelReporter::Create(
         model_id, 1 /* model_version */, METRIC_REPORTER_ID_CPU,
         false /* response_cache_enabled */, is_decoupled, config.metric_tags(),
-        &metric_reporter_);
+        config.model_metrics(), &metric_reporter_);
   }
 #endif  // TRITON_ENABLE_METRICS
 
diff --git a/src/infer_response.cc b/src/infer_response.cc
index 498036cde..4096e152b 100644
--- a/src/infer_response.cc
+++ b/src/infer_response.cc
@@ -317,7 +317,7 @@ InferenceResponse::UpdateResponseMetrics() const
                       .count();
     if (auto reporter = model_->MetricReporter()) {
       reporter->ObserveHistogram(
-          "first_response_histogram",
+          kFirstResponseHistogram,
           (now_ns - infer_start_ns_) / NANOS_PER_MILLIS);
     }
   }
diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc
index a10db8f64..24ce0551f 100644
--- a/src/metric_model_reporter.cc
+++ b/src/metric_model_reporter.cc
@@ -42,7 +42,8 @@ namespace triton { namespace core {
 //
 void
 MetricReporterConfig::ParseConfig(
-    bool response_cache_enabled, bool is_decoupled)
+    bool response_cache_enabled, bool is_decoupled,
+    const inference::ModelMetrics& model_metrics)
 {
   // Global config only for now in config map
   auto metrics_config_map = Metrics::ConfigMap();
@@ -74,6 +75,26 @@ MetricReporterConfig::ParseConfig(
   // Set flag to signal to stats aggregator if caching is enabled or not
   cache_enabled_ = response_cache_enabled;
   is_decoupled_ = is_decoupled;
+
+  // Override default histogram options if set in model_metrics.
+  for (const auto& metric_control : model_metrics.metric_control()) {
+    const std::string& family_name =
+        metric_control.metric_identifier().family();
+
+    // If family name exists, override with new options.
+    if (metric_map_.find(family_name) != metric_map_.end()) {
+      // Copy protobuf RepeatedField to std::vector
+      const auto& buckets_proto = metric_control.histogram_options().buckets();
+      const prometheus::Histogram::BucketBoundaries buckets(
+          buckets_proto.begin(), buckets_proto.end());
+      histogram_options_[metric_map_.at(family_name)] = buckets;
+    } else {
+      // metric_control config may be extended to support backend metrics.
+      LOG_WARNING << "Metric family '" << family_name
+                  << "' in 'metric_identifier' is not a customizable metric in "
+                     "Triton core.";
+    }
+  }
 }
 
 prometheus::Summary::Quantiles
@@ -120,6 +141,7 @@ MetricModelReporter::Create(
     const ModelIdentifier& model_id, const int64_t model_version,
     const int device, bool response_cache_enabled, bool is_decoupled,
     const triton::common::MetricTagsMap& model_tags,
+    const inference::ModelMetrics& model_metrics,
     std::shared_ptr<MetricModelReporter>* metric_model_reporter)
 {
   static std::mutex mtx;
@@ -148,7 +170,7 @@ MetricModelReporter::Create(
 
   metric_model_reporter->reset(new MetricModelReporter(
       model_id, model_version, device, response_cache_enabled, is_decoupled,
-      model_tags));
+      model_tags, model_metrics));
   reporter_map.insert({hash_labels, *metric_model_reporter});
   return Status::Success;
 }
@@ -156,13 +178,14 @@ MetricModelReporter::Create(
 MetricModelReporter::MetricModelReporter(
     const ModelIdentifier& model_id, const int64_t model_version,
     const int device, bool response_cache_enabled, bool is_decoupled,
-    const triton::common::MetricTagsMap& model_tags)
+    const triton::common::MetricTagsMap& model_tags,
+    const inference::ModelMetrics& model_metrics)
 {
   std::map<std::string, std::string> labels;
   GetMetricLabels(&labels, model_id, model_version, device, model_tags);
 
   // Parse metrics config to control metric setup and behavior
-  config_.ParseConfig(response_cache_enabled, is_decoupled);
+  config_.ParseConfig(response_cache_enabled, is_decoupled, model_metrics);
 
   // Initialize families and metrics
   InitializeCounters(labels);
@@ -282,10 +305,11 @@ void
 MetricModelReporter::InitializeHistograms(
     const std::map<std::string, std::string>& labels)
 {
+  // Update MetricReporterConfig::metric_map_ for new histograms.
   // Only create response metrics if decoupled model to reduce metric output
   if (config_.latency_histograms_enabled_) {
     if (config_.is_decoupled_) {
-      histogram_families_["first_response_histogram"] =
+      histogram_families_[kFirstResponseHistogram] =
           &Metrics::FamilyFirstResponseDuration();
     }
   }
@@ -294,8 +318,9 @@ MetricModelReporter::InitializeHistograms(
     const auto& name = iter.first;
     auto family_ptr = iter.second;
     if (family_ptr) {
-      histograms_[name] = CreateMetric<prometheus::Histogram>(
-          *family_ptr, labels, config_.buckets_);
+      const auto& buckets = config_.histogram_options_[name];
+      histograms_[name] =
+          CreateMetric<prometheus::Histogram>(*family_ptr, labels, buckets);
     }
   }
 }
diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h
index 236bc8f5f..6cdb9f14f 100644
--- a/src/metric_model_reporter.h
+++ b/src/metric_model_reporter.h
@@ -46,7 +46,9 @@ struct ModelIdentifier;
 struct MetricReporterConfig {
 #ifdef TRITON_ENABLE_METRICS
   // Parses Metrics::ConfigMap and sets fields if specified
-  void ParseConfig(bool response_cache_enabled, bool is_decoupled);
+  void ParseConfig(
+      bool response_cache_enabled, bool is_decoupled,
+      const inference::ModelMetrics& model_metrics);
   // Parses pairs of quantiles "quantile1:error1, quantile2:error2, ..."
   // and overwrites quantiles_ field if successful.
   prometheus::Summary::Quantiles ParseQuantiles(std::string options);
@@ -57,10 +59,12 @@ struct MetricReporterConfig {
   bool latency_histograms_enabled_ = false;
   // Create and use Summaries for per-model latency related metrics
   bool latency_summaries_enabled_ = false;
-  // Buckets used for any histogram metrics. Each value represents
-  // a bucket boundary. For example, {100, 500, 2000, 5000} are latencies
+  // Default bucket boundaries used for each histogram metric. Each value
+  // represents a boundary. For example, {100, 500, 2000, 5000} are latencies.
   // in milliseconds in first_response_histogram.
-  prometheus::Histogram::BucketBoundaries buckets_ = {100, 500, 2000, 5000};
+  std::unordered_map<std::string, prometheus::Histogram::BucketBoundaries>
+      histogram_options_ = {{kFirstResponseHistogram, {100, 500, 2000, 5000}}};
+
   // Quantiles used for any summary metrics. Each pair of values represents
   // { quantile, error }. For example, {0.90, 0.01} means to compute the
   // 90th percentile with 1% error on either side, so the approximate 90th
@@ -73,6 +77,14 @@ struct MetricReporterConfig {
   bool cache_enabled_ = false;
 
   bool is_decoupled_ = false;
+
+ private:
+  // Maps the metric family fullname to its lookup key. This field is required
+  // because the users are expected to configure metric configuration
+  // "ModelMetrics" with the full name displayed from metrics reporting while a
+  // different name is used internally. All new histograms must update the map.
+  const std::unordered_map<std::string, std::string> metric_map_ = {
+      {"nv_inference_first_response_histogram_ms", kFirstResponseHistogram}};
 #endif  // TRITON_ENABLE_METRICS
 };
 
@@ -86,7 +98,9 @@ class MetricModelReporter {
       const triton::core::ModelIdentifier& model_id,
       const int64_t model_version, const int device,
       bool response_cache_enabled, bool is_decoupled,
+      // FIXME: [DLIS-7497] Merge model_tags with model_metrics
       const triton::common::MetricTagsMap& model_tags,
+      const inference::ModelMetrics& model_metrics,
       std::shared_ptr<MetricModelReporter>* metric_model_reporter);
 
   ~MetricModelReporter();
@@ -112,7 +126,8 @@ class MetricModelReporter {
   MetricModelReporter(
       const ModelIdentifier& model_id, const int64_t model_version,
       const int device, bool response_cache_enabled, bool is_decoupled,
-      const triton::common::MetricTagsMap& model_tags);
+      const triton::common::MetricTagsMap& model_tags,
+      const inference::ModelMetrics& model_metrics);
 
   static void GetMetricLabels(
       std::map<std::string, std::string>* labels,
diff --git a/src/metrics.cc b/src/metrics.cc
index f9b141b73..ed66d7781 100644
--- a/src/metrics.cc
+++ b/src/metrics.cc
@@ -109,12 +109,6 @@ Metrics::Metrics()
                     "execution per-model.")
               .Register(*registry_)),
 
-      inf_first_response_histogram_ms_family_(
-          prometheus::BuildHistogram()
-              .Name("nv_inference_first_response_histogram_ms")
-              .Help("Duration from request to first response in milliseconds")
-              .Register(*registry_)),
-
       model_load_time_family_(prometheus::BuildGauge()
                                   .Name("nv_model_load_duration_secs")
                                   .Help("Model load time in seconds")
@@ -155,6 +149,14 @@ Metrics::Metrics()
                     "microseconds")
               .Register(*registry_)),
 
+      // Histograms
+      // New histograms must be added to MetricReporterConfig.metric_map_
+      inf_first_response_histogram_ms_family_(
+          prometheus::BuildHistogram()
+              .Name("nv_inference_first_response_histogram_ms")
+              .Help("Duration from request to first response in milliseconds")
+              .Register(*registry_)),
+
       // Summaries
       inf_request_summary_us_family_(
           prometheus::BuildSummary()
diff --git a/src/metrics.h b/src/metrics.h
index af983cdca..ac04ebebc 100644
--- a/src/metrics.h
+++ b/src/metrics.h
@@ -312,8 +312,6 @@ class Metrics {
   prometheus::Family<prometheus::Counter>&
       inf_compute_output_duration_us_family_;
   prometheus::Family<prometheus::Gauge>& inf_pending_request_count_family_;
-  prometheus::Family<prometheus::Histogram>&
-      inf_first_response_histogram_ms_family_;
   prometheus::Family<prometheus::Gauge>& model_load_time_family_;
 
   prometheus::Family<prometheus::Gauge>& pinned_memory_pool_total_family_;
@@ -330,6 +328,10 @@ class Metrics {
   prometheus::Family<prometheus::Counter>& cache_num_misses_model_family_;
   prometheus::Family<prometheus::Counter>& cache_miss_duration_us_model_family_;
 
+  // Histograms
+  prometheus::Family<prometheus::Histogram>&
+      inf_first_response_histogram_ms_family_;
+
   // Summaries
   prometheus::Family<prometheus::Summary>& inf_request_summary_us_family_;
   prometheus::Family<prometheus::Summary>& inf_queue_summary_us_family_;
diff --git a/src/model.cc b/src/model.cc
index 37fb87de2..9cc3a724b 100644
--- a/src/model.cc
+++ b/src/model.cc
@@ -135,7 +135,8 @@ Model::Init(const bool is_config_provided)
 #ifdef TRITON_ENABLE_METRICS
   MetricModelReporter::Create(
       ModelId(), Version(), METRIC_REPORTER_ID_UTILITY, ResponseCacheEnabled(),
-      IsDecoupled(), Config().metric_tags(), &reporter_);
+      IsDecoupled(), Config().metric_tags(), Config().model_metrics(),
+      &reporter_);
 #endif  // TRITON_ENABLE_METRICS
 
   return Status::Success;
diff --git a/src/model_config_utils.cc b/src/model_config_utils.cc
index 4fd1e1be0..c5ab257e9 100644
--- a/src/model_config_utils.cc
+++ b/src/model_config_utils.cc
@@ -446,6 +446,39 @@ ValidateNonLinearFormatIO(
   return Status::Success;
 }
 
+// Helper function to validate that model_metrics contains all required data.
+Status
+ValidateModelMetrics(const inference::ModelMetrics& model_metrics)
+{
+  for (const auto& metric_control : model_metrics.metric_control()) {
+    if (!metric_control.has_metric_identifier()) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "metric control must specify 'metric_identifier'");
+    }
+
+    if (metric_control.metric_identifier().family().empty()) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "metric identifier must specify non-empty 'family'");
+    }
+
+    if (!metric_control.has_histogram_options()) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "metric control must specify 'histogram_options'");
+    }
+
+    if (metric_control.histogram_options().buckets_size() == 0) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "histogram options must specify non-empty 'buckets'");
+    }
+  }
+
+  return Status::Success;
+}
+
 }  // namespace
 
 Status
@@ -1591,7 +1624,7 @@ ValidateModelConfig(
     }
   }
 
-  // If ensemble scheduling is specified, validate it.  Otherwise,
+  // If ensemble scheduling is specified, validate it. Otherwise,
   // must validate platform and instance_group
   if (config.has_ensemble_scheduling()) {
 #ifdef TRITON_ENABLE_ENSEMBLE
@@ -1620,6 +1653,14 @@ ValidateModelConfig(
             " cache.");
   }
 
+  // If model_metric is specified, validate it.
+  if (config.has_model_metrics()) {
+#ifdef TRITON_ENABLE_METRICS
+    RETURN_IF_ERROR(ValidateModelMetrics(config.model_metrics()));
+#else
+    return Status(Status::Code::INVALID_ARG, "metrics not supported");
+#endif  // TRITON_ENABLE_METRICS
+  }
   return Status::Success;
 }