From 1d18c2beb8360196ca526f6f05130107c7fb9dcf Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Tue, 5 Nov 2024 17:21:20 -0800 Subject: [PATCH] feat: Per-model customization of histogram metric buckets (#405) --- src/backend_model_instance.cc | 3 +- src/constants.h | 1 + src/ensemble_scheduler/ensemble_scheduler.cc | 2 +- src/infer_response.cc | 2 +- src/metric_model_reporter.cc | 39 ++++++++++++++---- src/metric_model_reporter.h | 25 +++++++++--- src/metrics.cc | 14 ++++--- src/metrics.h | 6 ++- src/model.cc | 3 +- src/model_config_utils.cc | 43 +++++++++++++++++++- 10 files changed, 113 insertions(+), 25 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index 87b602c82..080091708 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -192,7 +192,8 @@ TritonModelInstance::TritonModelInstance( model_->Server()->ResponseCacheEnabled(); MetricModelReporter::Create( model_->ModelId(), model_->Version(), id, response_cache_enabled, - model_->IsDecoupled(), model_->Config().metric_tags(), &reporter_); + model_->IsDecoupled(), model_->Config().metric_tags(), + model_->Config().model_metrics(), &reporter_); } #endif // TRITON_ENABLE_METRICS } diff --git a/src/constants.h b/src/constants.h index 21943f5a2..8415f8ee9 100644 --- a/src/constants.h +++ b/src/constants.h @@ -86,6 +86,7 @@ constexpr char kInitialStateFolder[] = "initial_state"; // Metric names constexpr char kPendingRequestMetric[] = "inf_pending_request_count"; constexpr char kModelLoadTimeMetric[] = "model_load_time"; +constexpr char kFirstResponseHistogram[] = "first_response_histogram"; constexpr uint64_t NANOS_PER_SECOND = 1000000000; constexpr uint64_t NANOS_PER_MILLIS = 1000000; diff --git a/src/ensemble_scheduler/ensemble_scheduler.cc b/src/ensemble_scheduler/ensemble_scheduler.cc index 64f118144..90f1e0be8 100644 --- a/src/ensemble_scheduler/ensemble_scheduler.cc +++ b/src/ensemble_scheduler/ensemble_scheduler.cc @@ -1477,7 +1477,7 @@ EnsembleScheduler::EnsembleScheduler( MetricModelReporter::Create( model_id, 1 /* model_version */, METRIC_REPORTER_ID_CPU, false /* response_cache_enabled */, is_decoupled, config.metric_tags(), - &metric_reporter_); + config.model_metrics(), &metric_reporter_); } #endif // TRITON_ENABLE_METRICS diff --git a/src/infer_response.cc b/src/infer_response.cc index 498036cde..4096e152b 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -317,7 +317,7 @@ InferenceResponse::UpdateResponseMetrics() const .count(); if (auto reporter = model_->MetricReporter()) { reporter->ObserveHistogram( - "first_response_histogram", + kFirstResponseHistogram, (now_ns - infer_start_ns_) / NANOS_PER_MILLIS); } } diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc index a10db8f64..24ce0551f 100644 --- a/src/metric_model_reporter.cc +++ b/src/metric_model_reporter.cc @@ -42,7 +42,8 @@ namespace triton { namespace core { // void MetricReporterConfig::ParseConfig( - bool response_cache_enabled, bool is_decoupled) + bool response_cache_enabled, bool is_decoupled, + const inference::ModelMetrics& model_metrics) { // Global config only for now in config map auto metrics_config_map = Metrics::ConfigMap(); @@ -74,6 +75,26 @@ MetricReporterConfig::ParseConfig( // Set flag to signal to stats aggregator if caching is enabled or not cache_enabled_ = response_cache_enabled; is_decoupled_ = is_decoupled; + + // Override default histogram options if set in model_metrics. + for (const auto& metric_control : model_metrics.metric_control()) { + const std::string& family_name = + metric_control.metric_identifier().family(); + + // If family name exists, override with new options. + if (metric_map_.find(family_name) != metric_map_.end()) { + // Copy protobuf RepeatedField to std::vector + const auto& buckets_proto = metric_control.histogram_options().buckets(); + const prometheus::Histogram::BucketBoundaries buckets( + buckets_proto.begin(), buckets_proto.end()); + histogram_options_[metric_map_.at(family_name)] = buckets; + } else { + // metric_control config may be extended to support backend metrics. + LOG_WARNING << "Metric family '" << family_name + << "' in 'metric_identifier' is not a customizable metric in " + "Triton core."; + } + } } prometheus::Summary::Quantiles @@ -120,6 +141,7 @@ MetricModelReporter::Create( const ModelIdentifier& model_id, const int64_t model_version, const int device, bool response_cache_enabled, bool is_decoupled, const triton::common::MetricTagsMap& model_tags, + const inference::ModelMetrics& model_metrics, std::shared_ptr* metric_model_reporter) { static std::mutex mtx; @@ -148,7 +170,7 @@ MetricModelReporter::Create( metric_model_reporter->reset(new MetricModelReporter( model_id, model_version, device, response_cache_enabled, is_decoupled, - model_tags)); + model_tags, model_metrics)); reporter_map.insert({hash_labels, *metric_model_reporter}); return Status::Success; } @@ -156,13 +178,14 @@ MetricModelReporter::Create( MetricModelReporter::MetricModelReporter( const ModelIdentifier& model_id, const int64_t model_version, const int device, bool response_cache_enabled, bool is_decoupled, - const triton::common::MetricTagsMap& model_tags) + const triton::common::MetricTagsMap& model_tags, + const inference::ModelMetrics& model_metrics) { std::map labels; GetMetricLabels(&labels, model_id, model_version, device, model_tags); // Parse metrics config to control metric setup and behavior - config_.ParseConfig(response_cache_enabled, is_decoupled); + config_.ParseConfig(response_cache_enabled, is_decoupled, model_metrics); // Initialize families and metrics InitializeCounters(labels); @@ -282,10 +305,11 @@ void MetricModelReporter::InitializeHistograms( const std::map& labels) { + // Update MetricReporterConfig::metric_map_ for new histograms. // Only create response metrics if decoupled model to reduce metric output if (config_.latency_histograms_enabled_) { if (config_.is_decoupled_) { - histogram_families_["first_response_histogram"] = + histogram_families_[kFirstResponseHistogram] = &Metrics::FamilyFirstResponseDuration(); } } @@ -294,8 +318,9 @@ MetricModelReporter::InitializeHistograms( const auto& name = iter.first; auto family_ptr = iter.second; if (family_ptr) { - histograms_[name] = CreateMetric( - *family_ptr, labels, config_.buckets_); + const auto& buckets = config_.histogram_options_[name]; + histograms_[name] = + CreateMetric(*family_ptr, labels, buckets); } } } diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index 236bc8f5f..6cdb9f14f 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -46,7 +46,9 @@ struct ModelIdentifier; struct MetricReporterConfig { #ifdef TRITON_ENABLE_METRICS // Parses Metrics::ConfigMap and sets fields if specified - void ParseConfig(bool response_cache_enabled, bool is_decoupled); + void ParseConfig( + bool response_cache_enabled, bool is_decoupled, + const inference::ModelMetrics& model_metrics); // Parses pairs of quantiles "quantile1:error1, quantile2:error2, ..." // and overwrites quantiles_ field if successful. prometheus::Summary::Quantiles ParseQuantiles(std::string options); @@ -57,10 +59,12 @@ struct MetricReporterConfig { bool latency_histograms_enabled_ = false; // Create and use Summaries for per-model latency related metrics bool latency_summaries_enabled_ = false; - // Buckets used for any histogram metrics. Each value represents - // a bucket boundary. For example, {100, 500, 2000, 5000} are latencies + // Default bucket boundaries used for each histogram metric. Each value + // represents a boundary. For example, {100, 500, 2000, 5000} are latencies. // in milliseconds in first_response_histogram. - prometheus::Histogram::BucketBoundaries buckets_ = {100, 500, 2000, 5000}; + std::unordered_map + histogram_options_ = {{kFirstResponseHistogram, {100, 500, 2000, 5000}}}; + // Quantiles used for any summary metrics. Each pair of values represents // { quantile, error }. For example, {0.90, 0.01} means to compute the // 90th percentile with 1% error on either side, so the approximate 90th @@ -73,6 +77,14 @@ struct MetricReporterConfig { bool cache_enabled_ = false; bool is_decoupled_ = false; + + private: + // Maps the metric family fullname to its lookup key. This field is required + // because the users are expected to configure metric configuration + // "ModelMetrics" with the full name displayed from metrics reporting while a + // different name is used internally. All new histograms must update the map. + const std::unordered_map metric_map_ = { + {"nv_inference_first_response_histogram_ms", kFirstResponseHistogram}}; #endif // TRITON_ENABLE_METRICS }; @@ -86,7 +98,9 @@ class MetricModelReporter { const triton::core::ModelIdentifier& model_id, const int64_t model_version, const int device, bool response_cache_enabled, bool is_decoupled, + // FIXME: [DLIS-7497] Merge model_tags with model_metrics const triton::common::MetricTagsMap& model_tags, + const inference::ModelMetrics& model_metrics, std::shared_ptr* metric_model_reporter); ~MetricModelReporter(); @@ -112,7 +126,8 @@ class MetricModelReporter { MetricModelReporter( const ModelIdentifier& model_id, const int64_t model_version, const int device, bool response_cache_enabled, bool is_decoupled, - const triton::common::MetricTagsMap& model_tags); + const triton::common::MetricTagsMap& model_tags, + const inference::ModelMetrics& model_metrics); static void GetMetricLabels( std::map* labels, diff --git a/src/metrics.cc b/src/metrics.cc index f9b141b73..ed66d7781 100644 --- a/src/metrics.cc +++ b/src/metrics.cc @@ -109,12 +109,6 @@ Metrics::Metrics() "execution per-model.") .Register(*registry_)), - inf_first_response_histogram_ms_family_( - prometheus::BuildHistogram() - .Name("nv_inference_first_response_histogram_ms") - .Help("Duration from request to first response in milliseconds") - .Register(*registry_)), - model_load_time_family_(prometheus::BuildGauge() .Name("nv_model_load_duration_secs") .Help("Model load time in seconds") @@ -155,6 +149,14 @@ Metrics::Metrics() "microseconds") .Register(*registry_)), + // Histograms + // New histograms must be added to MetricReporterConfig.metric_map_ + inf_first_response_histogram_ms_family_( + prometheus::BuildHistogram() + .Name("nv_inference_first_response_histogram_ms") + .Help("Duration from request to first response in milliseconds") + .Register(*registry_)), + // Summaries inf_request_summary_us_family_( prometheus::BuildSummary() diff --git a/src/metrics.h b/src/metrics.h index af983cdca..ac04ebebc 100644 --- a/src/metrics.h +++ b/src/metrics.h @@ -312,8 +312,6 @@ class Metrics { prometheus::Family& inf_compute_output_duration_us_family_; prometheus::Family& inf_pending_request_count_family_; - prometheus::Family& - inf_first_response_histogram_ms_family_; prometheus::Family& model_load_time_family_; prometheus::Family& pinned_memory_pool_total_family_; @@ -330,6 +328,10 @@ class Metrics { prometheus::Family& cache_num_misses_model_family_; prometheus::Family& cache_miss_duration_us_model_family_; + // Histograms + prometheus::Family& + inf_first_response_histogram_ms_family_; + // Summaries prometheus::Family& inf_request_summary_us_family_; prometheus::Family& inf_queue_summary_us_family_; diff --git a/src/model.cc b/src/model.cc index 37fb87de2..9cc3a724b 100644 --- a/src/model.cc +++ b/src/model.cc @@ -135,7 +135,8 @@ Model::Init(const bool is_config_provided) #ifdef TRITON_ENABLE_METRICS MetricModelReporter::Create( ModelId(), Version(), METRIC_REPORTER_ID_UTILITY, ResponseCacheEnabled(), - IsDecoupled(), Config().metric_tags(), &reporter_); + IsDecoupled(), Config().metric_tags(), Config().model_metrics(), + &reporter_); #endif // TRITON_ENABLE_METRICS return Status::Success; diff --git a/src/model_config_utils.cc b/src/model_config_utils.cc index 4fd1e1be0..c5ab257e9 100644 --- a/src/model_config_utils.cc +++ b/src/model_config_utils.cc @@ -446,6 +446,39 @@ ValidateNonLinearFormatIO( return Status::Success; } +// Helper function to validate that model_metrics contains all required data. +Status +ValidateModelMetrics(const inference::ModelMetrics& model_metrics) +{ + for (const auto& metric_control : model_metrics.metric_control()) { + if (!metric_control.has_metric_identifier()) { + return Status( + Status::Code::INVALID_ARG, + "metric control must specify 'metric_identifier'"); + } + + if (metric_control.metric_identifier().family().empty()) { + return Status( + Status::Code::INVALID_ARG, + "metric identifier must specify non-empty 'family'"); + } + + if (!metric_control.has_histogram_options()) { + return Status( + Status::Code::INVALID_ARG, + "metric control must specify 'histogram_options'"); + } + + if (metric_control.histogram_options().buckets_size() == 0) { + return Status( + Status::Code::INVALID_ARG, + "histogram options must specify non-empty 'buckets'"); + } + } + + return Status::Success; +} + } // namespace Status @@ -1591,7 +1624,7 @@ ValidateModelConfig( } } - // If ensemble scheduling is specified, validate it. Otherwise, + // If ensemble scheduling is specified, validate it. Otherwise, // must validate platform and instance_group if (config.has_ensemble_scheduling()) { #ifdef TRITON_ENABLE_ENSEMBLE @@ -1620,6 +1653,14 @@ ValidateModelConfig( " cache."); } + // If model_metric is specified, validate it. + if (config.has_model_metrics()) { +#ifdef TRITON_ENABLE_METRICS + RETURN_IF_ERROR(ValidateModelMetrics(config.model_metrics())); +#else + return Status(Status::Code::INVALID_ARG, "metrics not supported"); +#endif // TRITON_ENABLE_METRICS + } return Status::Success; }