diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc
index 1aa8a9c48..87b602c82 100644
--- a/src/backend_model_instance.cc
+++ b/src/backend_model_instance.cc
@@ -192,7 +192,7 @@ TritonModelInstance::TritonModelInstance(
         model_->Server()->ResponseCacheEnabled();
     MetricModelReporter::Create(
         model_->ModelId(), model_->Version(), id, response_cache_enabled,
-        model_->Config().metric_tags(), &reporter_);
+        model_->IsDecoupled(), model_->Config().metric_tags(), &reporter_);
   }
 #endif  // TRITON_ENABLE_METRICS
 }
diff --git a/src/ensemble_scheduler/ensemble_scheduler.cc b/src/ensemble_scheduler/ensemble_scheduler.cc
index 609c1ff28..64f118144 100644
--- a/src/ensemble_scheduler/ensemble_scheduler.cc
+++ b/src/ensemble_scheduler/ensemble_scheduler.cc
@@ -1470,12 +1470,13 @@ EnsembleScheduler::EnsembleScheduler(
   }
 #endif  // TRITON_ENABLE_GPU
 
+  const bool is_decoupled = config.model_transaction_policy().decoupled();
 #ifdef TRITON_ENABLE_METRICS
   if (Metrics::Enabled()) {
     // Ensemble scheduler doesn't currently support response cache at top level.
     MetricModelReporter::Create(
         model_id, 1 /* model_version */, METRIC_REPORTER_ID_CPU,
-        false /* response_cache_enabled */, config.metric_tags(),
+        false /* response_cache_enabled */, is_decoupled, config.metric_tags(),
         &metric_reporter_);
   }
 #endif  // TRITON_ENABLE_METRICS
@@ -1486,7 +1487,7 @@ EnsembleScheduler::EnsembleScheduler(
   info_->ensemble_name_ = config.name();
 
   // This config field is filled internally for ensemble models
-  info_->is_decoupled_ = config.model_transaction_policy().decoupled();
+  info_->is_decoupled_ = is_decoupled;
 
   // field to check if response cache enabled in the ensemble model config.
   info_->is_cache_enabled_ =
diff --git a/src/infer_response.cc b/src/infer_response.cc
index 6eec9d1a3..498036cde 100644
--- a/src/infer_response.cc
+++ b/src/infer_response.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -42,7 +42,12 @@ InferenceResponseFactory::CreateResponse(
 {
   response->reset(new InferenceResponse(
       model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_,
-      response_delegator_));
+      response_delegator_
+#ifdef TRITON_ENABLE_METRICS
+      ,
+      responses_sent_, infer_start_ns_
+#endif  // TRITON_ENABLE_METRICS
+      ));
 #ifdef TRITON_ENABLE_TRACING
   (*response)->SetTrace(trace_);
 #endif  // TRITON_ENABLE_TRACING
@@ -72,10 +77,21 @@ InferenceResponse::InferenceResponse(
     TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
     void* response_userp,
     const std::function<
-        void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator)
+        void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator
+#ifdef TRITON_ENABLE_METRICS
+    ,
+    std::shared_ptr<std::atomic<uint64_t>> responses_sent,
+    uint64_t infer_start_ns
+#endif  // TRITON_ENABLE_METRICS
+    )
     : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp),
       response_fn_(response_fn), response_userp_(response_userp),
-      response_delegator_(delegator), null_response_(false)
+      response_delegator_(delegator),
+#ifdef TRITON_ENABLE_METRICS
+      responses_sent_(std::move(responses_sent)),
+      infer_start_ns_(infer_start_ns),
+#endif  // TRITON_ENABLE_METRICS
+      null_response_(false)
 {
   // If the allocator has a start_fn then invoke it.
   TRITONSERVER_ResponseAllocatorStartFn_t start_fn = allocator_->StartFn();
@@ -93,6 +109,9 @@ InferenceResponse::InferenceResponse(
     TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
     void* response_userp)
     : response_fn_(response_fn), response_userp_(response_userp),
+#ifdef TRITON_ENABLE_METRICS
+      responses_sent_(nullptr), infer_start_ns_(0),
+#endif  // TRITON_ENABLE_METRICS
       null_response_(true)
 {
 }
@@ -214,6 +233,10 @@ InferenceResponse::Send(
       TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT, "InferenceResponse Send");
 #endif  // TRITON_ENABLE_TRACING
 
+#ifdef TRITON_ENABLE_METRICS
+  response->UpdateResponseMetrics();
+#endif  // TRITON_ENABLE_METRICS
+
   if (response->response_delegator_ != nullptr) {
     auto ldelegator = std::move(response->response_delegator_);
     ldelegator(std::move(response), flags);
@@ -282,6 +305,25 @@ InferenceResponse::TraceOutputTensors(
 }
 #endif  // TRITON_ENABLE_TRACING
 
+#ifdef TRITON_ENABLE_METRICS
+void
+InferenceResponse::UpdateResponseMetrics() const
+{
+  // Report inference-to-first-response duration.
+  if (model_ != nullptr && responses_sent_ != nullptr &&
+      responses_sent_->fetch_add(1, std::memory_order_relaxed) == 0) {
+    auto now_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                      std::chrono::steady_clock::now().time_since_epoch())
+                      .count();
+    if (auto reporter = model_->MetricReporter()) {
+      reporter->ObserveHistogram(
+          "first_response_histogram",
+          (now_ns - infer_start_ns_) / NANOS_PER_MILLIS);
+    }
+  }
+}
+#endif  // TRITON_ENABLE_METRICS
+
 //
 // InferenceResponse::Output
 //
diff --git a/src/infer_response.h b/src/infer_response.h
index 612f8c1fe..700d14a3c 100644
--- a/src/infer_response.h
+++ b/src/infer_response.h
@@ -61,11 +61,20 @@ class InferenceResponseFactory {
         alloc_userp_(alloc_userp), response_fn_(response_fn),
         response_userp_(response_userp), response_delegator_(delegator),
         is_cancelled_(false)
+#ifdef TRITON_ENABLE_METRICS
+        ,
+        responses_sent_(std::make_shared<std::atomic<uint64_t>>(0))
+#endif  // TRITON_ENABLE_METRICS
 #ifdef TRITON_ENABLE_STATS
         ,
         response_stats_index_(0)
 #endif  // TRITON_ENABLE_STATS
   {
+#ifdef TRITON_ENABLE_METRICS
+    infer_start_ns_ = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                          std::chrono::steady_clock::now().time_since_epoch())
+                          .count();
+#endif  // TRITON_ENABLE_METRICS
   }
 
   void Cancel() { is_cancelled_ = true; }
@@ -134,6 +143,14 @@ class InferenceResponseFactory {
 
   std::atomic<bool> is_cancelled_;
 
+#ifdef TRITON_ENABLE_METRICS
+  // Total number of responses sent that were created by this response factory.
+  std::shared_ptr<std::atomic<uint64_t>> responses_sent_;
+
+  // The start time of the associated request in ns.
+  uint64_t infer_start_ns_;
+#endif  // TRITON_ENABLE_METRICS
+
 #ifdef TRITON_ENABLE_TRACING
   // Inference trace associated with this response.
   std::shared_ptr<InferenceTrace> trace_;
@@ -246,8 +263,14 @@ class InferenceResponse {
       const ResponseAllocator* allocator, void* alloc_userp,
       TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
       void* response_userp,
-      const std::function<void(
-          std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator);
+      const std::function<
+          void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator
+#ifdef TRITON_ENABLE_METRICS
+      ,
+      std::shared_ptr<std::atomic<uint64_t>> responses_sent,
+      uint64_t infer_start_ns
+#endif  // TRITON_ENABLE_METRICS
+      );
 
   // "null" InferenceResponse is a special instance of InferenceResponse which
   // contains minimal information for calling InferenceResponse::Send,
@@ -324,6 +347,10 @@ class InferenceResponse {
       TRITONSERVER_InferenceTraceActivity activity, const std::string& msg);
 #endif  // TRITON_ENABLE_TRACING
 
+#ifdef TRITON_ENABLE_METRICS
+  void UpdateResponseMetrics() const;
+#endif  // TRITON_ENABLE_METRICS
+
   // The model associated with this factory. For normal
   // requests/responses this will always be defined and acts to keep
   // the model loaded as long as this factory is live. It may be
@@ -358,6 +385,14 @@ class InferenceResponse {
   std::function<void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>
       response_delegator_;
 
+#ifdef TRITON_ENABLE_METRICS
+  // Total number of responses sent that were created by this response's
+  // factory.
+  const std::shared_ptr<std::atomic<uint64_t>> responses_sent_;
+
+  // The start time of the associated request in ns.
+  const uint64_t infer_start_ns_;
+#endif  // TRITON_ENABLE_METRICS
+
   bool null_response_;
 
 #ifdef TRITON_ENABLE_TRACING
diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc
index 71d3eca94..a10db8f64 100644
--- a/src/metric_model_reporter.cc
+++ b/src/metric_model_reporter.cc
@@ -41,7 +41,8 @@ namespace triton { namespace core {
 // MetricReporterConfig
 //
 void
-MetricReporterConfig::ParseConfig(bool response_cache_enabled)
+MetricReporterConfig::ParseConfig(
+    bool response_cache_enabled, bool is_decoupled)
 {
   // Global config only for now in config map
   auto metrics_config_map = Metrics::ConfigMap();
@@ -53,6 +54,10 @@ MetricReporterConfig::ParseConfig(bool response_cache_enabled)
       latency_counters_enabled_ = false;
     }
 
+    if (pair.first == "histogram_latencies" && pair.second == "true") {
+      latency_histograms_enabled_ = true;
+    }
+
     if (pair.first == "summary_latencies" && pair.second == "true") {
       latency_summaries_enabled_ = true;
     }
@@ -68,6 +73,7 @@ MetricReporterConfig::ParseConfig(bool response_cache_enabled)
 
   // Set flag to signal to stats aggregator if caching is enabled or not
   cache_enabled_ = response_cache_enabled;
+  is_decoupled_ = is_decoupled;
 }
 
 prometheus::Summary::Quantiles
@@ -112,7 +118,7 @@ const std::map<FailureReason, std::string>
 Status
 MetricModelReporter::Create(
     const ModelIdentifier& model_id, const int64_t model_version,
-    const int device, bool response_cache_enabled,
+    const int device, bool response_cache_enabled, bool is_decoupled,
     const triton::common::MetricTagsMap& model_tags,
     std::shared_ptr<MetricModelReporter>* metric_model_reporter)
 {
@@ -141,25 +147,27 @@ MetricModelReporter::Create(
   }
 
   metric_model_reporter->reset(new MetricModelReporter(
-      model_id, model_version, device, response_cache_enabled, model_tags));
+      model_id, model_version, device, response_cache_enabled, is_decoupled,
+      model_tags));
   reporter_map.insert({hash_labels, *metric_model_reporter});
   return Status::Success;
 }
 
 MetricModelReporter::MetricModelReporter(
     const ModelIdentifier& model_id, const int64_t model_version,
-    const int device, bool response_cache_enabled,
+    const int device, bool response_cache_enabled, bool is_decoupled,
     const triton::common::MetricTagsMap& model_tags)
 {
   std::map<std::string, std::string> labels;
   GetMetricLabels(&labels, model_id, model_version, device, model_tags);
 
   // Parse metrics config to control metric setup and behavior
-  config_.ParseConfig(response_cache_enabled);
+  config_.ParseConfig(response_cache_enabled, is_decoupled);
 
   // Initialize families and metrics
   InitializeCounters(labels);
   InitializeGauges(labels);
+  InitializeHistograms(labels);
   InitializeSummaries(labels);
 }
@@ -182,6 +190,14 @@ MetricModelReporter::~MetricModelReporter()
     }
   }
 
+  for (auto& iter : histogram_families_) {
+    const auto& name = iter.first;
+    auto family_ptr = iter.second;
+    if (family_ptr) {
+      family_ptr->Remove(histograms_[name]);
+    }
+  }
+
   for (auto& iter : summary_families_) {
     const auto& name = iter.first;
     auto family_ptr = iter.second;
@@ -262,6 +278,28 @@ MetricModelReporter::InitializeGauges(
   }
 }
 
+void
+MetricModelReporter::InitializeHistograms(
+    const std::map<std::string, std::string>& labels)
+{
+  // Only create response metrics for decoupled models to reduce metric output
+  if (config_.latency_histograms_enabled_) {
+    if (config_.is_decoupled_) {
+      histogram_families_["first_response_histogram"] =
+          &Metrics::FamilyFirstResponseDuration();
+    }
+  }
+
+  for (auto& iter : histogram_families_) {
+    const auto& name = iter.first;
+    auto family_ptr = iter.second;
+    if (family_ptr) {
+      histograms_[name] = CreateMetric<prometheus::Histogram>(
+          *family_ptr, labels, config_.buckets_);
+    }
+  }
+}
+
 void
 MetricModelReporter::InitializeSummaries(
     const std::map<std::string, std::string>& labels)
@@ -408,6 +446,23 @@ MetricModelReporter::DecrementGauge(const std::string& name, double value)
   IncrementGauge(name, -1 * value);
 }
 
+void
+MetricModelReporter::ObserveHistogram(const std::string& name, double value)
+{
+  auto iter = histograms_.find(name);
+  if (iter == histograms_.end()) {
+    // No histogram metric exists with this name
+    return;
+  }
+
+  auto histogram = iter->second;
+  if (!histogram) {
+    // Histogram is uninitialized/nullptr
+    return;
+  }
+  histogram->Observe(value);
+}
+
 void
 MetricModelReporter::ObserveSummary(const std::string& name, double value)
 {
diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h
index 6482cabb6..236bc8f5f 100644
--- a/src/metric_model_reporter.h
+++ b/src/metric_model_reporter.h
@@ -46,15 +46,21 @@ struct ModelIdentifier;
 struct MetricReporterConfig {
 #ifdef TRITON_ENABLE_METRICS
   // Parses Metrics::ConfigMap and sets fields if specified
-  void ParseConfig(bool response_cache_enabled);
+  void ParseConfig(bool response_cache_enabled, bool is_decoupled);
 
   // Parses pairs of quantiles "quantile1:error1, quantile2:error2, ..."
   // and overwrites quantiles_ field if successful.
   prometheus::Summary::Quantiles ParseQuantiles(std::string options);
 
   // Create and use Counters for per-model latency related metrics
   bool latency_counters_enabled_ = true;
+  // Create and use Histograms for per-model latency related metrics
+  bool latency_histograms_enabled_ = false;
   // Create and use Summaries for per-model latency related metrics
   bool latency_summaries_enabled_ = false;
+  // Buckets used for any histogram metrics. Each value represents
+  // a bucket boundary. For example, {100, 500, 2000, 5000} are latencies
+  // in milliseconds in first_response_histogram.
+  prometheus::Histogram::BucketBoundaries buckets_ = {100, 500, 2000, 5000};
 
   // Quantiles used for any summary metrics. Each pair of values represents
   // { quantile, error }. For example, {0.90, 0.01} means to compute the
   // 90th percentile with 1% error on either side, so the approximate 90th
@@ -65,6 +71,8 @@ struct MetricReporterConfig {
   // Whether this reporter's model has caching enabled or not.
   // This helps handle infer_stats aggregation for summaries on cache misses.
   bool cache_enabled_ = false;
+
+  bool is_decoupled_ = false;
 #endif  // TRITON_ENABLE_METRICS
 };
 
@@ -77,7 +85,7 @@ class MetricModelReporter {
   static Status Create(
       const triton::core::ModelIdentifier& model_id,
       const int64_t model_version, const int device,
-      bool response_cache_enabled,
+      bool response_cache_enabled, bool is_decoupled,
       const triton::common::MetricTagsMap& model_tags,
       std::shared_ptr<MetricModelReporter>* metric_model_reporter);
 
@@ -93,6 +101,8 @@ class MetricModelReporter {
   void IncrementGauge(const std::string& name, double value);
   // Decrease gauge by value.
   void DecrementGauge(const std::string& name, double value);
+  // Lookup histogram metric by name, and observe the value if it exists.
+  void ObserveHistogram(const std::string& name, double value);
   // Lookup summary metric by name, and observe the value if it exists.
   void ObserveSummary(const std::string& name, double value);
 
@@ -101,7 +111,7 @@ class MetricModelReporter {
  private:
   MetricModelReporter(
       const ModelIdentifier& model_id, const int64_t model_version,
-      const int device, bool response_cache_enabled,
+      const int device, bool response_cache_enabled, bool is_decoupled,
      const triton::common::MetricTagsMap& model_tags);
 
   static void GetMetricLabels(
@@ -116,6 +126,7 @@ class MetricModelReporter {
 
   void InitializeCounters(const std::map<std::string, std::string>& labels);
   void InitializeGauges(const std::map<std::string, std::string>& labels);
+  void InitializeHistograms(const std::map<std::string, std::string>& labels);
   void InitializeSummaries(const std::map<std::string, std::string>& labels);
 
   // Lookup gauge metric by name. Return gauge if found, nullptr otherwise.
@@ -127,12 +138,15 @@ class MetricModelReporter {
       counter_families_;
   std::unordered_map<std::string, prometheus::Family<prometheus::Gauge>*>
       gauge_families_;
+  std::unordered_map<std::string, prometheus::Family<prometheus::Histogram>*>
+      histogram_families_;
   std::unordered_map<std::string, prometheus::Family<prometheus::Summary>*>
       summary_families_;
 
   // Metrics
   std::unordered_map<std::string, prometheus::Counter*> counters_;
   std::unordered_map<std::string, prometheus::Gauge*> gauges_;
+  std::unordered_map<std::string, prometheus::Histogram*> histograms_;
   std::unordered_map<std::string, prometheus::Summary*> summaries_;
 
   // Config
diff --git a/src/metrics.cc b/src/metrics.cc
index 775c46b97..f9b141b73 100644
--- a/src/metrics.cc
+++ b/src/metrics.cc
@@ -1,4 +1,4 @@
-// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -109,6 +109,12 @@ Metrics::Metrics()
                 "execution per-model.")
             .Register(*registry_)),
 
+      inf_first_response_histogram_ms_family_(
+          prometheus::BuildHistogram()
+              .Name("nv_inference_first_response_histogram_ms")
+              .Help("Duration from request to first response in milliseconds")
+              .Register(*registry_)),
+
       model_load_time_family_(prometheus::BuildGauge()
                                   .Name("nv_model_load_duration_secs")
                                   .Help("Model load time in seconds")
diff --git a/src/metrics.h b/src/metrics.h
index 0ae516368..af983cdca 100644
--- a/src/metrics.h
+++ b/src/metrics.h
@@ -215,6 +215,12 @@ class Metrics {
     return GetSingleton()->inf_pending_request_count_family_;
   }
 
+  static prometheus::Family<prometheus::Histogram>&
+  FamilyFirstResponseDuration()
+  {
+    return GetSingleton()->inf_first_response_histogram_ms_family_;
+  }
+
   // Metric family of load time per model
   static prometheus::Family<prometheus::Gauge>& FamilyModelLoadTime()
   {
@@ -306,6 +312,8 @@ class Metrics {
   prometheus::Family<prometheus::Counter>&
       inf_compute_output_duration_us_family_;
   prometheus::Family<prometheus::Gauge>& inf_pending_request_count_family_;
+  prometheus::Family<prometheus::Histogram>&
+      inf_first_response_histogram_ms_family_;
   prometheus::Family<prometheus::Gauge>& model_load_time_family_;
 
   prometheus::Family<prometheus::Gauge>& pinned_memory_pool_total_family_;
diff --git a/src/model.cc b/src/model.cc
index 9da281ed5..37fb87de2 100644
--- a/src/model.cc
+++ b/src/model.cc
@@ -135,7 +135,7 @@ Model::Init(const bool is_config_provided)
 #ifdef TRITON_ENABLE_METRICS
   MetricModelReporter::Create(
       ModelId(), Version(), METRIC_REPORTER_ID_UTILITY, ResponseCacheEnabled(),
-      Config().metric_tags(), &reporter_);
+      IsDecoupled(), Config().metric_tags(), &reporter_);
 #endif  // TRITON_ENABLE_METRICS
 
   return Status::Success;
diff --git a/src/model.h b/src/model.h
index 286ca8db1..2acf63b11 100644
--- a/src/model.h
+++ b/src/model.h
@@ -148,6 +148,12 @@ class Model {
     return config_.response_cache().enable();
   }
 
+  // Get whether the model is decoupled.
+  bool IsDecoupled() const
+  {
+    return config_.model_transaction_policy().decoupled();
+  }
+
   // Get the number of required inputs
   size_t RequiredInputCount() const { return required_input_count_; }
 
diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc
index 8ffb85bd6..6d7d35db7 100644
--- a/src/test/response_cache_test.cc
+++ b/src/test/response_cache_test.cc
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -46,7 +46,12 @@ InferenceResponseFactory::CreateResponse(
 {
   response->reset(new InferenceResponse(
       model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_,
-      response_delegator_));
+      response_delegator_
+#ifdef TRITON_ENABLE_METRICS
+      ,
+      responses_sent_, infer_start_ns_
+#endif  // TRITON_ENABLE_METRICS
+      ));
 
   return Status::Success;
 }
@@ -181,10 +186,19 @@ InferenceResponse::InferenceResponse(
     TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
     void* response_userp,
     const std::function<
-        void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator)
+        void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator
+#ifdef TRITON_ENABLE_METRICS
+    ,
+    std::shared_ptr<std::atomic<uint64_t>> responses_sent, uint64_t infer_start_ns
+#endif  // TRITON_ENABLE_METRICS
+    )
     : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp),
       response_fn_(response_fn), response_userp_(response_userp),
-      response_delegator_(delegator), null_response_(false)
+      response_delegator_(delegator),
+#ifdef TRITON_ENABLE_METRICS
+      responses_sent_(responses_sent), infer_start_ns_(infer_start_ns),
+#endif  // TRITON_ENABLE_METRICS
+      null_response_(false)
 {
   // Skip allocator logic / references in unit test
 }
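
Note (reviewer sketch, not part of the patch): the new metric is only emitted when histogram latencies are enabled via the "histogram_latencies" key parsed in MetricReporterConfig::ParseConfig() and the model is decoupled; the metric itself is plain prometheus-cpp. A minimal, self-contained illustration of the BuildHistogram()/Add()/Observe() pattern the patch wires up follows; the label keys are illustrative placeholders, not the exact output of GetMetricLabels().

// Standalone sketch under the assumptions above; compiles against prometheus-cpp.
#include <prometheus/histogram.h>
#include <prometheus/registry.h>

#include <memory>

int main()
{
  auto registry = std::make_shared<prometheus::Registry>();

  // Equivalent of Metrics::FamilyFirstResponseDuration(): one family,
  // registered once per process.
  auto& family =
      prometheus::BuildHistogram()
          .Name("nv_inference_first_response_histogram_ms")
          .Help("Duration from request to first response in milliseconds")
          .Register(*registry);

  // Equivalent of InitializeHistograms(): one labeled histogram per model,
  // constructed with the default MetricReporterConfig::buckets_ boundaries.
  auto& first_response_ms = family.Add(
      {{"model", "example_model"}, {"version", "1"}},
      prometheus::Histogram::BucketBoundaries{100, 500, 2000, 5000});

  // Equivalent of ObserveHistogram("first_response_histogram", value), called
  // from InferenceResponse::UpdateResponseMetrics() when the first response of
  // a request is sent: record request-to-first-response latency in ms.
  first_response_ms.Observe(42.0);
  return 0;
}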