From b9231d8dd2d96a8bd9a56984b5a0a06e7c507844 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Fri, 11 Oct 2024 17:25:55 -0700 Subject: [PATCH 01/10] Add histogram support and new TTFT metric --- src/backend_model_instance.cc | 2 +- src/ensemble_scheduler/ensemble_scheduler.cc | 5 +- src/infer_response.cc | 53 ++++++++++++++-- src/infer_response.h | 35 ++++++++++- src/metric_model_reporter.cc | 65 ++++++++++++++++++-- src/metric_model_reporter.h | 17 ++++- src/metrics.cc | 8 ++- src/metrics.h | 8 +++ src/model.cc | 2 +- src/model.h | 5 ++ src/test/response_cache_test.cc | 25 ++++++-- 11 files changed, 198 insertions(+), 27 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index 1aa8a9c48..ba35e2fbd 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -192,7 +192,7 @@ TritonModelInstance::TritonModelInstance( model_->Server()->ResponseCacheEnabled(); MetricModelReporter::Create( model_->ModelId(), model_->Version(), id, response_cache_enabled, - model_->Config().metric_tags(), &reporter_); + model_->isDecoupled(), model_->Config().metric_tags(), &reporter_); } #endif // TRITON_ENABLE_METRICS } diff --git a/src/ensemble_scheduler/ensemble_scheduler.cc b/src/ensemble_scheduler/ensemble_scheduler.cc index b16567dd7..8cd8be5a6 100644 --- a/src/ensemble_scheduler/ensemble_scheduler.cc +++ b/src/ensemble_scheduler/ensemble_scheduler.cc @@ -1469,12 +1469,13 @@ EnsembleScheduler::EnsembleScheduler( } #endif // TRITON_ENABLE_GPU + const bool is_decoupled = config.model_transaction_policy().decoupled(); #ifdef TRITON_ENABLE_METRICS if (Metrics::Enabled()) { // Ensemble scheduler doesn't currently support response cache at top level. MetricModelReporter::Create( model_id, 1 /* model_version */, METRIC_REPORTER_ID_CPU, - false /* response_cache_enabled */, config.metric_tags(), + false /* response_cache_enabled */, is_decoupled, config.metric_tags(), &metric_reporter_); } #endif // TRITON_ENABLE_METRICS @@ -1485,7 +1486,7 @@ EnsembleScheduler::EnsembleScheduler( info_->ensemble_name_ = config.name(); // This config field is filled internally for ensemble models - info_->is_decoupled_ = config.model_transaction_policy().decoupled(); + info_->is_decoupled_ = is_decoupled; // field to check if response cache enabled in the ensemble model config. info_->is_cache_enabled_ = diff --git a/src/infer_response.cc b/src/infer_response.cc index 6eec9d1a3..3c2b39e17 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -38,14 +38,20 @@ namespace triton { namespace core { // Status InferenceResponseFactory::CreateResponse( - std::unique_ptr* response) const + std::unique_ptr* response) { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_)); + response_delegator_, response_cnt_ +#ifdef TRITON_ENABLE_METRICS + , + infer_start_ns_ +#endif // TRITON_ENABLE_METRICS + )); #ifdef TRITON_ENABLE_TRACING (*response)->SetTrace(trace_); #endif // TRITON_ENABLE_TRACING + response_cnt_++; return Status::Success; } @@ -72,10 +78,20 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator) + void(std::unique_ptr&&, const uint32_t)>& delegator, + uint64_t seq_num +#ifdef TRITON_ENABLE_METRICS + , + uint64_t infer_start_ns +#endif // TRITON_ENABLE_METRICS + ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), null_response_(false) + response_delegator_(delegator), seq_num_(seq_num), +#ifdef TRITON_ENABLE_METRICS + infer_start_ns_(infer_start_ns), +#endif // TRITON_ENABLE_METRICS + null_response_(false) { // If the allocator has a start_fn then invoke it. TRITONSERVER_ResponseAllocatorStartFn_t start_fn = allocator_->StartFn(); @@ -92,7 +108,10 @@ InferenceResponse::InferenceResponse( InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp) - : response_fn_(response_fn), response_userp_(response_userp), + : response_fn_(response_fn), response_userp_(response_userp), seq_num_(0), +#ifdef TRITON_ENABLE_METRICS + infer_start_ns_(0), +#endif // TRITON_ENABLE_METRICS null_response_(true) { } @@ -214,6 +233,10 @@ InferenceResponse::Send( TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT, "InferenceResponse Send"); #endif // TRITON_ENABLE_TRACING +#ifdef TRITON_ENABLE_METRICS + response->UpdateResponseMetrics(); +#endif // TRITON_ENABLE_METRICS + if (response->response_delegator_ != nullptr) { auto ldelegator = std::move(response->response_delegator_); ldelegator(std::move(response), flags); @@ -282,6 +305,24 @@ InferenceResponse::TraceOutputTensors( } #endif // TRITON_ENABLE_TRACING +#ifdef TRITON_ENABLE_METRICS +void +InferenceResponse::UpdateResponseMetrics() const +{ + if (model_ != nullptr && seq_num_ == 0) { + auto first_response_ns = + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); + if (auto reporter = model_->MetricReporter()) { + reporter->ObserveHistogram( + "first_response_histogram", + (first_response_ns - infer_start_ns_) / 1000000); + } + } +} +#endif // TRITON_ENABLE_METRICS + // // InferenceResponse::Output // diff --git a/src/infer_response.h b/src/infer_response.h index 612f8c1fe..88b158ab1 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -60,12 +60,17 @@ class InferenceResponseFactory { : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), response_delegator_(delegator), - is_cancelled_(false) + is_cancelled_(false), response_cnt_(0) #ifdef TRITON_ENABLE_STATS , response_stats_index_(0) #endif // TRITON_ENABLE_STATS { +#ifdef TRITON_ENABLE_METRICS 
+ infer_start_ns_ = std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +#endif // TRITON_ENABLE_METRICS } void Cancel() { is_cancelled_ = true; } @@ -84,7 +89,7 @@ class InferenceResponseFactory { } // Create a new response. - Status CreateResponse(std::unique_ptr* response) const; + Status CreateResponse(std::unique_ptr* response); // Send a "null" response with 'flags'. Status SendFlags(const uint32_t flags) const; @@ -134,6 +139,14 @@ class InferenceResponseFactory { std::atomic is_cancelled_; + // The number of responses created by this factory. + std::atomic response_cnt_; + +#ifdef TRITON_ENABLE_METRICS + // The start time of associate request in ns. + uint64_t infer_start_ns_; +#endif // TRITON_ENABLE_METRICS + #ifdef TRITON_ENABLE_TRACING // Inference trace associated with this response. std::shared_ptr trace_; @@ -247,7 +260,13 @@ class InferenceResponse { TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function&&, const uint32_t)>& delegator); + std::unique_ptr&&, const uint32_t)>& delegator, + uint64_t seq_num +#ifdef TRITON_ENABLE_METRICS + , + uint64_t infer_start_ns +#endif // TRITON_ENABLE_METRICS + ); // "null" InferenceResponse is a special instance of InferenceResponse which // contains minimal information for calling InferenceResponse::Send, @@ -324,6 +343,11 @@ class InferenceResponse { TRITONSERVER_InferenceTraceActivity activity, const std::string& msg); #endif // TRITON_ENABLE_TRACING + +#ifdef TRITON_ENABLE_METRICS + void UpdateResponseMetrics() const; +#endif // TRITON_ENABLE_METRICS + // The model associated with this factory. For normal // requests/responses this will always be defined and acts to keep // the model loaded as long as this factory is live. 
It may be @@ -358,6 +382,11 @@ class InferenceResponse { std::function&&, const uint32_t)> response_delegator_; + const uint64_t seq_num_; +#ifdef TRITON_ENABLE_METRICS + const uint64_t infer_start_ns_; +#endif // TRITON_ENABLE_METRICS + bool null_response_; #ifdef TRITON_ENABLE_TRACING diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc index 9dd9122be..75d2e87a5 100644 --- a/src/metric_model_reporter.cc +++ b/src/metric_model_reporter.cc @@ -41,7 +41,8 @@ namespace triton { namespace core { // MetricReporterConfig // void -MetricReporterConfig::ParseConfig(bool response_cache_enabled) +MetricReporterConfig::ParseConfig( + bool response_cache_enabled, bool is_decoupled) { // Global config only for now in config map auto metrics_config_map = Metrics::ConfigMap(); @@ -53,6 +54,10 @@ MetricReporterConfig::ParseConfig(bool response_cache_enabled) latency_counters_enabled_ = false; } + if (pair.first == "histogram_latencies" && pair.second == "false") { + latency_histograms_enabled_ = false; + } + if (pair.first == "summary_latencies" && pair.second == "true") { latency_summaries_enabled_ = true; } @@ -68,6 +73,7 @@ MetricReporterConfig::ParseConfig(bool response_cache_enabled) // Set flag to signal to stats aggregator if caching is enabled or not cache_enabled_ = response_cache_enabled; + is_decoupled_ = is_decoupled; } prometheus::Summary::Quantiles @@ -112,7 +118,7 @@ const std::map Status MetricModelReporter::Create( const ModelIdentifier& model_id, const int64_t model_version, - const int device, bool response_cache_enabled, + const int device, bool response_cache_enabled, bool is_decoupled, const triton::common::MetricTagsMap& model_tags, std::shared_ptr* metric_model_reporter) { @@ -141,25 +147,27 @@ MetricModelReporter::Create( } metric_model_reporter->reset(new MetricModelReporter( - model_id, model_version, device, response_cache_enabled, model_tags)); + model_id, model_version, device, response_cache_enabled, is_decoupled, + model_tags)); reporter_map.insert({hash_labels, *metric_model_reporter}); return Status::Success; } MetricModelReporter::MetricModelReporter( const ModelIdentifier& model_id, const int64_t model_version, - const int device, bool response_cache_enabled, + const int device, bool response_cache_enabled, bool is_decoupled, const triton::common::MetricTagsMap& model_tags) { std::map labels; GetMetricLabels(&labels, model_id, model_version, device, model_tags); // Parse metrics config to control metric setup and behavior - config_.ParseConfig(response_cache_enabled); + config_.ParseConfig(response_cache_enabled, is_decoupled); // Initialize families and metrics InitializeCounters(labels); InitializeGauges(labels); + InitializeHistograms(labels); InitializeSummaries(labels); } @@ -182,6 +190,14 @@ MetricModelReporter::~MetricModelReporter() } } + for (auto& iter : histogram_families_) { + const auto& name = iter.first; + auto family_ptr = iter.second; + if (family_ptr) { + family_ptr->Remove(histograms_[name]); + } + } + for (auto& iter : summary_families_) { const auto& name = iter.first; auto family_ptr = iter.second; @@ -261,6 +277,28 @@ MetricModelReporter::InitializeGauges( } } +void +MetricModelReporter::InitializeHistograms( + const std::map& labels) +{ + // Only create response metrics if decoupled model to reduce metric output + if (config_.latency_histograms_enabled_) { + if (config_.is_decoupled_) { + histogram_families_["first_response_histogram"] = + &Metrics::FamilyFirstResponseDuration(); + } + } + + for (auto& iter : 
histogram_families_) { + const auto& name = iter.first; + auto family_ptr = iter.second; + if (family_ptr) { + histograms_[name] = CreateMetric( + *family_ptr, labels, config_.buckets_); + } + } +} + void MetricModelReporter::InitializeSummaries( const std::map& labels) @@ -398,6 +436,23 @@ MetricModelReporter::DecrementGauge(const std::string& name, double value) IncrementGauge(name, -1 * value); } +void +MetricModelReporter::ObserveHistogram(const std::string& name, double value) +{ + auto iter = histograms_.find(name); + if (iter == histograms_.end()) { + // No histogram metric exists with this name + return; + } + + auto histogram = iter->second; + if (!histogram) { + // histogram is uninitialized/nullptr + return; + } + histogram->Observe(value); +} + void MetricModelReporter::ObserveSummary(const std::string& name, double value) { diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index 9378905ae..5ab9f0201 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -46,25 +46,30 @@ struct ModelIdentifier; struct MetricReporterConfig { #ifdef TRITON_ENABLE_METRICS // Parses Metrics::ConfigMap and sets fields if specified - void ParseConfig(bool response_cache_enabled); + void ParseConfig(bool response_cache_enabled, bool is_decoupled); // Parses pairs of quantiles "quantile1:error1, quantile2:error2, ..." // and overwrites quantiles_ field if successful. prometheus::Summary::Quantiles ParseQuantiles(std::string options); // Create and use Counters for per-model latency related metrics bool latency_counters_enabled_ = true; + // Create and use Histograms for per-model latency related metrics + bool latency_histograms_enabled_ = true; // Create and use Summaries for per-model latency related metrics bool latency_summaries_enabled_ = false; // Quantiles used for any summary metrics. Each pair of values represents // { quantile, error }. For example, {0.90, 0.01} means to compute the // 90th percentile with 1% error on either side, so the approximate 90th // percentile value will be between the 89th and 91st percentiles. + prometheus::Histogram::BucketBoundaries buckets_ = {10, 100, 500, 1000}; prometheus::Summary::Quantiles quantiles_ = { {0.5, 0.05}, {0.9, 0.01}, {0.95, 0.001}, {0.99, 0.001}, {0.999, 0.001}}; // Whether this reporter's model has caching enabled or not. // This helps handle infer_stats aggregation for summaries on cache misses. bool cache_enabled_ = false; + + bool is_decoupled_ = false; #endif // TRITON_ENABLE_METRICS }; @@ -77,7 +82,7 @@ class MetricModelReporter { static Status Create( const triton::core::ModelIdentifier& model_id, const int64_t model_version, const int device, - bool response_cache_enabled, + bool response_cache_enabled, bool is_decoupled, const triton::common::MetricTagsMap& model_tags, std::shared_ptr* metric_model_reporter); @@ -91,6 +96,8 @@ class MetricModelReporter { void IncrementGauge(const std::string& name, double value); // Decrease gauge by value. void DecrementGauge(const std::string& name, double value); + // Lookup histogram metric by name, and observe the value if it exists. + void ObserveHistogram(const std::string& name, double value); // Lookup summary metric by name, and observe the value if it exists. 
void ObserveSummary(const std::string& name, double value); @@ -99,7 +106,7 @@ class MetricModelReporter { private: MetricModelReporter( const ModelIdentifier& model_id, const int64_t model_version, - const int device, bool response_cache_enabled, + const int device, bool response_cache_enabled, bool is_decoupled, const triton::common::MetricTagsMap& model_tags); static void GetMetricLabels( @@ -114,6 +121,7 @@ class MetricModelReporter { void InitializeCounters(const std::map& labels); void InitializeGauges(const std::map& labels); + void InitializeHistograms(const std::map& labels); void InitializeSummaries(const std::map& labels); // Lookup gauge metric by name. Return gauge if found, nullptr otherwise. @@ -125,12 +133,15 @@ class MetricModelReporter { counter_families_; std::unordered_map*> gauge_families_; + std::unordered_map*> + histogram_families_; std::unordered_map*> summary_families_; // Metrics std::unordered_map counters_; std::unordered_map gauges_; + std::unordered_map histograms_; std::unordered_map summaries_; // Config diff --git a/src/metrics.cc b/src/metrics.cc index 28ab921ca..3922dbccb 100644 --- a/src/metrics.cc +++ b/src/metrics.cc @@ -1,4 +1,4 @@ -// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -109,6 +109,12 @@ Metrics::Metrics() "execution per-model.") .Register(*registry_)), + inf_first_response_histogram_ms_family_( + prometheus::BuildHistogram() + .Name("nv_inference_first_response_histogram_ms") + .Help("Duration from request to first response in milliseconds.") + .Register(*registry_)), + pinned_memory_pool_total_family_( prometheus::BuildGauge() .Name("nv_pinned_memory_pool_total_bytes") diff --git a/src/metrics.h b/src/metrics.h index 6d08ad168..833bab290 100644 --- a/src/metrics.h +++ b/src/metrics.h @@ -215,6 +215,12 @@ class Metrics { return GetSingleton()->inf_pending_request_count_family_; } + static prometheus::Family& + FamilyFirstResponseDuration() + { + return GetSingleton()->inf_first_response_histogram_ms_family_; + } + // Metric families of per-model response cache metrics // NOTE: These are used in infer_stats for perf_analyzer static prometheus::Family& FamilyCacheHitCount() @@ -300,6 +306,8 @@ class Metrics { prometheus::Family& inf_compute_output_duration_us_family_; prometheus::Family& inf_pending_request_count_family_; + prometheus::Family& + inf_first_response_histogram_ms_family_; prometheus::Family& pinned_memory_pool_total_family_; prometheus::Family& pinned_memory_pool_used_family_; diff --git a/src/model.cc b/src/model.cc index 9da281ed5..d15ca42d6 100644 --- a/src/model.cc +++ b/src/model.cc @@ -135,7 +135,7 @@ Model::Init(const bool is_config_provided) #ifdef TRITON_ENABLE_METRICS MetricModelReporter::Create( ModelId(), Version(), METRIC_REPORTER_ID_UTILITY, ResponseCacheEnabled(), - Config().metric_tags(), &reporter_); + isDecoupled(), Config().metric_tags(), &reporter_); #endif // TRITON_ENABLE_METRICS return Status::Success; diff --git a/src/model.h b/src/model.h index 286ca8db1..2374a81b3 100644 --- a/src/model.h +++ b/src/model.h @@ -148,6 +148,11 @@ class Model { return config_.response_cache().enable(); } + bool isDecoupled() const + { + return config_.model_transaction_policy().decoupled(); + } + // Get the number of required inputs size_t RequiredInputCount() const { 
return required_input_count_; } diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 8ffb85bd6..00a1826d8 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -42,11 +42,16 @@ namespace triton { namespace core { // Status InferenceResponseFactory::CreateResponse( - std::unique_ptr* response) const + std::unique_ptr* response) { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_)); + response_delegator_, response_cnt_ +#ifdef TRITON_ENABLE_METRICS + , + infer_start_ns_ +#endif // TRITON_ENABLE_METRICS + )); return Status::Success; } @@ -181,10 +186,20 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator) + void(std::unique_ptr&&, const uint32_t)>& delegator, + uint64_t seq_num +#ifdef TRITON_ENABLE_METRICS + , + uint64_t infer_start_ns +#endif // TRITON_ENABLE_METRICS + ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), null_response_(false) + response_delegator_(delegator), seq_num_(seq_num), +#ifdef TRITON_ENABLE_METRICS + infer_start_ns_(infer_start_ns), +#endif // TRITON_ENABLE_METRICS + null_response_(false) { // Skip allocator logic / references in unit test } From 8c9fe0c9d9eac48114ca5055eeb05eb3041e087f Mon Sep 17 00:00:00 2001 From: Yingge He Date: Mon, 14 Oct 2024 14:40:47 -0700 Subject: [PATCH 02/10] Minor update --- src/backend_model_instance.cc | 2 +- src/metrics.cc | 2 +- src/model.cc | 2 +- src/model.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index ba35e2fbd..87b602c82 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -192,7 +192,7 @@ TritonModelInstance::TritonModelInstance( model_->Server()->ResponseCacheEnabled(); MetricModelReporter::Create( model_->ModelId(), model_->Version(), id, response_cache_enabled, - model_->isDecoupled(), model_->Config().metric_tags(), &reporter_); + model_->IsDecoupled(), model_->Config().metric_tags(), &reporter_); } #endif // TRITON_ENABLE_METRICS } diff --git a/src/metrics.cc b/src/metrics.cc index 3922dbccb..eb0d17280 100644 --- a/src/metrics.cc +++ b/src/metrics.cc @@ -112,7 +112,7 @@ Metrics::Metrics() inf_first_response_histogram_ms_family_( prometheus::BuildHistogram() .Name("nv_inference_first_response_histogram_ms") - .Help("Duration from request to first response in milliseconds.") + .Help("Duration from request to first response in milliseconds") .Register(*registry_)), pinned_memory_pool_total_family_( diff --git a/src/model.cc b/src/model.cc index d15ca42d6..37fb87de2 100644 --- a/src/model.cc +++ b/src/model.cc @@ -135,7 +135,7 @@ Model::Init(const bool is_config_provided) #ifdef TRITON_ENABLE_METRICS MetricModelReporter::Create( ModelId(), Version(), METRIC_REPORTER_ID_UTILITY, ResponseCacheEnabled(), - isDecoupled(), Config().metric_tags(), &reporter_); + IsDecoupled(), Config().metric_tags(), 
&reporter_); #endif // TRITON_ENABLE_METRICS return Status::Success; diff --git a/src/model.h b/src/model.h index 2374a81b3..4781020af 100644 --- a/src/model.h +++ b/src/model.h @@ -148,7 +148,7 @@ class Model { return config_.response_cache().enable(); } - bool isDecoupled() const + bool IsDecoupled() const { return config_.model_transaction_policy().decoupled(); } From db990d308ccfbbd06d10c9e4a6eca73a1becfa47 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Mon, 14 Oct 2024 19:01:03 -0700 Subject: [PATCH 03/10] Reuse and rename response_stats_index_ --- src/backend_model_instance.cc | 3 +-- src/infer_response.cc | 26 +++++++++++++++++++------- src/infer_response.h | 27 +++++++++++++++------------ src/test/response_cache_test.cc | 18 ++++++++++++++---- 4 files changed, 49 insertions(+), 25 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index 87b602c82..feb20115e 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -1103,8 +1103,7 @@ TRITONBACKEND_ModelInstanceReportResponseStatistics( InferenceStatsAggregator* sa = rs->model_instance->Model()->MutableStatsAggregator(); - std::string key = - std::to_string((*rs->response_factory)->GetAndIncrementResponseIndex()); + std::string key = std::to_string((*rs->response_factory)->GetResponseIndex()); if (rs->error == nullptr) { if (rs->compute_output_start > 0) { diff --git a/src/infer_response.cc b/src/infer_response.cc index 3c2b39e17..5d5603bd5 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -42,7 +42,11 @@ InferenceResponseFactory::CreateResponse( { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_, response_cnt_ + response_delegator_ +#ifdef TRITON_ENABLE_STATS + , + response_index_++ +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS , infer_start_ns_ @@ -51,7 +55,6 @@ InferenceResponseFactory::CreateResponse( #ifdef TRITON_ENABLE_TRACING (*response)->SetTrace(trace_); #endif // TRITON_ENABLE_TRACING - response_cnt_++; return Status::Success; } @@ -78,8 +81,11 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_num + void(std::unique_ptr&&, const uint32_t)>& delegator +#ifdef TRITON_ENABLE_STATS + , + uint64_t index +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -87,7 +93,10 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), seq_num_(seq_num), + response_delegator_(delegator), +#ifdef TRITON_ENABLE_STATS + index_(index), +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS @@ -108,7 +117,10 @@ InferenceResponse::InferenceResponse( InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp) - : response_fn_(response_fn), response_userp_(response_userp), seq_num_(0), + : response_fn_(response_fn), response_userp_(response_userp), +#ifdef TRITON_ENABLE_STATS + index_(0), +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS infer_start_ns_(0), #endif // TRITON_ENABLE_METRICS @@ -309,7 +321,7 @@ InferenceResponse::TraceOutputTensors( void InferenceResponse::UpdateResponseMetrics() const 
{ - if (model_ != nullptr && seq_num_ == 0) { + if (model_ != nullptr && index_ == 0) { auto first_response_ns = std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) diff --git a/src/infer_response.h b/src/infer_response.h index 88b158ab1..ca2a9d4db 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -60,10 +60,10 @@ class InferenceResponseFactory { : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), response_delegator_(delegator), - is_cancelled_(false), response_cnt_(0) + is_cancelled_(false) #ifdef TRITON_ENABLE_STATS , - response_stats_index_(0) + response_index_(0) #endif // TRITON_ENABLE_STATS { #ifdef TRITON_ENABLE_METRICS @@ -104,8 +104,8 @@ class InferenceResponseFactory { #endif // TRITON_ENABLE_TRACING #ifdef TRITON_ENABLE_STATS - // Return the current response statistics index and increment it. - uint64_t GetAndIncrementResponseIndex() { return response_stats_index_++; }; + // Return the current response index. + uint64_t GetResponseIndex() { return response_index_; }; #endif // TRITON_ENABLE_STATS private: @@ -139,9 +139,6 @@ class InferenceResponseFactory { std::atomic is_cancelled_; - // The number of responses created by this factory. - std::atomic response_cnt_; - #ifdef TRITON_ENABLE_METRICS // The start time of associate request in ns. uint64_t infer_start_ns_; @@ -154,7 +151,7 @@ class InferenceResponseFactory { #ifdef TRITON_ENABLE_STATS // Number of response statistics reported. - std::atomic response_stats_index_; + std::atomic response_index_; #endif // TRITON_ENABLE_STATS }; @@ -259,9 +256,12 @@ class InferenceResponse { const ResponseAllocator* allocator, void* alloc_userp, TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, - const std::function&&, const uint32_t)>& delegator, - uint64_t seq_num + const std::function< + void(std::unique_ptr&&, const uint32_t)>& delegator +#ifdef TRITON_ENABLE_STATS + , + uint64_t index +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -382,7 +382,10 @@ class InferenceResponse { std::function&&, const uint32_t)> response_delegator_; - const uint64_t seq_num_; +#ifdef TRITON_ENABLE_STATS + const uint64_t index_; +#endif // TRITON_ENABLE_STATS + #ifdef TRITON_ENABLE_METRICS const uint64_t infer_start_ns_; #endif // TRITON_ENABLE_METRICS diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 00a1826d8..801bcef32 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -46,7 +46,11 @@ InferenceResponseFactory::CreateResponse( { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_, response_cnt_ + response_delegator_ +#ifdef TRITON_ENABLE_STATS + , + response_index_ +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS , infer_start_ns_ @@ -186,8 +190,11 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_num + void(std::unique_ptr&&, const uint32_t)>& delegator +#ifdef TRITON_ENABLE_STATS + , + uint64_t index +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -195,7 +202,10 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), 
response_userp_(response_userp), - response_delegator_(delegator), seq_num_(seq_num), + response_delegator_(delegator), +#ifdef TRITON_ENABLE_STATS + index_(index), +#endif // TRITON_ENABLE_STATS #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS From 189cf6445f593348daeb323451d2bf5fe72fc870 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Tue, 15 Oct 2024 10:35:25 -0700 Subject: [PATCH 04/10] Revert "Reuse and rename response_stats_index_" This reverts commit db990d308ccfbbd06d10c9e4a6eca73a1becfa47. --- src/backend_model_instance.cc | 3 ++- src/infer_response.cc | 26 +++++++------------------- src/infer_response.h | 27 ++++++++++++--------------- src/test/response_cache_test.cc | 18 ++++-------------- 4 files changed, 25 insertions(+), 49 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index feb20115e..87b602c82 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -1103,7 +1103,8 @@ TRITONBACKEND_ModelInstanceReportResponseStatistics( InferenceStatsAggregator* sa = rs->model_instance->Model()->MutableStatsAggregator(); - std::string key = std::to_string((*rs->response_factory)->GetResponseIndex()); + std::string key = + std::to_string((*rs->response_factory)->GetAndIncrementResponseIndex()); if (rs->error == nullptr) { if (rs->compute_output_start > 0) { diff --git a/src/infer_response.cc b/src/infer_response.cc index 5d5603bd5..3c2b39e17 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -42,11 +42,7 @@ InferenceResponseFactory::CreateResponse( { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_ -#ifdef TRITON_ENABLE_STATS - , - response_index_++ -#endif // TRITON_ENABLE_STATS + response_delegator_, response_cnt_ #ifdef TRITON_ENABLE_METRICS , infer_start_ns_ @@ -55,6 +51,7 @@ InferenceResponseFactory::CreateResponse( #ifdef TRITON_ENABLE_TRACING (*response)->SetTrace(trace_); #endif // TRITON_ENABLE_TRACING + response_cnt_++; return Status::Success; } @@ -81,11 +78,8 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator -#ifdef TRITON_ENABLE_STATS - , - uint64_t index -#endif // TRITON_ENABLE_STATS + void(std::unique_ptr&&, const uint32_t)>& delegator, + uint64_t seq_num #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -93,10 +87,7 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), -#ifdef TRITON_ENABLE_STATS - index_(index), -#endif // TRITON_ENABLE_STATS + response_delegator_(delegator), seq_num_(seq_num), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS @@ -117,10 +108,7 @@ InferenceResponse::InferenceResponse( InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp) - : response_fn_(response_fn), response_userp_(response_userp), -#ifdef TRITON_ENABLE_STATS - index_(0), -#endif // TRITON_ENABLE_STATS + : response_fn_(response_fn), response_userp_(response_userp), seq_num_(0), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(0), #endif // TRITON_ENABLE_METRICS @@ -321,7 +309,7 @@ InferenceResponse::TraceOutputTensors( void InferenceResponse::UpdateResponseMetrics() const { - if 
(model_ != nullptr && index_ == 0) { + if (model_ != nullptr && seq_num_ == 0) { auto first_response_ns = std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) diff --git a/src/infer_response.h b/src/infer_response.h index ca2a9d4db..88b158ab1 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -60,10 +60,10 @@ class InferenceResponseFactory { : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), response_delegator_(delegator), - is_cancelled_(false) + is_cancelled_(false), response_cnt_(0) #ifdef TRITON_ENABLE_STATS , - response_index_(0) + response_stats_index_(0) #endif // TRITON_ENABLE_STATS { #ifdef TRITON_ENABLE_METRICS @@ -104,8 +104,8 @@ class InferenceResponseFactory { #endif // TRITON_ENABLE_TRACING #ifdef TRITON_ENABLE_STATS - // Return the current response index. - uint64_t GetResponseIndex() { return response_index_; }; + // Return the current response statistics index and increment it. + uint64_t GetAndIncrementResponseIndex() { return response_stats_index_++; }; #endif // TRITON_ENABLE_STATS private: @@ -139,6 +139,9 @@ class InferenceResponseFactory { std::atomic is_cancelled_; + // The number of responses created by this factory. + std::atomic response_cnt_; + #ifdef TRITON_ENABLE_METRICS // The start time of associate request in ns. uint64_t infer_start_ns_; @@ -151,7 +154,7 @@ class InferenceResponseFactory { #ifdef TRITON_ENABLE_STATS // Number of response statistics reported. - std::atomic response_index_; + std::atomic response_stats_index_; #endif // TRITON_ENABLE_STATS }; @@ -256,12 +259,9 @@ class InferenceResponse { const ResponseAllocator* allocator, void* alloc_userp, TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, - const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator -#ifdef TRITON_ENABLE_STATS - , - uint64_t index -#endif // TRITON_ENABLE_STATS + const std::function&&, const uint32_t)>& delegator, + uint64_t seq_num #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -382,10 +382,7 @@ class InferenceResponse { std::function&&, const uint32_t)> response_delegator_; -#ifdef TRITON_ENABLE_STATS - const uint64_t index_; -#endif // TRITON_ENABLE_STATS - + const uint64_t seq_num_; #ifdef TRITON_ENABLE_METRICS const uint64_t infer_start_ns_; #endif // TRITON_ENABLE_METRICS diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 801bcef32..00a1826d8 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -46,11 +46,7 @@ InferenceResponseFactory::CreateResponse( { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_ -#ifdef TRITON_ENABLE_STATS - , - response_index_ -#endif // TRITON_ENABLE_STATS + response_delegator_, response_cnt_ #ifdef TRITON_ENABLE_METRICS , infer_start_ns_ @@ -190,11 +186,8 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator -#ifdef TRITON_ENABLE_STATS - , - uint64_t index -#endif // TRITON_ENABLE_STATS + void(std::unique_ptr&&, const uint32_t)>& delegator, + uint64_t seq_num #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -202,10 +195,7 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), 
response_userp_(response_userp), - response_delegator_(delegator), -#ifdef TRITON_ENABLE_STATS - index_(index), -#endif // TRITON_ENABLE_STATS + response_delegator_(delegator), seq_num_(seq_num), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS From 51518d66961d708b4d51a3efed8458a03cbc7e34 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Tue, 15 Oct 2024 14:22:51 -0700 Subject: [PATCH 05/10] Update variable namings --- src/infer_response.cc | 8 ++++---- src/infer_response.h | 4 ++-- src/test/response_cache_test.cc | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index 3c2b39e17..71c70b26d 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -79,7 +79,7 @@ InferenceResponse::InferenceResponse( void* response_userp, const std::function< void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_num + uint64_t seq_idx #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -87,7 +87,7 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), seq_num_(seq_num), + response_delegator_(delegator), seq_idx_(seq_idx), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS @@ -108,7 +108,7 @@ InferenceResponse::InferenceResponse( InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp) - : response_fn_(response_fn), response_userp_(response_userp), seq_num_(0), + : response_fn_(response_fn), response_userp_(response_userp), seq_idx_(0), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(0), #endif // TRITON_ENABLE_METRICS @@ -309,7 +309,7 @@ InferenceResponse::TraceOutputTensors( void InferenceResponse::UpdateResponseMetrics() const { - if (model_ != nullptr && seq_num_ == 0) { + if (model_ != nullptr && seq_idx_ == 0) { auto first_response_ns = std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) diff --git a/src/infer_response.h b/src/infer_response.h index 88b158ab1..b19eb9a8f 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -261,7 +261,7 @@ class InferenceResponse { void* response_userp, const std::function&&, const uint32_t)>& delegator, - uint64_t seq_num + uint64_t seq_idx #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -382,7 +382,7 @@ class InferenceResponse { std::function&&, const uint32_t)> response_delegator_; - const uint64_t seq_num_; + const uint64_t seq_idx_; #ifdef TRITON_ENABLE_METRICS const uint64_t infer_start_ns_; #endif // TRITON_ENABLE_METRICS diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 00a1826d8..166989d46 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -187,7 +187,7 @@ InferenceResponse::InferenceResponse( void* response_userp, const std::function< void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_num + uint64_t seq_idx #ifdef TRITON_ENABLE_METRICS , uint64_t infer_start_ns @@ -195,7 +195,7 @@ InferenceResponse::InferenceResponse( ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), seq_num_(seq_num), + response_delegator_(delegator), seq_idx_(seq_idx), #ifdef TRITON_ENABLE_METRICS infer_start_ns_(infer_start_ns), #endif // 
TRITON_ENABLE_METRICS From da632d1918c41ac4384010be462c325a1d69b519 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Wed, 16 Oct 2024 13:54:30 -0700 Subject: [PATCH 06/10] Fix the logic that finds the first response. --- src/infer_response.cc | 33 ++++++++++++++++----------------- src/infer_response.h | 26 ++++++++++++++++---------- src/test/response_cache_test.cc | 15 +++++++-------- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index 71c70b26d..092338d97 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -38,20 +38,19 @@ namespace triton { namespace core { // Status InferenceResponseFactory::CreateResponse( - std::unique_ptr* response) + std::unique_ptr* response) const { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_, response_cnt_ + response_delegator_ #ifdef TRITON_ENABLE_METRICS , - infer_start_ns_ + responses_sent_, infer_start_ns_ #endif // TRITON_ENABLE_METRICS )); #ifdef TRITON_ENABLE_TRACING (*response)->SetTrace(trace_); #endif // TRITON_ENABLE_TRACING - response_cnt_++; return Status::Success; } @@ -78,18 +77,18 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_idx + void(std::unique_ptr&&, const uint32_t)>& delegator #ifdef TRITON_ENABLE_METRICS , + std::shared_ptr> responses_sent, uint64_t infer_start_ns #endif // TRITON_ENABLE_METRICS ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), seq_idx_(seq_idx), + response_delegator_(delegator), #ifdef TRITON_ENABLE_METRICS - infer_start_ns_(infer_start_ns), + responses_sent_(responses_sent), infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS null_response_(false) { @@ -108,9 +107,9 @@ InferenceResponse::InferenceResponse( InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp) - : response_fn_(response_fn), response_userp_(response_userp), seq_idx_(0), + : response_fn_(response_fn), response_userp_(response_userp), #ifdef TRITON_ENABLE_METRICS - infer_start_ns_(0), + responses_sent_(nullptr), infer_start_ns_(0), #endif // TRITON_ENABLE_METRICS null_response_(true) { @@ -309,15 +308,15 @@ InferenceResponse::TraceOutputTensors( void InferenceResponse::UpdateResponseMetrics() const { - if (model_ != nullptr && seq_idx_ == 0) { - auto first_response_ns = - std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()) - .count(); + // Report inference to first response duration. 
+ if (model_ != nullptr && responses_sent_ != nullptr && + responses_sent_->fetch_add(1, std::memory_order_relaxed) == 0) { + auto now_ns = std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); if (auto reporter = model_->MetricReporter()) { reporter->ObserveHistogram( - "first_response_histogram", - (first_response_ns - infer_start_ns_) / 1000000); + "first_response_histogram", (now_ns - infer_start_ns_) / 1000000); } } } diff --git a/src/infer_response.h b/src/infer_response.h index b19eb9a8f..281af4740 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -60,7 +60,11 @@ class InferenceResponseFactory { : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), response_delegator_(delegator), - is_cancelled_(false), response_cnt_(0) + is_cancelled_(false) +#ifdef TRITON_ENABLE_METRICS + , + responses_sent_(0) +#endif // TRITON_ENABLE_METRICS #ifdef TRITON_ENABLE_STATS , response_stats_index_(0) @@ -89,7 +93,7 @@ class InferenceResponseFactory { } // Create a new response. - Status CreateResponse(std::unique_ptr* response); + Status CreateResponse(std::unique_ptr* response) const; // Send a "null" response with 'flags'. Status SendFlags(const uint32_t flags) const; @@ -139,10 +143,10 @@ class InferenceResponseFactory { std::atomic is_cancelled_; - // The number of responses created by this factory. - std::atomic response_cnt_; - #ifdef TRITON_ENABLE_METRICS + // Total number of responses sent created by this response factory. + std::shared_ptr> responses_sent_; + // The start time of associate request in ns. uint64_t infer_start_ns_; #endif // TRITON_ENABLE_METRICS @@ -259,11 +263,11 @@ class InferenceResponse { const ResponseAllocator* allocator, void* alloc_userp, TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, - const std::function&&, const uint32_t)>& delegator, - uint64_t seq_idx + const std::function< + void(std::unique_ptr&&, const uint32_t)>& delegator #ifdef TRITON_ENABLE_METRICS , + std::shared_ptr> responses_sent_, uint64_t infer_start_ns #endif // TRITON_ENABLE_METRICS ); @@ -343,7 +347,6 @@ class InferenceResponse { TRITONSERVER_InferenceTraceActivity activity, const std::string& msg); #endif // TRITON_ENABLE_TRACING - #ifdef TRITON_ENABLE_METRICS void UpdateResponseMetrics() const; #endif // TRITON_ENABLE_METRICS @@ -382,8 +385,11 @@ class InferenceResponse { std::function&&, const uint32_t)> response_delegator_; - const uint64_t seq_idx_; #ifdef TRITON_ENABLE_METRICS + // Total number of responses sent created by its response factory. + std::shared_ptr> responses_sent_; + + // The start time of associate request in ns. 
const uint64_t infer_start_ns_; #endif // TRITON_ENABLE_METRICS diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc index 166989d46..6d7d35db7 100644 --- a/src/test/response_cache_test.cc +++ b/src/test/response_cache_test.cc @@ -42,14 +42,14 @@ namespace triton { namespace core { // Status InferenceResponseFactory::CreateResponse( - std::unique_ptr* response) + std::unique_ptr* response) const { response->reset(new InferenceResponse( model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_, - response_delegator_, response_cnt_ + response_delegator_ #ifdef TRITON_ENABLE_METRICS , - infer_start_ns_ + responses_sent_, infer_start_ns_ #endif // TRITON_ENABLE_METRICS )); @@ -186,18 +186,17 @@ InferenceResponse::InferenceResponse( TRITONSERVER_InferenceResponseCompleteFn_t response_fn, void* response_userp, const std::function< - void(std::unique_ptr&&, const uint32_t)>& delegator, - uint64_t seq_idx + void(std::unique_ptr&&, const uint32_t)>& delegator #ifdef TRITON_ENABLE_METRICS , - uint64_t infer_start_ns + uint64_t responses_sent, uint64_t infer_start_ns #endif // TRITON_ENABLE_METRICS ) : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp), response_fn_(response_fn), response_userp_(response_userp), - response_delegator_(delegator), seq_idx_(seq_idx), + response_delegator_(delegator), #ifdef TRITON_ENABLE_METRICS - infer_start_ns_(infer_start_ns), + responses_sent_(responses_sent), infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS null_response_(false) { From 7f0612c8984c58459394acdc33a03bc07b4431cc Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 17 Oct 2024 11:01:21 -0700 Subject: [PATCH 07/10] Fix incorrect initialization of shared_ptr --- src/infer_response.cc | 3 ++- src/infer_response.h | 4 ++-- src/metric_model_reporter.h | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index 092338d97..1a3f85175 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -88,7 +88,8 @@ InferenceResponse::InferenceResponse( response_fn_(response_fn), response_userp_(response_userp), response_delegator_(delegator), #ifdef TRITON_ENABLE_METRICS - responses_sent_(responses_sent), infer_start_ns_(infer_start_ns), + responses_sent_(std::move(responses_sent)), + infer_start_ns_(infer_start_ns), #endif // TRITON_ENABLE_METRICS null_response_(false) { diff --git a/src/infer_response.h b/src/infer_response.h index 281af4740..8d09285ee 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -63,7 +63,7 @@ class InferenceResponseFactory { is_cancelled_(false) #ifdef TRITON_ENABLE_METRICS , - responses_sent_(0) + responses_sent_(std::make_shared>(0)) #endif // TRITON_ENABLE_METRICS #ifdef TRITON_ENABLE_STATS , @@ -387,7 +387,7 @@ class InferenceResponse { #ifdef TRITON_ENABLE_METRICS // Total number of responses sent created by its response factory. - std::shared_ptr> responses_sent_; + const std::shared_ptr> responses_sent_; // The start time of associate request in ns. const uint64_t infer_start_ns_; diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index 5ab9f0201..faeb5f399 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -57,11 +57,13 @@ struct MetricReporterConfig { bool latency_histograms_enabled_ = true; // Create and use Summaries for per-model latency related metrics bool latency_summaries_enabled_ = false; + // Buckets used for any histogram metrics. 
Each value represents + // a bucket boundary. + prometheus::Histogram::BucketBoundaries buckets_ = {100, 500, 2000, 5000}; // Quantiles used for any summary metrics. Each pair of values represents // { quantile, error }. For example, {0.90, 0.01} means to compute the // 90th percentile with 1% error on either side, so the approximate 90th // percentile value will be between the 89th and 91st percentiles. - prometheus::Histogram::BucketBoundaries buckets_ = {10, 100, 500, 1000}; prometheus::Summary::Quantiles quantiles_ = { {0.5, 0.05}, {0.9, 0.01}, {0.95, 0.001}, {0.99, 0.001}, {0.999, 0.001}}; From 72d99d7b9901df342cef1edaf3e4389bd67209ad Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 17 Oct 2024 11:06:31 -0700 Subject: [PATCH 08/10] Disable histograms by default --- src/metric_model_reporter.cc | 4 ++-- src/metric_model_reporter.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc index 75d2e87a5..93de30796 100644 --- a/src/metric_model_reporter.cc +++ b/src/metric_model_reporter.cc @@ -54,8 +54,8 @@ MetricReporterConfig::ParseConfig( latency_counters_enabled_ = false; } - if (pair.first == "histogram_latencies" && pair.second == "false") { - latency_histograms_enabled_ = false; + if (pair.first == "histogram_latencies" && pair.second == "true") { + latency_histograms_enabled_ = true; } if (pair.first == "summary_latencies" && pair.second == "true") { diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index faeb5f399..6a1c148ce 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -54,7 +54,7 @@ struct MetricReporterConfig { // Create and use Counters for per-model latency related metrics bool latency_counters_enabled_ = true; // Create and use Histograms for per-model latency related metrics - bool latency_histograms_enabled_ = true; + bool latency_histograms_enabled_ = false; // Create and use Summaries for per-model latency related metrics bool latency_summaries_enabled_ = false; // Buckets used for any histogram metrics. Each value represents From b6b5af994d970f57e22c4db6d7834c15030499f2 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 17 Oct 2024 11:49:18 -0700 Subject: [PATCH 09/10] Minor fixes --- src/infer_response.h | 2 +- src/metric_model_reporter.h | 3 ++- src/model.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/infer_response.h b/src/infer_response.h index 8d09285ee..700d14a3c 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -267,7 +267,7 @@ class InferenceResponse { void(std::unique_ptr&&, const uint32_t)>& delegator #ifdef TRITON_ENABLE_METRICS , - std::shared_ptr> responses_sent_, + std::shared_ptr> responses_sent, uint64_t infer_start_ns #endif // TRITON_ENABLE_METRICS ); diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index 6a1c148ce..0a3018125 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -58,7 +58,8 @@ struct MetricReporterConfig { // Create and use Summaries for per-model latency related metrics bool latency_summaries_enabled_ = false; // Buckets used for any histogram metrics. Each value represents - // a bucket boundary. + // a bucket boundary. For example, {100, 500, 2000, 5000} are latencies + // in milliseconds in first_response_histogram. prometheus::Histogram::BucketBoundaries buckets_ = {100, 500, 2000, 5000}; // Quantiles used for any summary metrics. Each pair of values represents // { quantile, error }. 
For example, {0.90, 0.01} means to compute the diff --git a/src/model.h b/src/model.h index 4781020af..2acf63b11 100644 --- a/src/model.h +++ b/src/model.h @@ -148,6 +148,7 @@ class Model { return config_.response_cache().enable(); } + // Get whether the model is decoupled. bool IsDecoupled() const { return config_.model_transaction_policy().decoupled(); From fb87d2a80e42cb2881f8162c0e98da09cdf51c47 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 17 Oct 2024 16:14:04 -0700 Subject: [PATCH 10/10] Minor update --- src/infer_response.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index 1a3f85175..498036cde 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -317,7 +317,8 @@ InferenceResponse::UpdateResponseMetrics() const .count(); if (auto reporter = model_->MetricReporter()) { reporter->ObserveHistogram( - "first_response_histogram", (now_ns - infer_start_ns_) / 1000000); + "first_response_histogram", + (now_ns - infer_start_ns_) / NANOS_PER_MILLIS); } } }
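Illustrative usage sketch (not part of the patches above): the series registers a per-model nv_inference_first_response_histogram_ms family, creates the histogram only for decoupled models when the metrics config key "histogram_latencies" is set to "true" (patch 08), and observes the request-to-first-response gap in milliseconds from InferenceResponse::Send via UpdateResponseMetrics. The minimal, standalone prometheus-cpp program below mirrors that flow outside of Triton; the local registry, the "model"/"version" label values, and the timing placeholders are assumptions for illustration only, while the bucket boundaries {100, 500, 2000, 5000} are the defaults from MetricReporterConfig in patch 07.

    #include <chrono>
    #include <cstdint>
    #include <memory>
    #include <prometheus/histogram.h>
    #include <prometheus/registry.h>

    int main() {
      // Stand-in for Triton's singleton metrics registry (assumption for this sketch).
      auto registry = std::make_shared<prometheus::Registry>();

      // Family declaration analogous to inf_first_response_histogram_ms_family_.
      auto& family =
          prometheus::BuildHistogram()
              .Name("nv_inference_first_response_histogram_ms")
              .Help("Duration from request to first response in milliseconds")
              .Register(*registry);

      // Per-model metric analogous to histograms_["first_response_histogram"],
      // using the default bucket boundaries from MetricReporterConfig.
      auto& first_response_hist = family.Add(
          {{"model", "example_model"}, {"version", "1"}},
          prometheus::Histogram::BucketBoundaries{100, 500, 2000, 5000});

      // Analogous to the response factory constructor: capture the request
      // start time in nanoseconds when inference begins.
      const uint64_t infer_start_ns =
          std::chrono::duration_cast<std::chrono::nanoseconds>(
              std::chrono::steady_clock::now().time_since_epoch())
              .count();

      // ... model executes and produces its first response here ...

      // Analogous to UpdateResponseMetrics(): on the first response only,
      // observe the request-to-first-response duration converted to ms.
      const uint64_t first_response_ns =
          std::chrono::duration_cast<std::chrono::nanoseconds>(
              std::chrono::steady_clock::now().time_since_epoch())
              .count();
      first_response_hist.Observe((first_response_ns - infer_start_ns) / 1e6);
      return 0;
    }

On a server built with these patches, the histogram would presumably be enabled through the parsed metrics configuration (a histogram_latencies=true setting) and exported for each decoupled model as standard Prometheus _bucket/_sum/_count series.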