diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc
index 1aa8a9c48..87b602c82 100644
--- a/src/backend_model_instance.cc
+++ b/src/backend_model_instance.cc
@@ -192,7 +192,7 @@ TritonModelInstance::TritonModelInstance(
         model_->Server()->ResponseCacheEnabled();
     MetricModelReporter::Create(
         model_->ModelId(), model_->Version(), id, response_cache_enabled,
-        model_->Config().metric_tags(), &reporter_);
+        model_->IsDecoupled(), model_->Config().metric_tags(), &reporter_);
   }
 #endif  // TRITON_ENABLE_METRICS
 }
diff --git a/src/ensemble_scheduler/ensemble_scheduler.cc b/src/ensemble_scheduler/ensemble_scheduler.cc
index 609c1ff28..64f118144 100644
--- a/src/ensemble_scheduler/ensemble_scheduler.cc
+++ b/src/ensemble_scheduler/ensemble_scheduler.cc
@@ -1470,12 +1470,13 @@ EnsembleScheduler::EnsembleScheduler(
   }
 #endif  // TRITON_ENABLE_GPU
 
+  const bool is_decoupled = config.model_transaction_policy().decoupled();
 #ifdef TRITON_ENABLE_METRICS
   if (Metrics::Enabled()) {
     // Ensemble scheduler doesn't currently support response cache at top level.
     MetricModelReporter::Create(
         model_id, 1 /* model_version */, METRIC_REPORTER_ID_CPU,
-        false /* response_cache_enabled */, config.metric_tags(),
+        false /* response_cache_enabled */, is_decoupled, config.metric_tags(),
         &metric_reporter_);
   }
 #endif  // TRITON_ENABLE_METRICS
@@ -1486,7 +1487,7 @@ EnsembleScheduler::EnsembleScheduler(
   info_->ensemble_name_ = config.name();
 
   // This config field is filled internally for ensemble models
-  info_->is_decoupled_ = config.model_transaction_policy().decoupled();
+  info_->is_decoupled_ = is_decoupled;
 
   // field to check if response cache enabled in the ensemble model config.
   info_->is_cache_enabled_ =
diff --git a/src/infer_response.cc b/src/infer_response.cc
index 6eec9d1a3..498036cde 100644
--- a/src/infer_response.cc
+++ b/src/infer_response.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -42,7 +42,12 @@ InferenceResponseFactory::CreateResponse(
 {
   response->reset(new InferenceResponse(
       model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_,
-      response_delegator_));
+      response_delegator_
+#ifdef TRITON_ENABLE_METRICS
+      ,
+      responses_sent_, infer_start_ns_
+#endif  // TRITON_ENABLE_METRICS
+      ));
 #ifdef TRITON_ENABLE_TRACING
   (*response)->SetTrace(trace_);
 #endif  // TRITON_ENABLE_TRACING
@@ -72,10 +77,21 @@ InferenceResponse::InferenceResponse(
     TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
     void* response_userp,
     const std::function<
-        void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator)
+        void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator
+#ifdef TRITON_ENABLE_METRICS
+    ,
+    std::shared_ptr<std::atomic<uint64_t>> responses_sent,
+    uint64_t infer_start_ns
+#endif  // TRITON_ENABLE_METRICS
+    )
     : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp),
       response_fn_(response_fn), response_userp_(response_userp),
-      response_delegator_(delegator), null_response_(false)
+      response_delegator_(delegator),
+#ifdef TRITON_ENABLE_METRICS
+      responses_sent_(std::move(responses_sent)),
+      infer_start_ns_(infer_start_ns),
+#endif  // TRITON_ENABLE_METRICS
+      null_response_(false)
 {
   // If the allocator has a start_fn then invoke it.
   TRITONSERVER_ResponseAllocatorStartFn_t start_fn = allocator_->StartFn();
@@ -93,6 +109,9 @@ InferenceResponse::InferenceResponse(
     TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
     void* response_userp)
     : response_fn_(response_fn), response_userp_(response_userp),
+#ifdef TRITON_ENABLE_METRICS
+      responses_sent_(nullptr), infer_start_ns_(0),
+#endif  // TRITON_ENABLE_METRICS
       null_response_(true)
 {
 }
@@ -214,6 +233,10 @@ InferenceResponse::Send(
       TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT, "InferenceResponse Send");
 #endif  // TRITON_ENABLE_TRACING
 
+#ifdef TRITON_ENABLE_METRICS
+  response->UpdateResponseMetrics();
+#endif  // TRITON_ENABLE_METRICS
+
   if (response->response_delegator_ != nullptr) {
     auto ldelegator = std::move(response->response_delegator_);
     ldelegator(std::move(response), flags);
@@ -282,6 +305,25 @@ InferenceResponse::TraceOutputTensors(
 }
 #endif  // TRITON_ENABLE_TRACING
 
+#ifdef TRITON_ENABLE_METRICS
+void
+InferenceResponse::UpdateResponseMetrics() const
+{
+  // Report inference-to-first-response duration.
+  if (model_ != nullptr && responses_sent_ != nullptr &&
+      responses_sent_->fetch_add(1, std::memory_order_relaxed) == 0) {
+    auto now_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                      std::chrono::steady_clock::now().time_since_epoch())
+                      .count();
+    if (auto reporter = model_->MetricReporter()) {
+      reporter->ObserveHistogram(
+          "first_response_histogram",
+          (now_ns - infer_start_ns_) / NANOS_PER_MILLIS);
+    }
+  }
+}
+#endif  // TRITON_ENABLE_METRICS
+
 //
 // InferenceResponse::Output
 //
diff --git a/src/infer_response.h b/src/infer_response.h
index 612f8c1fe..700d14a3c 100644
--- a/src/infer_response.h
+++ b/src/infer_response.h
@@ -61,11 +61,20 @@ class InferenceResponseFactory {
         alloc_userp_(alloc_userp), response_fn_(response_fn),
         response_userp_(response_userp), response_delegator_(delegator),
         is_cancelled_(false)
+#ifdef TRITON_ENABLE_METRICS
+        ,
+        responses_sent_(std::make_shared<std::atomic<uint64_t>>(0))
+#endif  // TRITON_ENABLE_METRICS
 #ifdef TRITON_ENABLE_STATS
         ,
         response_stats_index_(0)
 #endif  // TRITON_ENABLE_STATS
   {
+#ifdef TRITON_ENABLE_METRICS
+    infer_start_ns_ = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                          std::chrono::steady_clock::now().time_since_epoch())
+                          .count();
+#endif  // TRITON_ENABLE_METRICS
   }
 
   void Cancel() { is_cancelled_ = true; }
@@ -134,6 +143,14 @@ class InferenceResponseFactory {
 
   std::atomic<bool> is_cancelled_;
 
+#ifdef TRITON_ENABLE_METRICS
+  // Total number of responses sent that were created by this response factory.
+  std::shared_ptr<std::atomic<uint64_t>> responses_sent_;
+
+  // The start time of the associated request in ns.
+  uint64_t infer_start_ns_;
+#endif  // TRITON_ENABLE_METRICS
+
 #ifdef TRITON_ENABLE_TRACING
   // Inference trace associated with this response.
   std::shared_ptr<InferenceTrace> trace_;
@@ -246,8 +263,14 @@ class InferenceResponse {
       const ResponseAllocator* allocator, void* alloc_userp,
       TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
       void* response_userp,
-      const std::function<void(
-          std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator);
+      const std::function<
+          void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator
+#ifdef TRITON_ENABLE_METRICS
+      ,
+      std::shared_ptr<std::atomic<uint64_t>> responses_sent,
+      uint64_t infer_start_ns
+#endif  // TRITON_ENABLE_METRICS
+      );
 
   // "null" InferenceResponse is a special instance of InferenceResponse which
   // contains minimal information for calling InferenceResponse::Send,
@@ -324,6 +347,10 @@ class InferenceResponse {
       TRITONSERVER_InferenceTraceActivity activity, const std::string& msg);
 #endif  // TRITON_ENABLE_TRACING
 
+#ifdef TRITON_ENABLE_METRICS
+  void UpdateResponseMetrics() const;
+#endif  // TRITON_ENABLE_METRICS
+
   // The model associated with this factory. For normal
   // requests/responses this will always be defined and acts to keep
   // the model loaded as long as this factory is live. It may be
@@ -358,6 +385,14 @@ class InferenceResponse {
   std::function<void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>
       response_delegator_;
 
+#ifdef TRITON_ENABLE_METRICS
+  // Total number of responses sent that were created by this response's
+  // factory.
+  const std::shared_ptr<std::atomic<uint64_t>> responses_sent_;
+
+  // The start time of the associated request in ns.
+  const uint64_t infer_start_ns_;
+#endif  // TRITON_ENABLE_METRICS
+
   bool null_response_;
 
 #ifdef TRITON_ENABLE_TRACING
diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc
index 71d3eca94..a10db8f64 100644
--- a/src/metric_model_reporter.cc
+++ b/src/metric_model_reporter.cc
@@ -41,7 +41,8 @@ namespace triton { namespace core {
 // MetricReporterConfig
 //
 void
-MetricReporterConfig::ParseConfig(bool response_cache_enabled)
+MetricReporterConfig::ParseConfig(
+    bool response_cache_enabled, bool is_decoupled)
 {
   // Global config only for now in config map
   auto metrics_config_map = Metrics::ConfigMap();
@@ -53,6 +54,10 @@ MetricReporterConfig::ParseConfig(bool response_cache_enabled)
       latency_counters_enabled_ = false;
     }
 
+    if (pair.first == "histogram_latencies" && pair.second == "true") {
+      latency_histograms_enabled_ = true;
+    }
+
     if (pair.first == "summary_latencies" && pair.second == "true") {
       latency_summaries_enabled_ = true;
     }
@@ -68,6 +73,7 @@ MetricReporterConfig::ParseConfig(bool response_cache_enabled)
 
   // Set flag to signal to stats aggregator if caching is enabled or not
   cache_enabled_ = response_cache_enabled;
+  is_decoupled_ = is_decoupled;
 }
 
 prometheus::Summary::Quantiles
@@ -112,7 +118,7 @@ const std::map<FailureReason, std::string>
 Status
 MetricModelReporter::Create(
     const ModelIdentifier& model_id, const int64_t model_version,
-    const int device, bool response_cache_enabled,
+    const int device, bool response_cache_enabled, bool is_decoupled,
     const triton::common::MetricTagsMap& model_tags,
     std::shared_ptr<MetricModelReporter>* metric_model_reporter)
 {
@@ -141,25 +147,27 @@ MetricModelReporter::Create(
   }
 
   metric_model_reporter->reset(new MetricModelReporter(
-      model_id, model_version, device, response_cache_enabled, model_tags));
+      model_id, model_version, device, response_cache_enabled, is_decoupled,
+      model_tags));
   reporter_map.insert({hash_labels, *metric_model_reporter});
   return Status::Success;
 }
 
 MetricModelReporter::MetricModelReporter(
     const ModelIdentifier& model_id, const int64_t model_version,
-    const int device, bool response_cache_enabled,
+    const int device, bool response_cache_enabled, bool is_decoupled,
     const triton::common::MetricTagsMap& model_tags)
 {
   std::map<std::string, std::string> labels;
   GetMetricLabels(&labels, model_id, model_version, device, model_tags);
 
   // Parse metrics config to control metric setup and behavior
-  config_.ParseConfig(response_cache_enabled);
+  config_.ParseConfig(response_cache_enabled, is_decoupled);
 
   // Initialize families and metrics
   InitializeCounters(labels);
   InitializeGauges(labels);
+  InitializeHistograms(labels);
   InitializeSummaries(labels);
 }
@@ -182,6 +190,14 @@ MetricModelReporter::~MetricModelReporter()
     }
   }
 
+  for (auto& iter : histogram_families_) {
+    const auto& name = iter.first;
+    auto family_ptr = iter.second;
+    if (family_ptr) {
+      family_ptr->Remove(histograms_[name]);
+    }
+  }
+
   for (auto& iter : summary_families_) {
     const auto& name = iter.first;
     auto family_ptr = iter.second;
@@ -262,6 +278,28 @@ MetricModelReporter::InitializeGauges(
   }
 }
 
+void
+MetricModelReporter::InitializeHistograms(
+    const std::map<std::string, std::string>& labels)
+{
+  // Only create response metrics for decoupled models to reduce metric output
+  if (config_.latency_histograms_enabled_) {
+    if (config_.is_decoupled_) {
+      histogram_families_["first_response_histogram"] =
+          &Metrics::FamilyFirstResponseDuration();
+    }
+  }
+
+  for (auto& iter : histogram_families_) {
+    const auto& name = iter.first;
+    auto family_ptr = iter.second;
+    if (family_ptr) {
+      histograms_[name] = CreateMetric<prometheus::Histogram>(
+          *family_ptr, labels, config_.buckets_);
+    }
+  }
+}
+
 void
 MetricModelReporter::InitializeSummaries(
     const std::map<std::string, std::string>& labels)
@@ -408,6 +446,23 @@ MetricModelReporter::DecrementGauge(const std::string& name, double value)
   IncrementGauge(name, -1 * value);
 }
 
+void
+MetricModelReporter::ObserveHistogram(const std::string& name, double value)
+{
+  auto iter = histograms_.find(name);
+  if (iter == histograms_.end()) {
+    // No histogram metric exists with this name
+    return;
+  }
+
+  auto histogram = iter->second;
+  if (!histogram) {
+    // Histogram is uninitialized/nullptr
+    return;
+  }
+  histogram->Observe(value);
+}
+
 void
 MetricModelReporter::ObserveSummary(const std::string& name, double value)
 {
diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h
index 6482cabb6..236bc8f5f 100644
--- a/src/metric_model_reporter.h
+++ b/src/metric_model_reporter.h
@@ -46,15 +46,21 @@ struct ModelIdentifier;
 struct MetricReporterConfig {
 #ifdef TRITON_ENABLE_METRICS
   // Parses Metrics::ConfigMap and sets fields if specified
-  void ParseConfig(bool response_cache_enabled);
+  void ParseConfig(bool response_cache_enabled, bool is_decoupled);
 
   // Parses pairs of quantiles "quantile1:error1, quantile2:error2, ..."
   // and overwrites quantiles_ field if successful.
   prometheus::Summary::Quantiles ParseQuantiles(std::string options);
 
   // Create and use Counters for per-model latency related metrics
   bool latency_counters_enabled_ = true;
+  // Create and use Histograms for per-model latency related metrics
+  bool latency_histograms_enabled_ = false;
   // Create and use Summaries for per-model latency related metrics
   bool latency_summaries_enabled_ = false;
+  // Buckets used for any histogram metrics. Each value represents
+  // a bucket boundary. For example, {100, 500, 2000, 5000} are latencies
+  // in milliseconds in first_response_histogram.
+  prometheus::Histogram::BucketBoundaries buckets_ = {100, 500, 2000, 5000};
 
   // Quantiles used for any summary metrics. Each pair of values represents
   // { quantile, error }. For example, {0.90, 0.01} means to compute the
   // 90th percentile with 1% error on either side, so the approximate 90th
@@ -65,6 +71,8 @@ struct MetricReporterConfig {
   // Whether this reporter's model has caching enabled or not.
   // This helps handle infer_stats aggregation for summaries on cache misses.
   bool cache_enabled_ = false;
+
+  bool is_decoupled_ = false;
 #endif  // TRITON_ENABLE_METRICS
 };
 
@@ -77,7 +85,7 @@ class MetricModelReporter {
   static Status Create(
       const triton::core::ModelIdentifier& model_id,
       const int64_t model_version, const int device,
-      bool response_cache_enabled,
+      bool response_cache_enabled, bool is_decoupled,
       const triton::common::MetricTagsMap& model_tags,
       std::shared_ptr<MetricModelReporter>* metric_model_reporter);
 
@@ -93,6 +101,8 @@ class MetricModelReporter {
   void IncrementGauge(const std::string& name, double value);
   // Decrease gauge by value.
   void DecrementGauge(const std::string& name, double value);
+  // Lookup histogram metric by name, and observe the value if it exists.
+  void ObserveHistogram(const std::string& name, double value);
   // Lookup summary metric by name, and observe the value if it exists.
   void ObserveSummary(const std::string& name, double value);
 
@@ -101,7 +111,7 @@ class MetricModelReporter {
  private:
   MetricModelReporter(
       const ModelIdentifier& model_id, const int64_t model_version,
-      const int device, bool response_cache_enabled,
+      const int device, bool response_cache_enabled, bool is_decoupled,
      const triton::common::MetricTagsMap& model_tags);
 
   static void GetMetricLabels(
@@ -116,6 +126,7 @@ class MetricModelReporter {
 
   void InitializeCounters(const std::map<std::string, std::string>& labels);
   void InitializeGauges(const std::map<std::string, std::string>& labels);
+  void InitializeHistograms(const std::map<std::string, std::string>& labels);
   void InitializeSummaries(const std::map<std::string, std::string>& labels);
 
   // Lookup gauge metric by name. Return gauge if found, nullptr otherwise.
@@ -127,12 +138,15 @@ class MetricModelReporter {
       counter_families_;
   std::unordered_map<std::string, prometheus::Family<prometheus::Gauge>*>
       gauge_families_;
+  std::unordered_map<std::string, prometheus::Family<prometheus::Histogram>*>
+      histogram_families_;
   std::unordered_map<std::string, prometheus::Family<prometheus::Summary>*>
       summary_families_;
 
   // Metrics
   std::unordered_map<std::string, prometheus::Counter*> counters_;
   std::unordered_map<std::string, prometheus::Gauge*> gauges_;
+  std::unordered_map<std::string, prometheus::Histogram*> histograms_;
   std::unordered_map<std::string, prometheus::Summary*> summaries_;
 
   // Config
diff --git a/src/metrics.cc b/src/metrics.cc
index 775c46b97..f9b141b73 100644
--- a/src/metrics.cc
+++ b/src/metrics.cc
@@ -1,4 +1,4 @@
-// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -109,6 +109,12 @@ Metrics::Metrics()
                 "execution per-model.")
             .Register(*registry_)),
 
+      inf_first_response_histogram_ms_family_(
+          prometheus::BuildHistogram()
+              .Name("nv_inference_first_response_histogram_ms")
+              .Help("Duration from request to first response in milliseconds")
+              .Register(*registry_)),
+
       model_load_time_family_(prometheus::BuildGauge()
                                   .Name("nv_model_load_duration_secs")
                                   .Help("Model load time in seconds")
diff --git a/src/metrics.h b/src/metrics.h
index 0ae516368..af983cdca 100644
--- a/src/metrics.h
+++ b/src/metrics.h
@@ -215,6 +215,12 @@ class Metrics {
     return GetSingleton()->inf_pending_request_count_family_;
   }
 
+  static prometheus::Family<prometheus::Histogram>&
+  FamilyFirstResponseDuration()
+  {
+    return GetSingleton()->inf_first_response_histogram_ms_family_;
+  }
+
   // Metric family of load time per model
   static prometheus::Family<prometheus::Gauge>& FamilyModelLoadTime()
   {
@@ -306,6 +312,8 @@ class Metrics {
   prometheus::Family<prometheus::Counter>&
       inf_compute_output_duration_us_family_;
   prometheus::Family<prometheus::Gauge>& inf_pending_request_count_family_;
+  prometheus::Family<prometheus::Histogram>&
+      inf_first_response_histogram_ms_family_;
   prometheus::Family<prometheus::Gauge>& model_load_time_family_;
 
   prometheus::Family<prometheus::Gauge>& pinned_memory_pool_total_family_;
diff --git a/src/model.cc b/src/model.cc
index 9da281ed5..37fb87de2 100644
--- a/src/model.cc
+++ b/src/model.cc
@@ -135,7 +135,7 @@ Model::Init(const bool is_config_provided)
 #ifdef TRITON_ENABLE_METRICS
   MetricModelReporter::Create(
       ModelId(), Version(), METRIC_REPORTER_ID_UTILITY, ResponseCacheEnabled(),
-      Config().metric_tags(), &reporter_);
+      IsDecoupled(), Config().metric_tags(), &reporter_);
 #endif  // TRITON_ENABLE_METRICS
 
   return Status::Success;
diff --git a/src/model.h b/src/model.h
index 286ca8db1..2acf63b11 100644
--- a/src/model.h
+++ b/src/model.h
@@ -148,6 +148,12 @@ class Model {
     return config_.response_cache().enable();
   }
 
+  // Get whether the model is decoupled.
+  bool IsDecoupled() const
+  {
+    return config_.model_transaction_policy().decoupled();
+  }
+
   // Get the number of required inputs
   size_t RequiredInputCount() const { return required_input_count_; }
 
diff --git a/src/test/response_cache_test.cc b/src/test/response_cache_test.cc
index 8ffb85bd6..6d7d35db7 100644
--- a/src/test/response_cache_test.cc
+++ b/src/test/response_cache_test.cc
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -46,7 +46,12 @@ InferenceResponseFactory::CreateResponse(
 {
   response->reset(new InferenceResponse(
       model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_,
-      response_delegator_));
+      response_delegator_
+#ifdef TRITON_ENABLE_METRICS
+      ,
+      responses_sent_, infer_start_ns_
+#endif  // TRITON_ENABLE_METRICS
+      ));
 
   return Status::Success;
 }
@@ -181,10 +186,19 @@ InferenceResponse::InferenceResponse(
     TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
     void* response_userp,
     const std::function<
-        void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator)
+        void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator
+#ifdef TRITON_ENABLE_METRICS
+    ,
+    std::shared_ptr<std::atomic<uint64_t>> responses_sent, uint64_t infer_start_ns
+#endif  // TRITON_ENABLE_METRICS
+    )
     : model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp),
       response_fn_(response_fn), response_userp_(response_userp),
-      response_delegator_(delegator), null_response_(false)
+      response_delegator_(delegator),
+#ifdef TRITON_ENABLE_METRICS
+      responses_sent_(responses_sent), infer_start_ns_(infer_start_ns),
+#endif  // TRITON_ENABLE_METRICS
+      null_response_(false)
 {
   // Skip allocator logic / references in unit test
 }
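
Note (reviewer sketch, not part of the patch): the new metric is only emitted when histogram latencies are enabled via the "histogram_latencies" key parsed in MetricReporterConfig::ParseConfig() and the model is decoupled; the metric itself is plain prometheus-cpp. A minimal, self-contained illustration of the BuildHistogram()/Add()/Observe() pattern the patch wires up follows; the label keys are illustrative placeholders, not the exact output of GetMetricLabels().

// Standalone sketch under the assumptions above; compiles against prometheus-cpp.
#include <prometheus/histogram.h>
#include <prometheus/registry.h>

#include <memory>

int main()
{
  auto registry = std::make_shared<prometheus::Registry>();

  // Equivalent of Metrics::FamilyFirstResponseDuration(): one family,
  // registered once per process.
  auto& family =
      prometheus::BuildHistogram()
          .Name("nv_inference_first_response_histogram_ms")
          .Help("Duration from request to first response in milliseconds")
          .Register(*registry);

  // Equivalent of InitializeHistograms(): one labeled histogram per model,
  // constructed with the default MetricReporterConfig::buckets_ boundaries.
  auto& first_response_ms = family.Add(
      {{"model", "example_model"}, {"version", "1"}},
      prometheus::Histogram::BucketBoundaries{100, 500, 2000, 5000});

  // Equivalent of ObserveHistogram("first_response_histogram", value), called
  // from InferenceResponse::UpdateResponseMetrics() when the first response of
  // a request is sent: record request-to-first-response latency in ms.
  first_response_ms.Observe(42.0);
  return 0;
}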