From 474ea93c0ae7aa26035a7458a740fc547ebb53b0 Mon Sep 17 00:00:00 2001
From: Matthew Kotila
Date: Fri, 21 Jul 2023 11:51:11 -0700
Subject: [PATCH] Calculate response throughput metric (#356)

* Calculate response throughput metric

* Address feedback

* Cleanup
---
 src/c++/library/common.h                      |   6 +-
 src/c++/library/grpc_client.cc                |  16 ++-
 src/c++/library/http_client.cc                |  12 ++
 .../client_backend/client_backend.h           |   9 +-
 .../client_backend/mock_client_backend.h      |   9 ++
 .../triton/triton_client_backend.cc           |   7 +
 .../triton/triton_client_backend.h            |   2 +
 src/c++/perf_analyzer/infer_context.cc        |  24 ++--
 src/c++/perf_analyzer/inference_profiler.cc   |  18 ++-
 src/c++/perf_analyzer/inference_profiler.h    |  23 ++--
 src/c++/perf_analyzer/load_manager.h          |   9 +-
 .../perf_analyzer/mock_inference_profiler.h   |  89 ++++++++++++-
 src/c++/perf_analyzer/mock_load_manager.h     |  37 ++++++
 src/c++/perf_analyzer/perf_utils.h            |   3 +-
 .../perf_analyzer/test_inference_profiler.cc  | 122 +++++++++++++++++-
 15 files changed, 355 insertions(+), 31 deletions(-)
 create mode 100644 src/c++/perf_analyzer/mock_load_manager.h

diff --git a/src/c++/library/common.h b/src/c++/library/common.h
index 1f0494000..ba98d82ca 100644
--- a/src/c++/library/common.h
+++ b/src/c++/library/common.h
@@ -513,10 +513,14 @@ class InferResult {
       const std::string& output_name, const uint8_t** buf,
       size_t* byte_size) const = 0;
 
-  /// Get final response bool of the request which generated this response.
+  /// Get final response bool for this response.
   /// \return Error object indicating the success or failure.
   virtual Error IsFinalResponse(bool* is_final_response) const = 0;
 
+  /// Get null response bool for this response.
+  /// \return Error object indicating the success or failure.
+  virtual Error IsNullResponse(bool* is_null_response) const = 0;
+
   /// Get the result data as a vector of strings. The vector will
   /// receive a copy of result data. An error will be generated if
   /// the datatype of output is not 'BYTES'.
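
Context for the API change above: in decoupled (streaming) mode a Triton model can close a response stream with a final response that carries no output tensors. The new IsNullResponse() accessor lets callers tell such empty stream-closing responses apart from real ones. Below is a sketch of how application code might use the two accessors together in a streaming callback; the callback and its registration are hypothetical, only the accessors come from this patch:

    void OnStreamResponse(triton::client::InferResult* result)
    {
      bool is_null{false};
      bool is_final{false};
      // Both accessors return an Error; check IsOk() before trusting the flags.
      if (result->IsNullResponse(&is_null).IsOk() &&
          result->IsFinalResponse(&is_final).IsOk()) {
        if (!is_null) {
          // Process output tensors here; null responses carry none.
        }
        if (is_final) {
          // No more responses will arrive for this request.
        }
      }
      delete result;
    }
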
diff --git a/src/c++/library/grpc_client.cc b/src/c++/library/grpc_client.cc
index eba18a33e..537608fba 100644
--- a/src/c++/library/grpc_client.cc
+++ b/src/c++/library/grpc_client.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -190,6 +190,7 @@ class InferResultGrpc : public InferResult {
       const std::string& output_name, const uint8_t** buf,
       size_t* byte_size) const override;
   Error IsFinalResponse(bool* is_final_response) const override;
+  Error IsNullResponse(bool* is_null_response) const override;
   Error StringData(
       const std::string& output_name,
       std::vector<std::string>* string_result) const override;
@@ -211,6 +212,7 @@ class InferResultGrpc : public InferResult {
   std::shared_ptr<inference::ModelStreamInferResponse> stream_response_;
   Error request_status_;
   bool is_final_response_{true};
+  bool is_null_response_{false};
 };
 
 Error
@@ -322,6 +324,16 @@ InferResultGrpc::IsFinalResponse(bool* is_final_response) const
   return Error::Success;
 }
 
+Error
+InferResultGrpc::IsNullResponse(bool* is_null_response) const
+{
+  if (is_null_response == nullptr) {
+    return Error("is_null_response cannot be nullptr");
+  }
+  *is_null_response = is_null_response_;
+  return Error::Success;
+}
+
 Error
 InferResultGrpc::StringData(
     const std::string& output_name,
@@ -384,6 +396,7 @@ InferResultGrpc::InferResultGrpc(
   if (is_final_response_itr != response_->parameters().end()) {
     is_final_response_ = is_final_response_itr->second.bool_param();
   }
+  is_null_response_ = response_->outputs().empty() && is_final_response_;
 }
 
 InferResultGrpc::InferResultGrpc(
@@ -409,6 +422,7 @@ InferResultGrpc::InferResultGrpc(
   if (is_final_response_itr != response_->parameters().end()) {
     is_final_response_ = is_final_response_itr->second.bool_param();
   }
+  is_null_response_ = response_->outputs().empty() && is_final_response_;
 }
 
 //==============================================================================
diff --git a/src/c++/library/http_client.cc b/src/c++/library/http_client.cc
index 4258ead12..eacf958da 100644
--- a/src/c++/library/http_client.cc
+++ b/src/c++/library/http_client.cc
@@ -740,6 +740,7 @@ class InferResultHttp : public InferResult {
       const std::string& output_name, const uint8_t** buf,
       size_t* byte_size) const override;
   Error IsFinalResponse(bool* is_final_response) const override;
+  Error IsNullResponse(bool* is_null_response) const override;
   Error StringData(
       const std::string& output_name,
       std::vector<std::string>* string_result) const override;
@@ -769,6 +770,7 @@ class InferResultHttp : public InferResult {
 
   bool binary_data_{true};
   bool is_final_response_{true};
+  bool is_null_response_{false};
 };
 
 void
@@ -951,6 +953,16 @@ InferResultHttp::IsFinalResponse(bool* is_final_response) const
   return Error::Success;
 }
 
+Error
+InferResultHttp::IsNullResponse(bool* is_null_response) const
+{
+  if (is_null_response == nullptr) {
+    return Error("is_null_response cannot be nullptr");
+  }
+  *is_null_response = is_null_response_;
+  return Error::Success;
+}
+
 Error
 InferResultHttp::StringData(
     const std::string& output_name,
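
Both transport implementations derive the flag the same way: a response is considered null exactly when it carries no output tensors and is marked final. The HTTP result only gains the member and accessor (the flag keeps its default of false), since null responses arise from decoupled streaming, which the HTTP path does not use. An illustrative restatement of the gRPC rule, assuming the inference.ModelInferResponse protobuf type:

    // A decoupled stream can deliver three kinds of responses:
    //   outputs non-empty, final == false  -> intermediate response
    //   outputs non-empty, final == true   -> last real response
    //   outputs empty,     final == true   -> "null" stream-closing response
    bool IsNull(const inference::ModelInferResponse& response, bool is_final)
    {
      return response.outputs().empty() && is_final;  // mirrors the patch
    }
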
diff --git a/src/c++/perf_analyzer/client_backend/client_backend.h b/src/c++/perf_analyzer/client_backend/client_backend.h
index 4903da61d..c351dfe61 100644
--- a/src/c++/perf_analyzer/client_backend/client_backend.h
+++ b/src/c++/perf_analyzer/client_backend/client_backend.h
@@ -617,12 +617,19 @@ class InferResult {
       const std::string& output_name, const uint8_t** buf,
       size_t* byte_size) const = 0;
 
-  /// Get final response bool of the request which generated this response.
+  /// Get final response bool for this response.
   /// \return Error object indicating the success or failure.
   virtual Error IsFinalResponse(bool* is_final_response) const
   {
     return Error("InferResult::IsFinalResponse() not implemented");
   };
+
+  /// Get null response bool for this response.
+  /// \return Error object indicating the success or failure.
+  virtual Error IsNullResponse(bool* is_null_response) const
+  {
+    return Error("InferResult::IsNullResponse() not implemented");
+  };
 };
 
 }}}  // namespace triton::perfanalyzer::clientbackend
diff --git a/src/c++/perf_analyzer/client_backend/mock_client_backend.h b/src/c++/perf_analyzer/client_backend/mock_client_backend.h
index 28a568eb0..ddc14f663 100644
--- a/src/c++/perf_analyzer/client_backend/mock_client_backend.h
+++ b/src/c++/perf_analyzer/client_backend/mock_client_backend.h
@@ -127,6 +127,15 @@ class MockInferResult : public InferResult {
     return Error::Success;
   }
 
+  Error IsNullResponse(bool* is_null_response) const override
+  {
+    if (is_null_response == nullptr) {
+      return Error("is_null_response cannot be nullptr");
+    }
+    *is_null_response = false;
+    return Error::Success;
+  }
+
  private:
   std::string req_id_;
 };
diff --git a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc
index 7ac2188a6..1be578a95 100644
--- a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc
+++ b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc
@@ -827,6 +827,13 @@ TritonInferResult::IsFinalResponse(bool* is_final_response) const
   return Error::Success;
 }
 
+Error
+TritonInferResult::IsNullResponse(bool* is_null_response) const
+{
+  RETURN_IF_TRITON_ERROR(result_->IsNullResponse(is_null_response));
+  return Error::Success;
+}
+
 //==============================================================================
 
 }}}}  // namespace triton::perfanalyzer::clientbackend::tritonremote
diff --git a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h
index 5b25e8297..aab3c8028 100644
--- a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h
+++ b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h
@@ -331,6 +331,8 @@ class TritonInferResult : public InferResult {
       size_t* byte_size) const override;
   /// See InferResult::IsFinalResponse()
   Error IsFinalResponse(bool* is_final_response) const override;
+  /// See InferResult::IsNullResponse()
+  Error IsNullResponse(bool* is_null_response) const override;
 
  private:
   std::unique_ptr<tc::InferResult> result_;
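
The backend-facing interface deliberately supplies a non-abstract default that returns a "not implemented" Error, so only backends that understand the query need to override it. A hypothetical third-party backend whose protocol has no notion of null responses could mirror the mock and answer false unconditionally (MyBackendInferResult is invented for illustration):

    class MyBackendInferResult
        : public triton::perfanalyzer::clientbackend::InferResult {
     public:
      Error IsNullResponse(bool* is_null_response) const override
      {
        if (is_null_response == nullptr) {
          return Error("is_null_response cannot be nullptr");
        }
        *is_null_response = false;  // this backend never emits null responses
        return Error::Success;
      }
      // ...remaining InferResult overrides elided...
    };
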
diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc
index 7c4a36944..dc65c2adc 100644
--- a/src/c++/perf_analyzer/infer_context.cc
+++ b/src/c++/perf_analyzer/infer_context.cc
@@ -1,4 +1,4 @@
-// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -236,20 +236,26 @@ void
 InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
 {
   std::shared_ptr<cb::InferResult> result_ptr(result);
+  bool is_final_response{true};
   if (thread_stat_->cb_status_.IsOk()) {
     // Add the request timestamp to thread Timestamp vector with
     // proper locking
     std::lock_guard<std::mutex> lock(thread_stat_->mu_);
     thread_stat_->cb_status_ = result_ptr->RequestStatus();
     if (thread_stat_->cb_status_.IsOk()) {
-      std::chrono::time_point<std::chrono::system_clock> end_time_async;
-      end_time_async = std::chrono::system_clock::now();
       std::string request_id;
       thread_stat_->cb_status_ = result_ptr->Id(&request_id);
       const auto& it = async_req_map_.find(request_id);
       if (it != async_req_map_.end()) {
-        it->second.end_times.push_back(end_time_async);
-        bool is_final_response{false};
+        bool is_null_response{false};
+        thread_stat_->cb_status_ =
+            result_ptr->IsNullResponse(&is_null_response);
+        if (thread_stat_->cb_status_.IsOk() == false) {
+          return;
+        }
+        if (is_null_response == false) {
+          it->second.end_times.push_back(std::chrono::system_clock::now());
+        }
         thread_stat_->cb_status_ =
             result_ptr->IsFinalResponse(&is_final_response);
         if (thread_stat_->cb_status_.IsOk() == false) {
@@ -267,10 +273,12 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
     }
   }
 
-  total_ongoing_requests_--;
+  if (is_final_response) {
+    total_ongoing_requests_--;
 
-  if (async_callback_finalize_func_ != nullptr) {
-    async_callback_finalize_func_(id_);
+    if (async_callback_finalize_func_ != nullptr) {
+      async_callback_finalize_func_(id_);
+    }
   }
 }
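
The callback now does two things differently: a null response no longer contributes an end-time entry (so it is excluded from latency and response counting), and the in-flight bookkeeping runs only once per request, on its final response. A self-contained toy driver of that logic, with invented types standing in for the real callback plumbing:

    #include <vector>

    int main()
    {
      struct Response { bool is_null; bool is_final; };
      std::vector<int> end_times;      // stands in for it->second.end_times
      int total_ongoing_requests = 1;  // one decoupled request in flight

      // Two real responses, then a null, final, stream-closing response.
      for (Response r : {Response{false, false}, Response{false, false},
                         Response{true, true}}) {
        if (!r.is_null) { end_times.push_back(0); }    // timestamp real ones only
        if (r.is_final) { total_ongoing_requests--; }  // finalize exactly once
      }
      // Here end_times.size() == 2 and total_ongoing_requests == 0.
      return 0;
    }
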
diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc
index b60006286..b0dd3f224 100644
--- a/src/c++/perf_analyzer/inference_profiler.cc
+++ b/src/c++/perf_analyzer/inference_profiler.cc
@@ -1012,6 +1012,8 @@ InferenceProfiler::MergePerfStatusReports(
         perf_status.client_stats.sequence_count;
     experiment_perf_status.client_stats.delayed_request_count +=
         perf_status.client_stats.delayed_request_count;
+    experiment_perf_status.client_stats.response_count +=
+        perf_status.client_stats.response_count;
     experiment_perf_status.client_stats.duration_ns +=
         perf_status.client_stats.duration_ns;
 
@@ -1079,6 +1081,8 @@ InferenceProfiler::MergePerfStatusReports(
       (experiment_perf_status.client_stats.request_count *
        experiment_perf_status.batch_size) /
       client_duration_sec;
+  experiment_perf_status.client_stats.responses_per_sec =
+      experiment_perf_status.client_stats.response_count / client_duration_sec;
 
   RETURN_IF_ERROR(SummarizeLatency(
       experiment_perf_status.client_stats.latencies, experiment_perf_status));
@@ -1211,18 +1215,20 @@ InferenceProfiler::Summarize(
 {
   size_t valid_sequence_count = 0;
   size_t delayed_request_count = 0;
+  size_t response_count = 0;
 
   // Get measurement from requests that fall within the time interval
   std::pair<uint64_t, uint64_t> valid_range{window_start_ns, window_end_ns};
   uint64_t window_duration_ns = valid_range.second - valid_range.first;
   std::vector<uint64_t> latencies;
   ValidLatencyMeasurement(
-      valid_range, valid_sequence_count, delayed_request_count, &latencies);
+      valid_range, valid_sequence_count, delayed_request_count, &latencies,
+      response_count);
 
   RETURN_IF_ERROR(SummarizeLatency(latencies, summary));
   RETURN_IF_ERROR(SummarizeClientStat(
       start_stat, end_stat, window_duration_ns, latencies.size(),
-      valid_sequence_count, delayed_request_count, summary));
+      valid_sequence_count, delayed_request_count, response_count,
+      summary));
   summary.client_stats.latencies = std::move(latencies);
 
   SummarizeOverhead(window_duration_ns, manager_->GetIdleTime(), summary);
@@ -1245,10 +1251,11 @@ void
 InferenceProfiler::ValidLatencyMeasurement(
     const std::pair<uint64_t, uint64_t>& valid_range,
     size_t& valid_sequence_count, size_t& delayed_request_count,
-    std::vector<uint64_t>* valid_latencies)
+    std::vector<uint64_t>* valid_latencies, size_t& response_count)
 {
   valid_latencies->clear();
   valid_sequence_count = 0;
+  response_count = 0;
   std::vector<size_t> erase_indices{};
   for (size_t i = 0; i < all_timestamps_.size(); i++) {
     const auto& timestamp = all_timestamps_[i];
@@ -1260,6 +1267,7 @@ InferenceProfiler::ValidLatencyMeasurement(
     if ((request_end_ns >= valid_range.first) &&
         (request_end_ns <= valid_range.second)) {
       valid_latencies->push_back(request_end_ns - request_start_ns);
+      response_count += std::get<1>(timestamp).size();
       erase_indices.push_back(i);
       // Just add the sequence_end flag here.
       if (std::get<2>(timestamp)) {
@@ -1358,7 +1366,7 @@ InferenceProfiler::SummarizeClientStat(
     const cb::InferStat& start_stat, const cb::InferStat& end_stat,
     const uint64_t duration_ns, const size_t valid_request_count,
     const size_t valid_sequence_count, const size_t delayed_request_count,
-    PerfStatus& summary)
+    const size_t response_count, PerfStatus& summary)
 {
   summary.on_sequence_model =
       ((parser_->SchedulerType() == ModelParser::SEQUENCE) ||
@@ -1367,6 +1375,7 @@ InferenceProfiler::SummarizeClientStat(
   summary.client_stats.request_count = valid_request_count;
   summary.client_stats.sequence_count = valid_sequence_count;
   summary.client_stats.delayed_request_count = delayed_request_count;
+  summary.client_stats.response_count = response_count;
   summary.client_stats.duration_ns = duration_ns;
   float client_duration_sec =
       (float)summary.client_stats.duration_ns / NANOS_PER_SECOND;
@@ -1374,6 +1383,7 @@ InferenceProfiler::SummarizeClientStat(
       valid_sequence_count / client_duration_sec;
   summary.client_stats.infer_per_sec =
       (valid_request_count * summary.batch_size) / client_duration_sec;
+  summary.client_stats.responses_per_sec = response_count / client_duration_sec;
 
   if (include_lib_stats_) {
     size_t completed_count =
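
The metric itself is a plain ratio: total responses observed in the measurement window divided by the client-side window duration, and merged reports simply pool both numerator and denominator. Worked through with the values the tests below use:

    double response_count = 8 + 10;          // two windows: 18 responses total
    double client_duration_sec = 2.0 + 4.0;  // 6 s of combined measurement time
    double responses_per_sec =
        response_count / client_duration_sec;  // == 3.0 responses/sec
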
diff --git a/src/c++/perf_analyzer/inference_profiler.h b/src/c++/perf_analyzer/inference_profiler.h
index 6a0ee625a..b07cd93ae 100644
--- a/src/c++/perf_analyzer/inference_profiler.h
+++ b/src/c++/perf_analyzer/inference_profiler.h
@@ -48,6 +48,7 @@ namespace triton { namespace perfanalyzer {
 
 #ifndef DOCTEST_CONFIG_DISABLE
+class NaggyMockInferenceProfiler;
 class TestInferenceProfiler;
 #endif
 
@@ -126,6 +127,8 @@ struct ClientSideStats {
   uint64_t sequence_count;
   // The number of requests that missed their schedule
   uint64_t delayed_request_count;
+  // The number of responses
+  uint64_t response_count;
   uint64_t duration_ns;
   uint64_t avg_latency_ns;
   // a ordered map of percentiles to be reported (<percentile, value> pair)
@@ -139,6 +142,7 @@ struct ClientSideStats {
   uint64_t avg_receive_time_ns;
   // Per sec stat
   double infer_per_sec;
+  double responses_per_sec;
   double sequence_per_sec;
 
   // Completed request count reported by the client library
@@ -440,16 +444,17 @@ class InferenceProfiler {
   /// sequence model.
   /// \param latencies Returns the vector of request latencies where the
   /// requests are completed within the measurement window.
-  void ValidLatencyMeasurement(
+  /// \param response_count Returns the number of responses
+  virtual void ValidLatencyMeasurement(
       const std::pair<uint64_t, uint64_t>& valid_range,
       size_t& valid_sequence_count, size_t& delayed_request_count,
-      std::vector<uint64_t>* latencies);
+      std::vector<uint64_t>* latencies, size_t& response_count);
 
   /// \param latencies The vector of request latencies collected.
   /// \param summary Returns the summary that the latency related fields are
   /// set.
   /// \return cb::Error object indicating success or failure.
-  cb::Error SummarizeLatency(
+  virtual cb::Error SummarizeLatency(
       const std::vector<uint64_t>& latencies, PerfStatus& summary);
 
   /// \param latencies The vector of request latencies collected.
@@ -466,14 +471,15 @@ class InferenceProfiler {
   /// \param valid_sequence_count The number of completed sequences recorded.
   /// \param delayed_request_count The number of requests that missed their
   /// schedule.
+  /// \param response_count The number of responses.
   /// \param summary Returns the summary that the fields recorded by
   /// client are set.
   /// \return cb::Error object indicating success or failure.
-  cb::Error SummarizeClientStat(
+  virtual cb::Error SummarizeClientStat(
       const cb::InferStat& start_stat, const cb::InferStat& end_stat,
       const uint64_t duration_ns, const size_t valid_request_count,
       const size_t delayed_request_count, const size_t valid_sequence_count,
-      PerfStatus& summary);
+      const size_t response_count, PerfStatus& summary);
 
   /// Adds the send request rate metric to the summary object.
   /// \param window_duration_s The duration of the window in seconds.
@@ -557,7 +563,7 @@ class InferenceProfiler {
   /// \param perf_status List of perf status reports to be merged.
   /// \param summary_status Final merged summary status.
   /// \return cb::Error object indicating success or failure.
-  cb::Error MergePerfStatusReports(
+  virtual cb::Error MergePerfStatusReports(
       std::deque<PerfStatus>& perf_status, PerfStatus& summary_status);
 
   /// Merge individual server side statistics into a single server side report.
   /// \param server_side_stats List of server side statistics reports to be
   /// merged.
   /// \param server_side_summary Final merged summary status.
   /// \return cb::Error object indicating success or failure.
-  cb::Error MergeServerSideStats(
+  virtual cb::Error MergeServerSideStats(
       std::vector<ServerSideStats>& server_side_stats,
       ServerSideStats& server_side_summary);
 
@@ -695,10 +701,11 @@ class InferenceProfiler {
   const double overhead_pct_threshold_{0.0};
 
 #ifndef DOCTEST_CONFIG_DISABLE
+  friend NaggyMockInferenceProfiler;
  friend TestInferenceProfiler;
 
  public:
-  InferenceProfiler(){};
+  InferenceProfiler() = default;
 #endif
 };
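
ClientSideStats now carries both the raw response_count and the derived responses_per_sec next to the existing per-second stats. A hypothetical consumer of the new fields (the patch itself adds no reporting code):

    #include <iostream>

    void PrintResponseThroughput(const triton::perfanalyzer::PerfStatus& status)
    {
      std::cout << "Response count: " << status.client_stats.response_count
                << ", throughput: " << status.client_stats.responses_per_sec
                << " responses/sec" << std::endl;
    }
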
diff --git a/src/c++/perf_analyzer/load_manager.h b/src/c++/perf_analyzer/load_manager.h
index ebcad5192..5a10ae592 100644
--- a/src/c++/perf_analyzer/load_manager.h
+++ b/src/c++/perf_analyzer/load_manager.h
@@ -40,6 +40,11 @@
 
 namespace triton { namespace perfanalyzer {
 
+#ifndef DOCTEST_CONFIG_DISABLE
+class NaggyMockLoadManager;
+#endif
+
 class LoadManager {
  public:
   virtual ~LoadManager() = default;
@@ -97,7 +102,7 @@ class LoadManager {
   const size_t GetAndResetNumSentRequests();
 
   /// \return the batch size used for the inference requests
-  size_t BatchSize() const { return batch_size_; }
+  virtual size_t BatchSize() const { return batch_size_; }
 
   /// Count the number of requests collected until now.
   uint64_t CountCollectedRequests();
@@ -165,6 +170,8 @@ class LoadManager {
       std::shared_ptr<DataLoader> data_loader);
 
 #ifndef DOCTEST_CONFIG_DISABLE
+  friend NaggyMockLoadManager;
+
  public:
   LoadManager() = default;
 #endif
diff --git a/src/c++/perf_analyzer/mock_inference_profiler.h b/src/c++/perf_analyzer/mock_inference_profiler.h
index b44d94959..a31485091 100644
--- a/src/c++/perf_analyzer/mock_inference_profiler.h
+++ b/src/c++/perf_analyzer/mock_inference_profiler.h
@@ -1,4 +1,4 @@
-// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -30,9 +30,92 @@
 
 namespace triton { namespace perfanalyzer {
 
-class MockInferenceProfiler : public InferenceProfiler {
+class NaggyMockInferenceProfiler : public InferenceProfiler {
  public:
-  MockInferenceProfiler() = default;
+  NaggyMockInferenceProfiler()
+  {
+    ON_CALL(
+        *this, ValidLatencyMeasurement(
+                   testing::_, testing::_, testing::_, testing::_, testing::_))
+        .WillByDefault(
+            [this](
+                const std::pair<uint64_t, uint64_t>& valid_range,
+                size_t& valid_sequence_count, size_t& delayed_request_count,
+                std::vector<uint64_t>* latencies,
+                size_t& response_count) -> void {
+              this->InferenceProfiler::ValidLatencyMeasurement(
+                  valid_range, valid_sequence_count, delayed_request_count,
+                  latencies, response_count);
+            });
+    ON_CALL(*this, SummarizeLatency(testing::_, testing::_))
+        .WillByDefault(
+            [this](
+                const std::vector<uint64_t>& latencies,
+                PerfStatus& summary) -> cb::Error {
+              return this->InferenceProfiler::SummarizeLatency(
+                  latencies, summary);
+            });
+    ON_CALL(*this, MergePerfStatusReports(testing::_, testing::_))
+        .WillByDefault(
+            [this](
+                std::deque<PerfStatus>& perf_status,
+                PerfStatus& summary_status) -> cb::Error {
+              return this->InferenceProfiler::MergePerfStatusReports(
+                  perf_status, summary_status);
+            });
+    ON_CALL(*this, MergeServerSideStats(testing::_, testing::_))
+        .WillByDefault(
+            [this](
+                std::vector<ServerSideStats>& server_side_stats,
+                ServerSideStats& server_side_summary) -> cb::Error {
+              return this->InferenceProfiler::MergeServerSideStats(
+                  server_side_stats, server_side_summary);
+            });
+    ON_CALL(
+        *this, SummarizeClientStat(
+                   testing::_, testing::_, testing::_, testing::_, testing::_,
+                   testing::_, testing::_, testing::_))
+        .WillByDefault(
+            [this](
+                const cb::InferStat& start_stat, const cb::InferStat& end_stat,
+                const uint64_t duration_ns, const size_t valid_request_count,
+                const size_t delayed_request_count,
+                const size_t valid_sequence_count, const size_t response_count,
+                PerfStatus& summary) -> cb::Error {
+              return this->InferenceProfiler::SummarizeClientStat(
+                  start_stat, end_stat, duration_ns, valid_request_count,
+                  delayed_request_count, valid_sequence_count, response_count,
+                  summary);
+            });
+  };
+
   MOCK_METHOD0(IncludeServerStats, bool());
+  MOCK_METHOD(
+      void, ValidLatencyMeasurement,
+      ((const std::pair<uint64_t, uint64_t>&), size_t&, size_t&,
+       std::vector<uint64_t>*, size_t&),
+      (override));
+  MOCK_METHOD(
+      cb::Error, SummarizeLatency, (const std::vector<uint64_t>&, PerfStatus&),
+      (override));
+  MOCK_METHOD(
+      cb::Error, MergePerfStatusReports,
+      (std::deque<PerfStatus>&, PerfStatus&), (override));
+  MOCK_METHOD(
+      cb::Error, MergeServerSideStats,
+      (std::vector<ServerSideStats>&, ServerSideStats&), (override));
+  MOCK_METHOD(
+      cb::Error, SummarizeClientStat,
+      (const cb::InferStat&, const cb::InferStat&, const uint64_t, const size_t,
+       const size_t, const size_t, const size_t, PerfStatus&),
+      (override));
+
+  std::shared_ptr<ModelParser>& parser_{InferenceProfiler::parser_};
+  std::unique_ptr<LoadManager>& manager_{InferenceProfiler::manager_};
+  bool& include_lib_stats_{InferenceProfiler::include_lib_stats_};
+  TimestampVector& all_timestamps_{InferenceProfiler::all_timestamps_};
 };
+
+using MockInferenceProfiler = testing::NiceMock<NaggyMockInferenceProfiler>;
+
 }}  // namespace triton::perfanalyzer
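
The NaggyMock constructor wires every mocked method to delegate to the real InferenceProfiler implementation by default, so tests exercise production logic unless they explicitly stub a call with EXPECT_CALL. The same delegate-to-real pattern in miniature, with invented names:

    #include "gmock/gmock.h"

    class Real {
     public:
      virtual ~Real() = default;
      virtual int Work(int x) { return x * 2; }
    };

    class NaggyMockReal : public Real {
     public:
      NaggyMockReal()
      {
        // By default, fall through to the real implementation.
        ON_CALL(*this, Work(testing::_)).WillByDefault([this](int x) {
          return this->Real::Work(x);
        });
      }
      MOCK_METHOD(int, Work, (int), (override));
    };

    // NiceMock suppresses "uninteresting call" warnings for forwarded calls.
    using MockReal = testing::NiceMock<NaggyMockReal>;
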
diff --git a/src/c++/perf_analyzer/mock_load_manager.h b/src/c++/perf_analyzer/mock_load_manager.h
new file mode 100644
index 000000000..2088a4053
--- /dev/null
+++ b/src/c++/perf_analyzer/mock_load_manager.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include "gmock/gmock.h"
+#include "load_manager.h"
+
+namespace triton { namespace perfanalyzer {
+
+class NaggyMockLoadManager : public LoadManager {};
+
+using MockLoadManager = testing::NiceMock<NaggyMockLoadManager>;
+
+}}  // namespace triton::perfanalyzer
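
The subclass body is empty because LoadManager's virtual surface (BatchSize() above) is all the tests need for now; the NiceMock alias exists so the profiler's manager_ slot can be filled without gmock noise. This is exactly how the tests below assemble their fixtures (MockModelParser comes from mock_model_parser.h, which the test file includes):

    MockInferenceProfiler mock_inference_profiler{};
    mock_inference_profiler.parser_ = std::make_shared<MockModelParser>();
    mock_inference_profiler.manager_ = std::make_unique<MockLoadManager>();
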
diff --git a/src/c++/perf_analyzer/perf_utils.h b/src/c++/perf_analyzer/perf_utils.h
index f11bf9815..1865b8430 100644
--- a/src/c++/perf_analyzer/perf_utils.h
+++ b/src/c++/perf_analyzer/perf_utils.h
@@ -55,7 +55,8 @@ constexpr uint64_t NANOS_PER_MILLIS = 1000000;
 //==============================================================================
 using TimestampVector = std::vector<std::tuple<
     std::chrono::time_point<std::chrono::system_clock>,
-    std::vector<std::chrono::time_point<std::chrono::system_clock>>, uint32_t, bool>>;
+    std::vector<std::chrono::time_point<std::chrono::system_clock>>, uint32_t,
+    bool>>;
 
 // Will use the characters specified here to construct random strings
 std::string const character_set =
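
Each TimestampVector entry bundles one request's lifetime: its send time, the end times of every response it produced (the vector whose size ValidLatencyMeasurement now sums into response_count), a flags word, and a trailing bool. Field meanings here are inferred from usage in inference_profiler.cc, not from separate documentation:

    // Assumes a populated TimestampVector named all_timestamps.
    const auto& entry = all_timestamps[0];
    auto request_start = std::get<0>(entry);         // request send time
    const auto& response_ends = std::get<1>(entry);  // one entry per response
    uint32_t flags = std::get<2>(entry);             // e.g. sequence-end flag
    bool last_flag = std::get<3>(entry);             // trailing bool element
    size_t responses_for_request = response_ends.size();
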
diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc
index 27f75519f..71ed2eb89 100644
--- a/src/c++/perf_analyzer/test_inference_profiler.cc
+++ b/src/c++/perf_analyzer/test_inference_profiler.cc
@@ -27,6 +27,8 @@
 #include "doctest.h"
 #include "inference_profiler.h"
 #include "mock_inference_profiler.h"
+#include "mock_load_manager.h"
+#include "mock_model_parser.h"
 
 namespace triton { namespace perfanalyzer {
 
@@ -35,12 +37,14 @@ class TestInferenceProfiler : public InferenceProfiler {
   static void ValidLatencyMeasurement(
       const std::pair<uint64_t, uint64_t>& valid_range,
       size_t& valid_sequence_count, size_t& delayed_request_count,
-      std::vector<uint64_t>* latencies, TimestampVector& all_timestamps)
+      std::vector<uint64_t>* latencies, size_t& response_count,
+      TimestampVector& all_timestamps)
   {
     InferenceProfiler inference_profiler{};
     inference_profiler.all_timestamps_ = all_timestamps;
     inference_profiler.ValidLatencyMeasurement(
-        valid_range, valid_sequence_count, delayed_request_count, latencies);
+        valid_range, valid_sequence_count, delayed_request_count, latencies,
+        response_count);
   }
 
   static std::tuple<uint64_t, uint64_t> GetMeanAndStdDev(
@@ -162,6 +166,7 @@ TEST_CASE("testing the ValidLatencyMeasurement function")
   size_t valid_sequence_count{};
   size_t delayed_request_count{};
   std::vector<uint64_t> latencies{};
+  size_t response_count{};
 
   const std::pair<uint64_t, uint64_t> window{4, 17};
 
   using time_point = std::chrono::time_point<std::chrono::system_clock>;
@@ -201,7 +206,7 @@ TEST_CASE("testing the ValidLatencyMeasurement function")
 
     TestInferenceProfiler::ValidLatencyMeasurement(
         window, valid_sequence_count, delayed_request_count, &latencies,
-        all_timestamps);
+        response_count, all_timestamps);
 
     const auto& convert_timestamp_to_latency{
         [](std::tuple<time_point, std::vector<time_point>, uint32_t, bool> t) {
@@ -854,4 +859,115 @@ TEST_CASE("determine_stats_model_version: testing DetermineStatsModelVersion()")
   std::cerr.rdbuf(old);
 }
 
+TEST_CASE(
+    "valid_latency_measurement: testing the ValidLatencyMeasurement function")
+{
+  MockInferenceProfiler mock_inference_profiler{};
+
+  SUBCASE("testing logic relevant to response throughput metric")
+  {
+    auto clock_epoch{std::chrono::time_point<std::chrono::system_clock>()};
+
+    auto request1_timestamp{clock_epoch + std::chrono::nanoseconds(1)};
+    auto response1_timestamp{clock_epoch + std::chrono::nanoseconds(2)};
+    auto response2_timestamp{clock_epoch + std::chrono::nanoseconds(3)};
+    auto timestamp1{std::make_tuple(
+        request1_timestamp,
+        std::vector<std::chrono::time_point<std::chrono::system_clock>>{
+            response1_timestamp, response2_timestamp},
+        0, false)};
+
+    auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(4)};
+    auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
+    auto response4_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
+    auto response5_timestamp{clock_epoch + std::chrono::nanoseconds(7)};
+    auto timestamp2{std::make_tuple(
+        request2_timestamp,
+        std::vector<std::chrono::time_point<std::chrono::system_clock>>{
+            response3_timestamp, response4_timestamp, response5_timestamp},
+        0, false)};
+
+    mock_inference_profiler.all_timestamps_ = {timestamp1, timestamp2};
+
+    const std::pair<uint64_t, uint64_t> valid_range{
+        std::make_pair(0, UINT64_MAX)};
+    size_t valid_sequence_count{0};
+    size_t delayed_request_count{0};
+    std::vector<uint64_t> valid_latencies{};
+    size_t response_count{0};
+
+    mock_inference_profiler.ValidLatencyMeasurement(
+        valid_range, valid_sequence_count, delayed_request_count,
+        &valid_latencies, response_count);
+
+    CHECK(response_count == 5);
+  }
+}
+
+TEST_CASE(
+    "merge_perf_status_reports: testing the MergePerfStatusReports function")
+{
+  MockInferenceProfiler mock_inference_profiler{};
+
+  SUBCASE("testing logic relevant to response throughput metric")
+  {
+    PerfStatus perf_status1{};
+    perf_status1.client_stats.response_count = 8;
+    perf_status1.client_stats.duration_ns = 2000000000;
+
+    PerfStatus perf_status2{};
+    perf_status2.client_stats.response_count = 10;
+    perf_status2.client_stats.duration_ns = 4000000000;
+
+    std::deque<PerfStatus> perf_status{perf_status1, perf_status2};
+    PerfStatus summary_status{};
+
+    cb::Error error{};
+
+    EXPECT_CALL(
+        mock_inference_profiler, MergeServerSideStats(testing::_, testing::_))
+        .WillOnce(testing::Return(cb::Error::Success));
+    EXPECT_CALL(
+        mock_inference_profiler, SummarizeLatency(testing::_, testing::_))
+        .WillOnce(testing::Return(cb::Error::Success));
+
+    error = mock_inference_profiler.MergePerfStatusReports(
+        perf_status, summary_status);
+
+    REQUIRE(error.IsOk() == true);
+    CHECK(summary_status.client_stats.response_count == 18);
+    CHECK(
+        summary_status.client_stats.responses_per_sec == doctest::Approx(3.0));
+  }
+}
+
+TEST_CASE("summarize_client_stat: testing the SummarizeClientStat function")
+{
+  MockInferenceProfiler mock_inference_profiler{};
+
+  SUBCASE("testing logic relevant to response throughput metric")
+  {
+    mock_inference_profiler.parser_ = std::make_shared<MockModelParser>();
+    mock_inference_profiler.manager_ = std::make_unique<MockLoadManager>();
+
+    const cb::InferStat start_stat{};
+    const cb::InferStat end_stat{};
+    const uint64_t duration_ns{2000000000};
+    const size_t valid_request_count{0};
+    const size_t delayed_request_count{0};
+    const size_t valid_sequence_count{0};
+    const size_t response_count{8};
+    PerfStatus summary{};
+
+    cb::Error error{};
+
+    error = mock_inference_profiler.SummarizeClientStat(
+        start_stat, end_stat, duration_ns, valid_request_count,
+        delayed_request_count, valid_sequence_count, response_count, summary);
+
+    REQUIRE(error.IsOk() == true);
+    CHECK(summary.client_stats.response_count == 8);
+    CHECK(summary.client_stats.responses_per_sec == doctest::Approx(4.0));
+  }
+}
 }}  // namespace triton::perfanalyzer
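
As a sanity check on the expected values: the merge test pools 8 + 10 = 18 responses over 2 s + 4 s = 6 s of measurement, giving the asserted 3.0 responses/sec, and the client-stat test records 8 responses over a 2 s window, giving 4.0 responses/sec.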