From 5dfebe1c486d86fed0da0b030589306385539bb3 Mon Sep 17 00:00:00 2001 From: Izzy Putterman Date: Thu, 7 Nov 2024 11:06:03 -0800 Subject: [PATCH] PA: Fixed scheduler manager (#88) * Do I know what I am doing? * compiles apparently * Move to request_rate * Fix small bugs * more small fixes * Maybe fix small issue * Actually initialize empty * Convert to const * Drop old files * Force request-count to equal schedule length * Copy from GenAI branch * Deformat * Reorganize arg order Fix pre-commit errors * Fix errors, modify design * fix pre-commit Fix comments --------- Co-authored-by: lkomali Co-authored-by: Harshini Komali <157742537+lkomali@users.noreply.github.com> --- src/CMakeLists.txt | 2 + src/command_line_parser.cc | 29 +++++- src/command_line_parser.h | 2 + src/custom_request_schedule_manager.cc | 121 +++++++++++++++++++++++++ src/custom_request_schedule_manager.h | 109 ++++++++++++++++++++++ src/inference_profiler.cc | 5 +- src/inference_profiler.h | 6 +- src/perf_analyzer.cc | 31 ++++--- src/request_rate_manager.cc | 6 +- src/request_rate_manager.h | 9 +- 10 files changed, 292 insertions(+), 28 deletions(-) create mode 100644 src/custom_request_schedule_manager.cc create mode 100644 src/custom_request_schedule_manager.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index caaae35c..1d51c164 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -130,6 +130,7 @@ set( data_loader.cc concurrency_manager.cc request_rate_manager.cc + custom_request_schedule_manager.cc load_worker.cc concurrency_worker.cc request_rate_worker.cc @@ -160,6 +161,7 @@ set( data_loader.h concurrency_manager.h request_rate_manager.h + custom_request_schedule_manager.h custom_load_manager.h iworker.h load_worker.h diff --git a/src/command_line_parser.cc b/src/command_line_parser.cc index 0d07c42a..e0de94ff 100644 --- a/src/command_line_parser.cc +++ b/src/command_line_parser.cc @@ -53,7 +53,7 @@ SplitString(const std::string& str, const std::string& delimiter = ":") std::vector substrs; size_t pos = 0; while (pos != std::string::npos) { - size_t colon_pos = str.find(":", pos); + size_t colon_pos = str.find(delimiter, pos); substrs.push_back(str.substr(pos, colon_pos - pos)); if (colon_pos == std::string::npos) { pos = colon_pos; @@ -908,6 +908,7 @@ CLParser::ParseCommandLine(int argc, char** argv) {"endpoint", required_argument, 0, long_option_idx_base + 61}, {"request-count", required_argument, 0, long_option_idx_base + 62}, {"warmup-request-count", required_argument, 0, long_option_idx_base + 63}, + {"schedule", required_argument, 0, long_option_idx_base + 64}, {0, 0, 0, 0}}; // Parse commandline... @@ -1647,7 +1648,9 @@ CLParser::ParseCommandLine(int argc, char** argv) if (std::stoi(optarg) < 0) { Usage("Failed to parse --request-count. The value must be > 0."); } - params_->request_count = std::stoi(optarg); + if (params_->request_count == 0) { + params_->request_count = std::stoi(optarg); + } break; } case long_option_idx_base + 63: { @@ -1659,6 +1662,17 @@ CLParser::ParseCommandLine(int argc, char** argv) params_->warmup_request_count = std::stoi(optarg); break; } + case long_option_idx_base + 64: { + std::vector schedule; + std::string arg = optarg; + std::vector float_strings = SplitString(optarg, ","); + for (const std::string& str : float_strings) { + schedule.push_back(std::stof(str)); + } + params_->schedule = schedule; + params_->request_count = schedule.size(); + break; + } case 'v': params_->extra_verbose = params_->verbose; params_->verbose = true; @@ -1977,9 +1991,13 @@ CLParser::VerifyOptions() Usage( "perf_analyzer supports only grpc protocol for TensorFlow Serving."); } else if (params_->streaming) { - Usage("perf_analyzer does not support streaming for TensorFlow Serving."); + Usage( + "perf_analyzer does not support streaming for TensorFlow " + "Serving."); } else if (params_->async) { - Usage("perf_analyzer does not support async API for TensorFlow Serving."); + Usage( + "perf_analyzer does not support async API for TensorFlow " + "Serving."); } else if (!params_->using_batch_size) { params_->batch_size = 0; } @@ -2008,7 +2026,8 @@ CLParser::VerifyOptions() if (params_->async && params_->streaming && params_->shared_memory_type != SharedMemoryType::NO_SHARED_MEMORY) { Usage( - "Cannot use --shared-memory=system or --shared-memory=cuda with " + "Cannot use --shared-memory=system or --shared-memory=cuda " + "with " "--service-kind=triton_c_api and --async and --streaming."); } diff --git a/src/command_line_parser.h b/src/command_line_parser.h index 7c8f4977..e0d86cc2 100644 --- a/src/command_line_parser.h +++ b/src/command_line_parser.h @@ -157,6 +157,8 @@ struct PerfAnalyzerParameters { Range periodic_concurrency_range{1, 1, 1}; uint64_t request_period{10}; size_t warmup_request_count{0}; + + std::vector schedule{}; }; using PAParamsPtr = std::shared_ptr; diff --git a/src/custom_request_schedule_manager.cc b/src/custom_request_schedule_manager.cc new file mode 100644 index 00000000..2ebf9538 --- /dev/null +++ b/src/custom_request_schedule_manager.cc @@ -0,0 +1,121 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "custom_request_schedule_manager.h" + +namespace triton::perfanalyzer { + +cb::Error +CustomRequestScheduleManager::Create( + const pa::PAParamsPtr& params, const std::shared_ptr& parser, + const std::shared_ptr& factory, + std::unique_ptr* manager) +{ + std::unique_ptr local_manager( + new CustomRequestScheduleManager(params, parser, factory)); + + *manager = std::move(local_manager); + + return cb::Error::Success; +} + +CustomRequestScheduleManager::CustomRequestScheduleManager( + const pa::PAParamsPtr& params, const std::shared_ptr& parser, + const std::shared_ptr& factory) + : RequestRateManager( + params->async, params->streaming, Distribution::CUSTOM, + params->batch_size, params->measurement_window_ms, params->max_trials, + params->max_threads, params->num_of_sequences, + params->shared_memory_type, params->output_shm_size, + params->serial_sequences, parser, factory, + params->request_parameters), + schedule_(params->schedule) +{ +} + +cb::Error +CustomRequestScheduleManager::PerformWarmup( + double request_rate, size_t warmup_request_count) +{ + if (warmup_request_count == 0) { + return cb::Error::Success; + } + RETURN_IF_ERROR(ChangeRequestRate(request_rate, warmup_request_count)); + WaitForWarmupAndCleanup(); + return cb::Error::Success; +} + +cb::Error +CustomRequestScheduleManager::ChangeRequestRate( + const double request_rate, const size_t request_count) +{ + PauseWorkers(); + ConfigureThreads(request_count); + GenerateSchedule(request_rate, schedule_); + ResumeWorkers(); + + return cb::Error::Success; +} + +void +CustomRequestScheduleManager::GenerateSchedule( + const double request_rate, const std::vector& schedule) +{ + std::vector scaled_schedule; + scaled_schedule.reserve(schedule.size()); + if (schedule.size() > 0) { + for (const auto& value : schedule) { + scaled_schedule.push_back(value / static_cast(request_rate)); + } + } + auto worker_schedules = CreateWorkerSchedules(schedule); + GiveSchedulesToWorkers(worker_schedules); +} + +std::vector +CustomRequestScheduleManager::CreateWorkerSchedules( + const std::vector& schedule) +{ + std::vector worker_schedules = + CreateEmptyWorkerSchedules(); + std::vector thread_ids{CalculateThreadIds()}; + std::chrono::nanoseconds next_timestamp(0); + size_t thread_id_index = 0; + size_t worker_index = 0; + + for (const float& val : schedule) { + next_timestamp = std::chrono::duration_cast( + std::chrono::duration(val)); + worker_index = thread_ids[thread_id_index]; + thread_id_index = ++thread_id_index % thread_ids.size(); + worker_schedules[worker_index]->intervals.emplace_back(next_timestamp); + } + SetScheduleDurations(worker_schedules); + + return worker_schedules; +} + +} // namespace triton::perfanalyzer diff --git a/src/custom_request_schedule_manager.h b/src/custom_request_schedule_manager.h new file mode 100644 index 00000000..38d25c82 --- /dev/null +++ b/src/custom_request_schedule_manager.h @@ -0,0 +1,109 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include "command_line_parser.h" +#include "load_manager.h" +#include "request_rate_manager.h" + +namespace triton::perfanalyzer { + +//============================================================================== +/// CustomRequestScheduleManager is a helper class to send inference requests to +/// inference server in accordance with the schedule set by the user. +/// +/// Detail: +/// An instance of this load manager will be created at the beginning of the +/// perf analyzer and it will be used to schedule to send requests at that +/// particular second defined by the user. The particular seconds at which a +/// request should be sent can be set by the user using the `schedule` option. +/// For example, if the `schedule` is set to 1,2,4,5,6.5, +/// CustomRequestScheduleManager sends request at 1st second, 2nd second, 4th +/// second and so on. +/// + +class CustomRequestScheduleManager : public RequestRateManager { + public: + ~CustomRequestScheduleManager() = default; + + /// Creates an object of CustomRequestScheduleManager + /// \param params A PAParamsPtr (std::shared_ptr) that + /// holds configuration parameters to create CustomRequestScheduleManager + /// object + /// + static cb::Error Create( + const pa::PAParamsPtr& params, const std::shared_ptr& parser, + const std::shared_ptr& factory, + std::unique_ptr* manager); + + /// Performs warmup for benchmarking by sending a fixed number of requests + /// according to the specified request rate + /// \param request_rate The rate at which requests must be issued to the + /// server \param warmup_request_count The number of warmup requests to send + /// \return cb::Error object indicating success or failure + cb::Error PerformWarmup( + double request_rate, size_t warmup_request_count) override; + + /// Adjusts the rate of issuing requests to be the same as 'request_rate' + /// \param request_rate The rate at which requests must be issued to the + /// server \param request_count The number of requests to generate when + /// profiling \return cb::Error object indicating success or failure + cb::Error ChangeRequestRate( + const double request_rate, const size_t request_count) override; + + + protected: + /// Constructor for CustomRequestScheduleManager + /// + /// Initializes a CustomRequestScheduleManager instance using a PAParamsPtr + /// object that contains all necessary parameters for request scheduling. + /// + /// \param params A PAParamsPtr (std::shared_ptr) that + /// holds configuration parameters to create CustomRequestScheduleManager + /// object + /// + CustomRequestScheduleManager( + const pa::PAParamsPtr& params, const std::shared_ptr& parser, + const std::shared_ptr& factory); + + /// Generates and updates the request schedule as per the given request rate + /// and schedule \param request_rate The request rate to use for new schedule + /// \param schedule The vector containing the schedule for requests + void GenerateSchedule( + const double request_rate, const std::vector& schedule); + + /// Creates worker schedules based on the provided schedule + /// \param duration The maximum duration for the schedule + /// \param schedule The vector containing the schedule for requests + /// \return A vector of RateSchedulePtr_t representing the worker schedules + std::vector CreateWorkerSchedules( + const std::vector& schedule); + + /// The vector containing the schedule for requests + std::vector schedule_; +}; + +} // namespace triton::perfanalyzer diff --git a/src/inference_profiler.cc b/src/inference_profiler.cc index 9f425e36..bb51e201 100644 --- a/src/inference_profiler.cc +++ b/src/inference_profiler.cc @@ -40,7 +40,7 @@ #include "constants.h" #include "doctest.h" -namespace triton { namespace perfanalyzer { +namespace triton::perfanalyzer { cb::Error ReportPrometheusMetrics(const Metrics& metrics) { @@ -622,6 +622,7 @@ InferenceProfiler::Profile( is_stable = false; meets_threshold = true; + RETURN_IF_ERROR(dynamic_cast(manager_.get()) ->PerformWarmup(request_rate, warmup_request_count)); RETURN_IF_ERROR(dynamic_cast(manager_.get()) @@ -1872,4 +1873,4 @@ InferenceProfiler::MergeMetrics( return cb::Error::Success; } -}} // namespace triton::perfanalyzer +} // namespace triton::perfanalyzer diff --git a/src/inference_profiler.h b/src/inference_profiler.h index 206a8449..d9c9e595 100644 --- a/src/inference_profiler.h +++ b/src/inference_profiler.h @@ -39,6 +39,7 @@ #include "concurrency_manager.h" #include "constants.h" #include "custom_load_manager.h" +#include "custom_request_schedule_manager.h" #include "metrics.h" #include "metrics_manager.h" #include "model_parser.h" @@ -47,7 +48,7 @@ #include "profile_data_collector.h" #include "request_rate_manager.h" -namespace triton { namespace perfanalyzer { +namespace triton::perfanalyzer { #ifndef DOCTEST_CONFIG_DISABLE class NaggyMockInferenceProfiler; @@ -443,6 +444,7 @@ class InferenceProfiler { std::vector& perf_statuses, bool& meets_threshold, bool& is_stable); + /// A helper function for profiling functions. /// \param status_summary Returns the summary of the measurement. /// \param request_count The number of requests to generate when profiling. If @@ -829,4 +831,4 @@ class InferenceProfiler { #endif }; -}} // namespace triton::perfanalyzer +} // namespace triton::perfanalyzer diff --git a/src/perf_analyzer.cc b/src/perf_analyzer.cc index fd3e8ecc..bd1015b2 100644 --- a/src/perf_analyzer.cc +++ b/src/perf_analyzer.cc @@ -26,6 +26,7 @@ #include "perf_analyzer.h" +#include "custom_request_schedule_manager.h" #include "perf_analyzer_exception.h" #include "periodic_concurrency_manager.h" #include "report_writer.h" @@ -33,7 +34,7 @@ namespace pa = triton::perfanalyzer; -namespace triton { namespace perfanalyzer { +namespace triton::perfanalyzer { volatile bool early_exit = false; @@ -52,7 +53,7 @@ SignalHandler(int signum) exit(0); } } -}} // namespace triton::perfanalyzer +} // namespace triton::perfanalyzer PerfAnalyzer::PerfAnalyzer(pa::PAParamsPtr params) : params_(params) { @@ -238,15 +239,23 @@ PerfAnalyzer::CreateAnalyzerObjects() << "may occur." << std::endl; throw pa::PerfAnalyzerException(pa::GENERIC_ERROR); } - FAIL_IF_ERR( - pa::RequestRateManager::Create( - params_->async, params_->streaming, params_->measurement_window_ms, - params_->max_trials, params_->request_distribution, - params_->batch_size, params_->max_threads, - params_->num_of_sequences, params_->shared_memory_type, - params_->output_shm_size, params_->serial_sequences, parser_, - factory, &manager, params_->request_parameters), - "failed to create request rate manager"); + if (!params_->schedule.empty()) { + FAIL_IF_ERR( + pa::CustomRequestScheduleManager::Create( + params_, parser_, factory, &manager), + "failed to create custom request schedule manager"); + } else { + FAIL_IF_ERR( + pa::RequestRateManager::Create( + params_->async, params_->streaming, + params_->measurement_window_ms, params_->max_trials, + params_->request_distribution, params_->batch_size, + params_->max_threads, params_->num_of_sequences, + params_->shared_memory_type, params_->output_shm_size, + params_->serial_sequences, parser_, factory, &manager, + params_->request_parameters), + "failed to create request rate manager"); + } } else { if ((params_->sequence_id_range != 0) && diff --git a/src/request_rate_manager.cc b/src/request_rate_manager.cc index 454b11d9..118c4312 100644 --- a/src/request_rate_manager.cc +++ b/src/request_rate_manager.cc @@ -26,7 +26,7 @@ #include "request_rate_manager.h" -namespace triton { namespace perfanalyzer { +namespace triton::perfanalyzer { RequestRateManager::~RequestRateManager() { @@ -106,7 +106,6 @@ RequestRateManager::ChangeRequestRate( { PauseWorkers(); ConfigureThreads(request_count); - // Can safely update the schedule GenerateSchedule(request_rate); ResumeWorkers(); @@ -153,7 +152,6 @@ RequestRateManager::CreateWorkerSchedules( size_t thread_id_index = 0; size_t worker_index = 0; - // Generate schedule until we hit max_duration, but also make sure that all // worker schedules follow the thread id distribution // @@ -314,4 +312,4 @@ RequestRateManager::DetermineNumThreads() } -}} // namespace triton::perfanalyzer +} // namespace triton::perfanalyzer diff --git a/src/request_rate_manager.h b/src/request_rate_manager.h index 6d5904a3..b8bba685 100644 --- a/src/request_rate_manager.h +++ b/src/request_rate_manager.h @@ -30,7 +30,7 @@ #include "load_manager.h" #include "request_rate_worker.h" -namespace triton { namespace perfanalyzer { +namespace triton::perfanalyzer { #ifndef DOCTEST_CONFIG_DISABLE class TestRequestRateManager; @@ -103,7 +103,7 @@ class RequestRateManager : public LoadManager { /// \param target_request_rate The rate at which requests must be issued to /// the server. /// \param warmup_request_count The number of warmup requests to send. - cb::Error PerformWarmup( + virtual cb::Error PerformWarmup( double target_request_rate, size_t warmup_request_count); /// Adjusts the rate of issuing requests to be the same as 'request_rate' @@ -112,9 +112,10 @@ class RequestRateManager : public LoadManager { /// \param request_count The number of requests to generate when profiling. If /// 0, then there is no limit, and it will generate until told to stop. /// \return cb::Error object indicating success or failure. - cb::Error ChangeRequestRate( + virtual cb::Error ChangeRequestRate( const double target_request_rate, const size_t request_count = 0); + protected: RequestRateManager( const bool async, const bool streaming, Distribution request_distribution, @@ -175,4 +176,4 @@ class RequestRateManager : public LoadManager { #endif }; -}} // namespace triton::perfanalyzer +} // namespace triton::perfanalyzer