Skip to content

Commit

Permalink
Add new command line options to enable periodic concurrency mode (#391)
Browse files Browse the repository at this point in the history
* Add macros to reuse it for checking range options

* Add tests for periodic-concurrency-range option

* Add periodic-concurrency-range and request-period options

* Add doc for periodic-concurrency-range and request-period

* Add test for request-period option

* Revert macro and add reusable test function

* Add more tests

* Small refactor

* Refactor a subcase

* Require bi-directional gRPC streaming for periodic concurrency mode

* Address feedback

* Refine the error message

* Add bi-directional gRPC streaming options for periodic concurrency mode

* Add request-parameter option and refactor

* Refactor

* Add valid case for request-parameter option

* Add --request-parameter doc and edit periodic concurrency description

* Custom request parameter is currently only supported by gRPC

* Parse and store the type of request parameter

* Add checks between act vs. exp

* Remove uint type and rebase

* Change doc

* Minor fix

* Address feedback
  • Loading branch information
nv-hwoo authored Sep 27, 2023
1 parent 2138a85 commit 2b522e6
Show file tree
Hide file tree
Showing 6 changed files with 641 additions and 149 deletions.
227 changes: 195 additions & 32 deletions src/c++/perf_analyzer/command_line_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,31 @@ CLParser::Parse(int argc, char** argv)
return params_;
}

// Splits a string into the substrings separated by a delimiter.
//
// \param str The string to split.
// \param delimiter The separator to split on. Defaults to ":" and may be
// longer than one character.
// \return The substrings in their original order. The result always holds at
// least one element: an input without any delimiter (including the empty
// string) yields a single element equal to the input, and adjacent or
// trailing delimiters yield empty substrings.
std::vector<std::string>
SplitString(const std::string& str, const std::string& delimiter = ":")
{
  // An empty delimiter matches at every position without ever advancing the
  // search, so treat it as "nothing to split on".
  if (delimiter.empty()) {
    return {str};
  }

  std::vector<std::string> substrs;
  size_t pos = 0;
  while (true) {
    const size_t delim_pos = str.find(delimiter, pos);
    if (delim_pos == std::string::npos) {
      // No more delimiters: the remainder is the last substring.
      substrs.push_back(str.substr(pos));
      break;
    }
    substrs.push_back(str.substr(pos, delim_pos - pos));
    // Skip the whole delimiter, not just one character.
    pos = delim_pos + delimiter.size();
  }
  return substrs;
}

// Converts every character of a string to lower case, in place.
//
// \param s The string to convert; it is modified directly.
void
ToLowerCase(std::string& s)
{
  for (char& ch : s) {
    // std::tolower must be given a value representable as unsigned char, so
    // convert before the call and narrow the int result back to char.
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
}

// Used to format the usage message
std::string
CLParser::FormatMessage(std::string str, int offset) const
Expand Down Expand Up @@ -88,6 +113,8 @@ CLParser::Usage(const std::string& msg)
std::cerr << "\t--measurement-interval (-p) <measurement window (in msec)>"
<< std::endl;
std::cerr << "\t--concurrency-range <start:end:step>" << std::endl;
std::cerr << "\t--periodic-concurrency-range <start:end:step>" << std::endl;
std::cerr << "\t--request-period <number of responses>" << std::endl;
std::cerr << "\t--request-rate-range <start:end:step>" << std::endl;
std::cerr << "\t--request-distribution <\"poisson\"|\"constant\">"
<< std::endl;
Expand Down Expand Up @@ -274,6 +301,45 @@ CLParser::Usage(const std::string& msg)
"not be 0 for sequence models while using asynchronous mode.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--periodic-concurrency-range <start:end:step>: Determines the "
"range of concurrency levels in the similar but slightly "
"different manner as the --concurrency-range. Perf Analyzer will "
"start from the concurrency level of 'start' and increase by "
"'step' each time. Unlike --concurrency-range, the 'end' "
"indicates the *total* number of concurrency since the 'start' "
"(including) and will stop increasing once the cumulative number "
"of concurrent requests has reached the 'end'. The user can "
"specify *when* to periodically increase the concurrency level "
"using the --request-period option. The concurrency level will "
"periodically increase for every n-th response specified by "
"--request-period. Since this disables stability check in Perf "
"Analyzer and reports response timestamps only, the user must "
"provide --profile-export-file to specify where to dump all the "
"measured timestamps. The default values of 'start', 'end', and "
"'step' are 1.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--request-period <n>: Indicates the number of responses that "
"each request must receive before new, concurrent requests are "
"sent when --periodic-concurrency-range is specified. Default "
"value is 10.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--request-parameter <name:value:type>: Specifies a custom "
"parameter that can be sent to a Triton backend as part of the "
"request. For example, providing '--request-parameter "
"max_tokens:256:int' to the command line will set an additional "
"parameter 'max_tokens' of type 'int' to 256 as part of the "
"request. The --request-parameter may be specified multiple times "
"for different custom parameters.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --request-rate-range <start:end:step>: Determines the range of "
Expand Down Expand Up @@ -806,6 +872,9 @@ CLParser::ParseCommandLine(int argc, char** argv)
{"output-tensor-format", required_argument, 0, 56},
{"version", no_argument, 0, 57},
{"profile-export-file", required_argument, 0, 58},
{"periodic-concurrency-range", required_argument, 0, 59},
{"request-period", required_argument, 0, 60},
{"request-parameter", required_argument, 0, 61},
{0, 0, 0, 0}};

// Parse commandline...
Expand Down Expand Up @@ -895,37 +964,23 @@ CLParser::ParseCommandLine(int argc, char** argv)
case 7: {
params_->using_concurrency_range = true;
std::string arg = optarg;
size_t pos = 0;
int index = 0;
while (pos != std::string::npos) {
size_t colon_pos = arg.find(":", pos);
if (index > 2) {
Usage(
"Failed to parse --concurrency-range. The value does not "
"match <start:end:step>.");
}
int64_t val;
if (colon_pos == std::string::npos) {
val = std::stoull(arg.substr(pos, colon_pos));
pos = colon_pos;
} else {
val = std::stoull(arg.substr(pos, colon_pos - pos));
pos = colon_pos + 1;
}
switch (index) {
case 0:
params_->concurrency_range.start = val;
break;
case 1:
params_->concurrency_range.end = val;
break;
case 2:
params_->concurrency_range.step = val;
break;
}
index++;
std::vector<std::string> values{SplitString(arg)};
if (values.size() > 3) {
Usage(
"Failed to parse --concurrency-range. The value does not match "
"<start:end:step>.");
}

for (size_t i = 0; i < values.size(); ++i) {
uint64_t val = std::stoull(values[i]);
if (i == 0) {
params_->concurrency_range.start = val;
} else if (i == 1) {
params_->concurrency_range.end = val;
} else if (i == 2) {
params_->concurrency_range.step = val;
}
}
break;
}
case 8:
Expand Down Expand Up @@ -1482,6 +1537,88 @@ CLParser::ParseCommandLine(int argc, char** argv)
params_->profile_export_file = profile_export_file;
break;
}
case 59: {
params_->is_using_periodic_concurrency_mode = true;
std::string arg = optarg;
std::vector<std::string> values{SplitString(arg)};
if (values.size() < 2) {
Usage(
"Failed to parse --periodic-concurrency-range. Both <start> "
"and <end> values must be provided.");
} else if (values.size() > 3) {
Usage(
"Failed to parse --periodic-concurrency-range. The value does "
"not match <start:end:step>.");
}

for (size_t i = 0; i < values.size(); ++i) {
uint64_t val = std::stoull(values[i]);
if (i == 0) {
params_->periodic_concurrency_range.start = val;
} else if (i == 1) {
params_->periodic_concurrency_range.end = val;
} else if (i == 2) {
params_->periodic_concurrency_range.step = val;
}
}

Range<uint64_t> range{params_->periodic_concurrency_range};
if (range.step == 0) {
Usage(
"Failed to parse --periodic-concurrency-range. The <step> "
"value must be > 0.");
} else if (range.start > range.end) {
Usage(
"Failed to parse --periodic-concurrency-range. The <start> "
"must be <= <end>.");
} else if ((range.end - range.start) % range.step != 0) {
Usage(
"Failed to parse --periodic-concurrency-range. The <step> "
"value must be a factor of the range size (<end> - <start>).");
}
break;
}
case 60: {
std::string request_period{optarg};
if (std::stoi(request_period) > 0) {
params_->request_period = std::stoull(request_period);
} else {
Usage("Failed to parse --request-period. The value must be > 0");
}
break;
}
case 61: {
std::string arg = optarg;
std::vector<std::string> values{SplitString(arg)};
if (values.size() != 3) {
Usage(
"Failed to parse --request-parameter. The value does not match "
"<name:value:type>.");
}

std::for_each(values.begin(), values.end(), ToLowerCase);
std::string name{values[0]};
std::string value{values[1]};
std::string type{values[2]};

RequestParameter param;
if (type == "bool") {
param.type = RequestParameterType::BOOL;
param.bool_value = value == "true" ? true : false;
} else if (type == "int") {
param.type = RequestParameterType::INT;
param.int_value = std::stoll(value);
} else if (type == "string") {
param.type = RequestParameterType::STRING;
param.str_value = value;
} else {
Usage(
"Failed to parse --request-parameter. Unsupported type: '" +
type + "'.");
}
params_->request_parameters[name] = param;
break;
}
case 'v':
params_->extra_verbose = params_->verbose;
params_->verbose = true;
Expand Down Expand Up @@ -1639,10 +1776,36 @@ CLParser::VerifyOptions()
Usage("Cannot use concurrency options with --request-rate-range.");
}

if (params_->using_request_rate_range && params_->using_concurrency_range) {
std::vector<bool> load_modes{
params_->is_using_periodic_concurrency_mode,
params_->using_concurrency_range, params_->using_request_rate_range,
params_->using_custom_intervals};
if (std::count(load_modes.begin(), load_modes.end(), true) > 1) {
Usage(
"Cannot specify more then one inference load mode. Please choose only "
"one of the following modes: --concurrency-range, "
"--periodic-concurrency-range, --request-rate-range, or "
"--request-intervals.");
}

if (params_->is_using_periodic_concurrency_mode && !params_->streaming) {
Usage(
"The --periodic-concurrency-range option requires bi-directional gRPC "
"streaming.");
}

if (params_->is_using_periodic_concurrency_mode &&
(params_->profile_export_file == "")) {
Usage(
"Must provide --profile-export-file when using the "
"--periodic-concurrency-range option.");
}

if (params_->request_parameters.size() > 0 &&
params_->protocol != cb::ProtocolType::GRPC) {
Usage(
"Cannot specify --concurrency-range and --request-rate-range "
"simultaneously.");
"The --request-parameter option is currently only supported by gRPC "
"protocol.");
}

if (params_->using_request_rate_range && params_->mpi_driver->IsMPIRun() &&
Expand Down
4 changes: 2 additions & 2 deletions src/c++/perf_analyzer/command_line_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ struct PerfAnalyzerParameters {
uint64_t measurement_window_ms = 5000;
bool using_concurrency_range = false;
Range<uint64_t> concurrency_range{1, 1, 1};
std::unordered_map<std::string, RequestParameter> request_parameters;
uint64_t latency_threshold_ms = NO_LIMIT;
double stability_threshold = 0.1;
size_t max_trials = 10;
Expand Down Expand Up @@ -151,9 +152,8 @@ struct PerfAnalyzerParameters {
std::string profile_export_file{""};

bool is_using_periodic_concurrency_mode{false};

Range<uint64_t> periodic_concurrency_range{1, 1, 1};
uint64_t periodic_concurrency_request_period{10};
uint64_t request_period{10};
};

using PAParamsPtr = std::shared_ptr<PerfAnalyzerParameters>;
Expand Down
42 changes: 40 additions & 2 deletions src/c++/perf_analyzer/docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,13 +173,51 @@ Specifies the range of concurrency levels covered by Perf Analyzer. Perf
Analyzer will start from the concurrency level of 'start' and go until 'end'
with a stride of 'step'.

Default of 'end' and 'step' are `1`. If 'end' is not specified then Perf
Analyzer will run for a single concurrency level determined by 'start'. If
The default values of 'start', 'end', and 'step' are `1`. If 'end' is not specified then
Perf Analyzer will run for a single concurrency level determined by 'start'. If
'end' is set as `0`, then the concurrency limit will be incremented by 'step'
until the latency threshold is met. 'end' and `--latency-threshold` cannot
both be `0`. 'end' cannot be `0` for sequence models while using asynchronous
mode.

#### `--periodic-concurrency-range=<start:end:step>`

Specifies the range of concurrency levels in a similar but slightly different
manner from `--concurrency-range`. Perf Analyzer will start from the
concurrency level of 'start' and increase by 'step' each time. Unlike
`--concurrency-range`, 'end' indicates the *total* number of concurrent
requests, counting from 'start' (inclusive), and the concurrency level will
stop increasing once the cumulative number of concurrent requests has reached
'end'. The user can specify *when* to periodically increase the concurrency
level using the `--request-period` option. The concurrency level will increase
after every `n`-th response, where `n` is specified by `--request-period`.
Since this mode disables the stability check in Perf Analyzer and reports
response timestamps only, the user must provide `--profile-export-file` to
specify where to dump all the measured timestamps.

The default values of 'start', 'end', and 'step' are `1`.

#### `--request-period=<n>`

Specifies the number of responses that each request must receive before new,
concurrent requests are sent when `--periodic-concurrency-range` is specified.

Default value is `10`.

#### `--request-parameter=<name:value:type>`

Specifies a custom parameter that can be sent to a Triton backend as part of
the request. For example, providing '--request-parameter max_tokens:256:int'
to the command line will set an additional parameter 'max_tokens' of type
'int' to 256 as part of the request. The --request-parameter may be specified
multiple times for different custom parameters.

Valid `type` values are: `bool`, `int`, and `string`.

> **NOTE**
>
> The `--request-parameter` option is currently only supported by the gRPC protocol.
#### `--request-rate-range=<start:end:step>`

Specifies the range of request rates for load generated by Perf Analyzer. This
Expand Down
10 changes: 1 addition & 9 deletions src/c++/perf_analyzer/perf_analyzer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,6 @@ PerfAnalyzer::CreateAnalyzerObjects()
}

std::unique_ptr<pa::LoadManager> manager;
params_->is_using_periodic_concurrency_mode = true;
params_->periodic_concurrency_range = {
std::stoi(std::getenv("MY_START")), std::stoi(std::getenv("MY_END")),
std::stoi(std::getenv("MY_STEP"))};
params_->periodic_concurrency_request_period =
std::stoi(std::getenv("MY_REQUEST_PERIOD"));

if (params_->targeting_concurrency()) {
if ((parser_->SchedulerType() == pa::ModelParser::SEQUENCE) ||
(parser_->SchedulerType() == pa::ModelParser::ENSEMBLE_SEQUENCE)) {
Expand Down Expand Up @@ -221,8 +214,7 @@ PerfAnalyzer::CreateAnalyzerObjects()
params_->async, params_->streaming, params_->batch_size,
params_->max_threads, params_->max_concurrency,
params_->shared_memory_type, params_->output_shm_size, parser_, factory,
params_->periodic_concurrency_range,
params_->periodic_concurrency_request_period);
params_->periodic_concurrency_range, params_->request_period);
} else if (params_->using_request_rate_range) {
if ((params_->sequence_id_range != 0) &&
(params_->sequence_id_range < params_->num_of_sequences)) {
Expand Down
9 changes: 9 additions & 0 deletions src/c++/perf_analyzer/perf_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,15 @@ class Range {
T step;
};

// The kind of value carried by a RequestParameter.
enum RequestParameterType { STRING = 0, INT = 1, BOOL = 2 };

// A custom request parameter value tagged with its type. The `type` member
// selects which of the value members is meaningful: `str_value` for STRING,
// `int_value` for INT and `bool_value` for BOOL.
//
// Every member has a default so that a default-constructed object never holds
// indeterminate values in the members that are not selected by `type`.
struct RequestParameter {
  std::string str_value;
  int64_t int_value{0};
  bool bool_value{false};
  RequestParameterType type{STRING};
};

// Converts the datatype from tensorflow to perf analyzer space
// \param tf_dtype The data type string returned from the model metadata.
// \param datatype Returns the datatype in perf_analyzer space.
Expand Down
Loading

0 comments on commit 2b522e6

Please sign in to comment.