diff --git a/model_analyzer/config/generate/generator_utils.py b/model_analyzer/config/generate/generator_utils.py
index 1f0e9c5eb..ceef010ca 100755
--- a/model_analyzer/config/generate/generator_utils.py
+++ b/model_analyzer/config/generate/generator_utils.py
@@ -108,6 +108,8 @@ def generate_doubled_list(min_value: int, max_value: int) -> List[int]:
             The value that the generated list will not exceed
         """
 
+        assert min_value <= max_value
+
         list = []
        val = 1 if min_value == 0 else min_value
        while val <= max_value:
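For reference, a standalone sketch of the doubling behavior that the new assertion guards. The helper name below is illustrative; the real method is `GeneratorUtils.generate_doubled_list`, whose body is shown in the hunk above.

```python
# Hedged sketch (not the library code): values double from min_value, or from 1
# when min_value is 0, until they would exceed max_value.
from typing import List


def doubled_list(min_value: int, max_value: int) -> List[int]:
    assert min_value <= max_value  # mirrors the assertion added in this diff
    values: List[int] = []
    val = 1 if min_value == 0 else min_value
    while val <= max_value:
        values.append(val)
        val *= 2
    return values


print(doubled_list(0, 8))     # [1, 2, 4, 8]
print(doubled_list(16, 100))  # [16, 32, 64]
```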
diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py
index f17c2bc18..771e895f1 100755
--- a/model_analyzer/config/generate/perf_analyzer_config_generator.py
+++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py
@@ -20,7 +20,14 @@
 from typing import Dict, Generator, List, Optional, Tuple
 
 from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
-from model_analyzer.config.input.config_defaults import DEFAULT_INPUT_JSON_PATH
+from model_analyzer.config.input.config_defaults import (
+    DEFAULT_INPUT_JSON_PATH,
+    DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
+    DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
+    DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY,
+    DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
+    DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
+)
 from model_analyzer.constants import (
     LOGGER_NAME,
     THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES,
@@ -211,7 +218,9 @@ def _create_inference_load_list(self) -> List[int]:
         # The two possible inference loads are request rate or concurrency
         # Concurrency is the default and will be used unless the user specifies
         # request rate, either as a model parameter or a config option
-        if self._cli_config.is_request_rate_specified(self._model_parameters):
+        if self._cli_config.is_llm_model():
+            return self._create_periodic_concurrency_list()
+        elif self._cli_config.is_request_rate_specified(self._model_parameters):
             return self._create_request_rate_list()
         else:
             return self._create_concurrency_list()
@@ -220,7 +229,7 @@ def _create_request_rate_list(self) -> List[int]:
         if self._model_parameters["request_rate"]:
             return sorted(self._model_parameters["request_rate"])
         elif self._cli_config.run_config_search_disable:
-            return [1]
+            return [DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE]
         else:
             return utils.generate_doubled_list(
                 self._cli_config.run_config_search_min_request_rate,
@@ -231,13 +240,24 @@ def _create_concurrency_list(self) -> List[int]:
         if self._model_parameters["concurrency"]:
             return sorted(self._model_parameters["concurrency"])
         elif self._cli_config.run_config_search_disable:
-            return [1]
+            return [DEFAULT_RUN_CONFIG_MIN_CONCURRENCY]
         else:
             return utils.generate_doubled_list(
                 self._cli_config.run_config_search_min_concurrency,
                 self._cli_config.run_config_search_max_concurrency,
             )
 
+    def _create_periodic_concurrency_list(self) -> List[int]:
+        if self._model_parameters["periodic_concurrency"]:
+            return sorted(self._model_parameters["periodic_concurrency"])
+        elif self._cli_config.run_config_search_disable:
+            return [DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY]
+        else:
+            return utils.generate_doubled_list(
+                self._cli_config.run_config_search_min_periodic_concurrency,
+                self._cli_config.run_config_search_max_periodic_concurrency,
+            )
+
     def _create_text_input_length_list(self) -> List[int]:
         if not self._cli_config.is_llm_model():
             return []
@@ -245,7 +265,7 @@
         if self._model_parameters["text_input_length"]:
             return sorted(self._model_parameters["text_input_length"])
         elif self._cli_config.run_config_search_disable:
-            return [1]
+            return [DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH]
         else:
             return utils.generate_doubled_list(
                 self._cli_config.run_config_search_min_text_input_length,
@@ -259,11 +279,11 @@ def _create_max_token_count_list(self) -> List[int]:
         if self._model_parameters["max_token_count"]:
             return sorted(self._model_parameters["max_token_count"])
         elif self._cli_config.run_config_search_disable:
-            return [1]
+            return [DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT]
         else:
             return utils.generate_doubled_list(
-                self._cli_config.run_config_search_min_token_count,
-                self._cli_config.run_config_search_max_token_count,
+                self._cli_config.run_config_search_min_max_token_count,
+                self._cli_config.run_config_search_max_max_token_count,
             )
 
     def _generate_perf_configs(self) -> None:
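A condensed sketch of the dispatch added to `_create_inference_load_list`: LLM models now sweep periodic concurrency, request-rate models keep their request-rate sweep, and everything else falls back to plain concurrency. The standalone function and inlined defaults below are illustrative; the real code reads the ranges from the CLI config and also honors per-model parameter lists and `--run-config-search-disable`, which this sketch omits.

```python
# Hedged sketch of the inference-load selection after this change.
from typing import List


def doubled(lo: int, hi: int) -> List[int]:
    out, val = [], (1 if lo == 0 else lo)
    while val <= hi:
        out.append(val)
        val *= 2
    return out


def inference_load_list(is_llm_model: bool, request_rate_specified: bool) -> List[int]:
    if is_llm_model:
        # DEFAULT_RUN_CONFIG_MIN/MAX_PERIODIC_CONCURRENCY from this diff
        return doubled(16, 1024)
    elif request_rate_specified:
        # DEFAULT_RUN_CONFIG_MIN/MAX_REQUEST_RATE
        return doubled(16, 8192)
    else:
        # DEFAULT_RUN_CONFIG_MIN/MAX_CONCURRENCY
        return doubled(1, 1024)


print(inference_load_list(is_llm_model=True, request_rate_specified=False))
# [16, 32, 64, 128, 256, 512, 1024]
```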
diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py
index a215a2251..bdce45027 100755
--- a/model_analyzer/config/input/config_command_profile.py
+++ b/model_analyzer/config/input/config_command_profile.py
@@ -17,6 +17,7 @@
 import argparse
 import logging
 import os
+from typing import Dict
 
 import numba.cuda
 import psutil
@@ -497,7 +498,9 @@ def _add_profile_models_configs(self):
                             schema={
                                 "batch_sizes": ConfigListNumeric(type_=int),
                                 "concurrency": ConfigListNumeric(type_=int),
+                                "periodic_concurrency": ConfigListNumeric(type_=int),
                                 "request_rate": ConfigListNumeric(type_=int),
+                                "request_period": ConfigListNumeric(type_=int),
                                 "text_input_length": ConfigListNumeric(type_=int),
                                 "max_token_count": ConfigListNumeric(type_=int),
                             }
                         )
@@ -562,6 +565,15 @@
                 " to be used during profiling",
             )
         )
+        self._add_config(
+            ConfigField(
+                "periodic_concurrency",
+                flags=["--periodic-concurrency"],
+                field_type=ConfigListNumeric(int),
+                description="Comma-delimited list of periodic concurrency values or ranges "
+                " to be used during profiling",
+            )
+        )
         self._add_config(
             ConfigField(
                 "request_rate",
@@ -571,6 +583,15 @@
                 " to be used during profiling",
             )
         )
+        self._add_config(
+            ConfigField(
+                "request_period",
+                flags=["--request-period"],
+                field_type=ConfigListNumeric(int),
+                description="Comma-delimited list of request period values or ranges "
+                " to be used during profiling",
+            )
+        )
         self._add_config(
             ConfigField(
                 "text_input_length",
@@ -687,7 +708,7 @@ def _add_run_search_configs(self):
                 flags=["--run-config-search-max-concurrency"],
                 field_type=ConfigPrimitive(int),
                 default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY,
-                description="Max concurrency value that run config search should not go beyond that.",
+                description="Max concurrency value that run config search should not go beyond.",
             )
         )
         self._add_config(
@@ -699,13 +720,49 @@ def _add_run_search_configs(self):
                 description="Min concurrency value that run config search should start with.",
             )
         )
+        self._add_config(
+            ConfigField(
+                "run_config_search_max_periodic_concurrency",
+                flags=["--run-config-search-max-periodic-concurrency"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY,
+                description="Max periodic concurrency value that run config search should not go beyond.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_min_periodic_concurrency",
+                flags=["--run-config-search-min-periodic-concurrency"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY,
+                description="Min periodic concurrency value that run config search should start with.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_max_periodic_concurrency_step",
+                flags=["--run-config-search-max-periodic-concurrency-step"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY_STEP,
+                description="Max periodic concurrency step value that run config search should not go beyond.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_min_periodic_concurrency_step",
+                flags=["--run-config-search-min-periodic-concurrency-step"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY_STEP,
+                description="Min periodic concurrency step value that run config search should start with.",
+            )
+        )
         self._add_config(
             ConfigField(
                 "run_config_search_max_request_rate",
                 flags=["--run-config-search-max-request-rate"],
                 field_type=ConfigPrimitive(int),
                 default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE,
-                description="Max request rate value that run config search should not go beyond that.",
+                description="Max request rate value that run config search should not go beyond.",
             )
         )
         self._add_config(
@@ -717,13 +774,31 @@ def _add_run_search_configs(self):
                 description="Min request rate value that run config search should start with.",
             )
         )
+        self._add_config(
+            ConfigField(
+                "run_config_search_max_request_period",
+                flags=["--run-config-search-max-request-period"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_PERIOD,
+                description="Max request period value that run config search should not go beyond.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_min_request_period",
+                flags=["--run-config-search-min-request-period"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD,
+                description="Min request period value that run config search should start with.",
+            )
+        )
         self._add_config(
             ConfigField(
                 "run_config_search_max_instance_count",
                 flags=["--run-config-search-max-instance-count"],
                 field_type=ConfigPrimitive(int),
                 default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT,
-                description="Max instance count value that run config search should not go beyond that.",
+                description="Max instance count value that run config search should not go beyond.",
             )
         )
         self._add_config(
@@ -836,20 +911,20 @@ def _add_run_search_configs(self):
         )
         self._add_config(
             ConfigField(
-                "run_config_search_min_token_count",
-                flags=["--run-config-search-min-token-count"],
+                "run_config_search_min_max_token_count",
+                flags=["--run-config-search-min-max-token-count"],
                 field_type=ConfigPrimitive(int),
-                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT,
-                description="Min token count that run config search should start with.",
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
+                description="Min max_token count that run config search should start with.",
             )
         )
         self._add_config(
             ConfigField(
-                "run_config_search_max_token_count",
-                flags=["--run-config-search-max-token-count"],
+                "run_config_search_max_max_token_count",
+                flags=["--run-config-search-max-max-token-count"],
                 field_type=ConfigPrimitive(int),
-                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
-                description="Max token count that run config search will not go beyond.",
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_MAX_TOKEN_COUNT,
+                description="Max max_token count that run config search will not go beyond.",
             )
         )
@@ -1420,50 +1495,35 @@ def _autofill_values(self):
                 new_model["parameters"] = {
                     "batch_sizes": self.batch_sizes,
                     "concurrency": self.concurrency,
+                    "periodic_concurrency": self.periodic_concurrency,
                     "request_rate": self.request_rate,
+                    "request_period": self.request_period,
                     "text_input_length": self.text_input_length,
                     "max_token_count": self.max_token_count,
                 }
             else:
                 new_model["parameters"] = {}
-                if "batch_sizes" in model.parameters():
-                    new_model["parameters"].update(
-                        {"batch_sizes": model.parameters()["batch_sizes"]}
-                    )
-                else:
-                    new_model["parameters"].update({"batch_sizes": self.batch_sizes})
-
-                if "concurrency" in model.parameters():
-                    new_model["parameters"].update(
-                        {"concurrency": model.parameters()["concurrency"]}
-                    )
-                else:
-                    new_model["parameters"].update({"concurrency": self.concurrency})
-
-                if "request_rate" in model.parameters():
-                    new_model["parameters"].update(
-                        {"request_rate": model.parameters()["request_rate"]}
-                    )
-                else:
-                    new_model["parameters"].update({"request_rate": self.request_rate})
-
-                if "text_input_length" in model.parameters():
-                    new_model["parameters"].update(
-                        {"text_input_length": model.parameters()["text_input_length"]}
-                    )
-                else:
-                    new_model["parameters"].update(
-                        {"text_input_length": self.text_input_length}
-                    )
-
-                if "max_token_count" in model.parameters():
-                    new_model["max_token_count"].update(
-                        {"max_token_count": model.parameters()["max_token_count"]}
-                    )
-                else:
-                    new_model["parameters"].update(
-                        {"max_token_count": self.text_input_length}
-                    )
+                new_model["parameters"].update(
+                    self._set_model_parameter(model, "batch_sizes")
+                )
+                new_model["parameters"].update(
+                    self._set_model_parameter(model, "concurrency")
+                )
+                new_model["parameters"].update(
+                    self._set_model_parameter(model, "periodic_concurrency")
+                )
+                new_model["parameters"].update(
+                    self._set_model_parameter(model, "request_rate")
+                )
+                new_model["parameters"].update(
+                    self._set_model_parameter(model, "request_period")
+                )
+                new_model["parameters"].update(
+                    self._set_model_parameter(model, "max_token_count")
+                )
+                new_model["parameters"].update(
+                    self._set_model_parameter(model, "text_input_length")
+                )
 
             if (
                 new_model["parameters"]["request_rate"]
@@ -1506,6 +1566,14 @@ def _autofill_values(self):
             new_profile_models[model.model_name()] = new_model
         self._fields["profile_models"].set_value(new_profile_models)
 
+    def _set_model_parameter(
+        self, model: ConfigModelProfileSpec, parameter_name: str
+    ) -> Dict:
+        if parameter_name in model.parameters():
+            return {parameter_name: model.parameters()[parameter_name]}
+        else:
+            return {parameter_name: getattr(self, parameter_name)}
+
     def _using_request_rate(self) -> bool:
         if self.request_rate or self.request_rate_search_enable:
             return True
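The repeated per-parameter if/else blocks above collapse into the new `_set_model_parameter` helper. Below is a minimal sketch of the fallback rule it implements, per-model value if present, otherwise the top-level config value, using plain dicts in place of the real `ConfigCommandProfile` and `ConfigModelProfileSpec` objects.

```python
# Hedged sketch of the _set_model_parameter fallback rule with plain dicts.
from typing import Any, Dict


def set_model_parameter(
    model_params: Dict[str, Any], global_params: Dict[str, Any], name: str
) -> Dict[str, Any]:
    if name in model_params:
        return {name: model_params[name]}
    return {name: global_params[name]}


global_params = {"concurrency": [1, 2, 4], "periodic_concurrency": [], "request_period": []}
model_params = {"concurrency": [8, 16]}  # only concurrency is overridden per model

merged: Dict[str, Any] = {}
for name in ("concurrency", "periodic_concurrency", "request_period"):
    merged.update(set_model_parameter(model_params, global_params, name))
print(merged)
# {'concurrency': [8, 16], 'periodic_concurrency': [], 'request_period': []}
```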
config["run_config_search_min_periodic_concurrency"].is_set_by_user() + or config["run_config_search_max_periodic_concurrency"].is_set_by_user() + or config[ + "run_config_search_min_periodic_concurrency_step" ].is_set_by_user() - or self.get_config()[ - "run_config_search_max_text_input_length" + or config[ + "run_config_search_max_periodic_concurrency_step" ].is_set_by_user() - or self.get_config()["run_config_search_min_token_count"].is_set_by_user() - or self.get_config()["run_config_search_max_token_count"].is_set_by_user() - or self.get_config()["text_input_length"].is_set_by_user() - or self.get_config()["max_token_count"].is_set_by_user() + or config["run_config_search_min_request_period"].is_set_by_user() + or config["run_config_search_max_request_period"].is_set_by_user() + or config["text_input_length"].is_set_by_user() + or config["max_token_count"].is_set_by_user() + or config["periodic_concurrency"].is_set_by_user() + or config["request_period"].is_set_by_user() ) diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index 7e37f7c7d..bab62a4fd 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -45,8 +45,14 @@ DEFAULT_CLIENT_PROTOCOL = "grpc" DEFAULT_RUN_CONFIG_MAX_CONCURRENCY = 1024 DEFAULT_RUN_CONFIG_MIN_CONCURRENCY = 1 +DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY = 1024 +DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY = 16 +DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY_STEP = 128 +DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY_STEP = 4 DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE = 8192 DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE = 16 +DEFAULT_RUN_CONFIG_MAX_REQUEST_PERIOD = 256 +DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD = 1 DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT = 5 DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT = 1 DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1 @@ -54,8 +60,8 @@ DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5 DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH = 1 DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH = 1024 -DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1 -DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256 +DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT = 1 +DEFAULT_RUN_CONFIG_MAX_MAX_TOKEN_COUNT = 256 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False DEFAULT_RUN_CONFIG_SEARCH_MODE = "brute" DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index caa9763ce..d6e42fadc 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -27,6 +27,9 @@ DEFAULT_MEASUREMENT_MODE, DEFAULT_MONITORING_INTERVAL, DEFAULT_OUTPUT_MODEL_REPOSITORY, + DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, + DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, DEFAULT_TRITON_GRPC_ENDPOINT, DEFAULT_TRITON_HTTP_ENDPOINT, DEFAULT_TRITON_INSTALL_PATH, @@ -236,9 +239,10 @@ def construct_perf_analyzer_config( model_name="my-model", output_file_name="my-model-results.csv", batch_size=DEFAULT_BATCH_SIZES, - concurrency=1, + concurrency=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, + periodic_concurrency=DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, request_rate=None, - max_token_count=1, + max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, launch_mode=DEFAULT_TRITON_LAUNCH_MODE, client_protocol=DEFAULT_CLIENT_PROTOCOL, perf_analyzer_flags=None, @@ -257,6 +261,8 @@ def construct_perf_analyzer_config( The batch size for this PA configuration concurrency: int The concurrency value for this PA configuration + periodic_concurrency: + The periodic 
diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py
index 7e37f7c7d..bab62a4fd 100755
--- a/model_analyzer/config/input/config_defaults.py
+++ b/model_analyzer/config/input/config_defaults.py
@@ -45,8 +45,14 @@
 DEFAULT_CLIENT_PROTOCOL = "grpc"
 DEFAULT_RUN_CONFIG_MAX_CONCURRENCY = 1024
 DEFAULT_RUN_CONFIG_MIN_CONCURRENCY = 1
+DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY = 1024
+DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY = 16
+DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY_STEP = 128
+DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY_STEP = 4
 DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE = 8192
 DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE = 16
+DEFAULT_RUN_CONFIG_MAX_REQUEST_PERIOD = 256
+DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD = 1
 DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT = 5
 DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT = 1
 DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1
@@ -54,8 +60,8 @@
 DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5
 DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH = 1
 DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH = 1024
-DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1
-DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256
+DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT = 1
+DEFAULT_RUN_CONFIG_MAX_MAX_TOKEN_COUNT = 256
 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False
 DEFAULT_RUN_CONFIG_SEARCH_MODE = "brute"
 DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False
diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py
index caa9763ce..d6e42fadc 100755
--- a/tests/common/test_utils.py
+++ b/tests/common/test_utils.py
@@ -27,6 +27,9 @@
     DEFAULT_MEASUREMENT_MODE,
     DEFAULT_MONITORING_INTERVAL,
     DEFAULT_OUTPUT_MODEL_REPOSITORY,
+    DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
+    DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
+    DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY,
     DEFAULT_TRITON_GRPC_ENDPOINT,
     DEFAULT_TRITON_HTTP_ENDPOINT,
     DEFAULT_TRITON_INSTALL_PATH,
@@ -236,9 +239,10 @@ def construct_perf_analyzer_config(
     model_name="my-model",
     output_file_name="my-model-results.csv",
     batch_size=DEFAULT_BATCH_SIZES,
-    concurrency=1,
+    concurrency=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
+    periodic_concurrency=DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY,
     request_rate=None,
-    max_token_count=1,
+    max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
     launch_mode=DEFAULT_TRITON_LAUNCH_MODE,
     client_protocol=DEFAULT_CLIENT_PROTOCOL,
     perf_analyzer_flags=None,
@@ -257,6 +261,8 @@ def construct_perf_analyzer_config(
         The batch size for this PA configuration
     concurrency: int
         The concurrency value for this PA configuration
+    periodic_concurrency: int
+        The periodic concurrency value for this PA configuration
     request_rate: int
         The request rate value for this PA configuration
     launch_mode: str
@@ -282,7 +288,7 @@ def construct_perf_analyzer_config(
     if request_rate:
         pa_config._args["request-rate-range"] = request_rate
     elif llm_search_mode:
-        pa_config._args["periodic-concurrency-range"] = concurrency
+        pa_config._args["periodic-concurrency-range"] = periodic_concurrency
     else:
         pa_config._args["concurrency-range"] = concurrency
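With the new `periodic_concurrency` argument, the test helper routes the inference load to the matching perf_analyzer argument. A small sketch of that selection follows; the argument names match the ones used above, while the standalone function itself is illustrative.

```python
# Hedged sketch of how construct_perf_analyzer_config() picks the load argument.
from typing import Dict, Optional


def pick_load_arg(
    concurrency: int,
    periodic_concurrency: int,
    request_rate: Optional[int],
    llm_search_mode: bool,
) -> Dict[str, int]:
    if request_rate:
        return {"request-rate-range": request_rate}
    elif llm_search_mode:
        return {"periodic-concurrency-range": periodic_concurrency}
    else:
        return {"concurrency-range": concurrency}


print(pick_load_arg(1, 16, None, llm_search_mode=True))
# {'periodic-concurrency-range': 16}
```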
"profile", "--monitoring-interval", "-i", "10.0", str(config_defaults.DEFAULT_MONITORING_INTERVAL)), OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * config_defaults.DEFAULT_PERF_ANALYZER_CPU_UTIL)), OptionStruct("int", "profile", "--num-configs-per-model", None, "10", str(config_defaults.DEFAULT_NUM_CONFIGS_PER_MODEL)), @@ -139,7 +145,9 @@ def get_test_options(): # expected_default_value OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"), OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None), + OptionStruct("intlist", "profile", "--periodic-concurrency", None, "1, 2, 3", None), OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--request-period", None, "1, 2, 3", None), OptionStruct("intlist", "profile", "--text-input-length", None, "1, 2, 3", None), OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 3", None), OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]), diff --git a/tests/test_config.py b/tests/test_config.py index 01dc739d8..72af999fe 100755 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -47,6 +47,26 @@ class TestConfig(trc.TestResultCollector): + def _create_parameters( + self, + batch_sizes: List = [], + concurrency: List = [], + periodic_concurrency: List = [], + request_rate: List = [], + request_period: List = [], + text_input_length: List = [], + max_token_count: List = [], + ) -> Dict: + return { + "batch_sizes": batch_sizes, + "concurrency": concurrency, + "periodic_concurrency": periodic_concurrency, + "request_rate": request_rate, + "request_period": request_period, + "text_input_length": text_input_length, + "max_token_count": max_token_count, + } + def _evaluate_config(self, args, yaml_content, subcommand="profile"): mock_numba = MockNumba( mock_paths=["model_analyzer.config.input.config_command_profile"] @@ -288,24 +308,12 @@ def test_range_and_list_values(self): expected_model_configs = [ ConfigModelProfileSpec( "model_1", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "model_2", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ] @@ -438,24 +446,14 @@ def test_object(self): expected_model_objects = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [1, 2, 3, 4], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[1], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ] @@ -505,24 +503,16 @@ def test_object(self): expected_model_objects = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [1, 2, 3, 4], - "request_rate": [], - "text_input_length": [], 
- "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[1], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={ - "concurrency": [1, 2, 3, 4], - "batch_sizes": [2, 4, 6], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[2, 4, 6], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10}, ), ] @@ -589,13 +579,9 @@ def test_constraints(self): expected_model_objects = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [1, 2, 3, 4], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[1], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ "gpu_used_memory": { @@ -605,13 +591,7 @@ def test_constraints(self): ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ] @@ -729,13 +709,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -760,13 +734,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -802,13 +770,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -851,13 +813,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -887,13 +843,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "input": [ @@ -936,13 +886,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - 
@@ -936,13 +886,7 @@
         expected_model_configs = [
             ConfigModelProfileSpec(
                 "vgg_16_graphdef",
-                parameters={
-                    "batch_sizes": [1],
-                    "concurrency": [],
-                    "request_rate": [],
-                    "text_input_length": [],
-                    "max_token_count": [],
-                },
+                parameters=self._create_parameters(batch_sizes=[1]),
                 objectives={"perf_throughput": 10},
                 perf_analyzer_flags={
                     "measurement-interval": 10000,
@@ -968,13 +912,7 @@
         expected_model_configs = [
             ConfigModelProfileSpec(
                 "vgg_16_graphdef",
-                parameters={
-                    "batch_sizes": [1],
-                    "concurrency": [],
-                    "request_rate": [],
-                    "text_input_length": [],
-                    "max_token_count": [],
-                },
+                parameters=self._create_parameters(batch_sizes=[1]),
                 objectives={"perf_throughput": 10},
                 perf_analyzer_flags={
                     "measurement-interval": 10000,
@@ -1245,13 +1183,7 @@ def test_autofill(self):
         expected_model_configs = [
             ConfigModelProfileSpec(
                 "vgg_16_graphdef",
-                parameters={
-                    "batch_sizes": [1],
-                    "concurrency": [],
-                    "request_rate": [],
-                    "text_input_length": [],
-                    "max_token_count": [],
-                },
+                parameters=self._create_parameters(batch_sizes=[1]),
                 objectives={"perf_throughput": 10},
                 model_config_parameters={
                     "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]]
@@ -1291,13 +1223,9 @@
         expected_model_configs = [
             ConfigModelProfileSpec(
                 "vgg_16_graphdef",
-                parameters={
-                    "batch_sizes": [16, 32],
-                    "concurrency": [2, 4],
-                    "request_rate": [],
-                    "text_input_length": [],
-                    "max_token_count": [],
-                },
+                parameters=self._create_parameters(
+                    batch_sizes=[16, 32], concurrency=[2, 4]
+                ),
                 objectives={"perf_throughput": 10, "gpu_used_memory": 5},
                 constraints={
                     "gpu_used_memory": {
@@ -1341,13 +1269,9 @@
         expected_model_configs = [
             ConfigModelProfileSpec(
                 "vgg_16_graphdef",
-                parameters={
-                    "batch_sizes": [16, 32],
-                    "concurrency": [2, 4],
-                    "request_rate": [],
-                    "text_input_length": [],
-                    "max_token_count": [],
-                },
+                parameters=self._create_parameters(
+                    batch_sizes=[16, 32], concurrency=[2, 4]
+                ),
                 objectives={"gpu_used_memory": 10},
                 constraints={"perf_latency_p99": {"max": 8000}},
                 model_config_parameters={
@@ -1387,13 +1311,9 @@
         expected_model_configs = [
             ConfigModelProfileSpec(
                 "vgg_16_graphdef",
-                parameters={
-                    "batch_sizes": [16, 32],
-                    "concurrency": [2, 4],
-                    "request_rate": [],
-                    "text_input_length": [],
-                    "max_token_count": [],
-                },
+                parameters=self._create_parameters(
+                    batch_sizes=[16, 32], concurrency=[2, 4]
+                ),
                 objectives={"gpu_used_memory": 10},
                 constraints={"perf_latency_p99": {"max": 8000}},
                 model_config_parameters={
@@ -1444,13 +1364,9 @@
         expected_model_configs = [
             ConfigModelProfileSpec(
                 "vgg_16_graphdef",
-                parameters={
-                    "batch_sizes": [16, 32],
-                    "concurrency": [5, 6, 7],
-                    "request_rate": [],
-                    "text_input_length": [],
-                    "max_token_count": [],
-                },
+                parameters=self._create_parameters(
+                    batch_sizes=[16, 32], concurrency=[5, 6, 7]
+                ),
                 objectives={"gpu_used_memory": 10},
                 constraints={
                     "perf_latency_p99": {"max": 8000},
@@ -1459,13 +1375,9 @@
             ),
             ConfigModelProfileSpec(
                 "vgg_19_graphdef",
-                parameters={
-                    "batch_sizes": [1, 2],
-                    "concurrency": [2, 4],
-                    "request_rate": [],
-                    "text_input_length": [],
-                    "max_token_count": [],
-                },
+                parameters=self._create_parameters(
+                    batch_sizes=[1, 2], concurrency=[2, 4]
+                ),
                 objectives={"perf_throughput": 10, "perf_latency_p99": 5},
                 constraints={"perf_latency_p99": {"max": 8000}},
             ),
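For readers skimming the test changes: any field not passed to the new `_create_parameters` helper defaults to an empty list, so a call like `self._create_parameters(batch_sizes=[1])` stands in for the full seven-key dict that the old assertions spelled out, for example:

```python
# Illustrative expansion of self._create_parameters(batch_sizes=[1]).
expected = {
    "batch_sizes": [1],
    "concurrency": [],
    "periodic_concurrency": [],
    "request_rate": [],
    "request_period": [],
    "text_input_length": [],
    "max_token_count": [],
}
```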
"--llm-search-enable", - "--run-config-search-max-concurrency", - "1", + "--run-config-search-max-periodic-concurrency", + "16", "--run-config-search-max-text-input-length", "1", ] @@ -598,7 +598,7 @@ def test_llm_search_text_input_length(self): Test LLM Search: - Input length 1->1024 - Concurrency and max token count set to 1 + Periodic Concurrency and max token count set to 1 """ # yapf: disable @@ -618,9 +618,9 @@ def test_llm_search_text_input_length(self): pa_cli_args = [ "--llm-search-enable", - "--run-config-search-max-concurrency", - "1", - "--run-config-search-max-token-count", + "--run-config-search-max-periodic-concurrency", + "16", + "--run-config-search-max-max-token-count", "1", ] self._run_and_test_perf_analyzer_config_generator(