From c9d467f74717fd800b0ad6b60be0220bf9bcd74b Mon Sep 17 00:00:00 2001
From: Brian Raf <92820864+nv-braf@users.noreply.github.com>
Date: Tue, 3 Oct 2023 08:05:04 -0700
Subject: [PATCH] Adding new options for LLM (#768)

* Update README and versions for 23.09 branch (#761) (#767)

* Adding new options for LLM

* Fixing codeQL issues

* Fixing codeQL issue

---------

Co-authored-by: Misha Chornyi <99709299+mc-nv@users.noreply.github.com>
---
 .../config/input/config_command_profile.py | 234 ++++++++++--------
 .../config/input/config_defaults.py        |   5 +
 tests/test_cli.py                          |  45 ++--
 3 files changed, 155 insertions(+), 129 deletions(-)

diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py
index 02d6def28..9c40f16ef 100755
--- a/model_analyzer/config/input/config_command_profile.py
+++ b/model_analyzer/config/input/config_command_profile.py
@@ -23,6 +23,7 @@
 from google.protobuf.descriptor import FieldDescriptor
 from tritonclient.grpc.model_config_pb2 import ModelConfig
 
+import model_analyzer.config.input.config_defaults as config_defaults
 from model_analyzer.config.input.config_utils import (
     binary_path_validator,
     file_path_validator,
@@ -36,62 +37,6 @@
 from model_analyzer.triton.server.server_config import TritonServerConfig
 
 from .config_command import ConfigCommand
-from .config_defaults import (
-    DEFAULT_ALWAYS_REPORT_GPU_METRICS,
-    DEFAULT_BATCH_SIZES,
-    DEFAULT_CHECKPOINT_DIRECTORY,
-    DEFAULT_CLIENT_PROTOCOL,
-    DEFAULT_COLLECT_CPU_METRICS,
-    DEFAULT_DURATION_SECONDS,
-    DEFAULT_EXPORT_PATH,
-    DEFAULT_FILENAME_MODEL_GPU,
-    DEFAULT_FILENAME_MODEL_INFERENCE,
-    DEFAULT_FILENAME_SERVER_ONLY,
-    DEFAULT_GPU_OUTPUT_FIELDS,
-    DEFAULT_GPUS,
-    DEFAULT_INFERENCE_OUTPUT_FIELDS,
-    DEFAULT_MAX_RETRIES,
-    DEFAULT_MODEL_WEIGHTING,
-    DEFAULT_MONITORING_INTERVAL,
-    DEFAULT_NUM_CONFIGS_PER_MODEL,
-    DEFAULT_NUM_TOP_MODEL_CONFIGS,
-    DEFAULT_OFFLINE_OBJECTIVES,
-    DEFAULT_OFFLINE_PLOTS,
-    DEFAULT_ONLINE_OBJECTIVES,
-    DEFAULT_ONLINE_PLOTS,
-    DEFAULT_OUTPUT_MODEL_REPOSITORY,
-    DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG,
-    DEFAULT_PERF_ANALYZER_CPU_UTIL,
-    DEFAULT_PERF_ANALYZER_PATH,
-    DEFAULT_PERF_ANALYZER_TIMEOUT,
-    DEFAULT_PERF_MAX_AUTO_ADJUSTS,
-    DEFAULT_PERF_OUTPUT_FLAG,
-    DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS,
-    DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS,
-    DEFAULT_REQUEST_RATE_SEARCH_ENABLE,
-    DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS,
-    DEFAULT_RUN_CONFIG_MAX_CONCURRENCY,
-    DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT,
-    DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE,
-    DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE,
-    DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
-    DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT,
-    DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE,
-    DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
-    DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE,
-    DEFAULT_RUN_CONFIG_SEARCH_DISABLE,
-    DEFAULT_RUN_CONFIG_SEARCH_MODE,
-    DEFAULT_SERVER_OUTPUT_FIELDS,
-    DEFAULT_SKIP_DETAILED_REPORTS,
-    DEFAULT_SKIP_SUMMARY_REPORTS,
-    DEFAULT_TRITON_DOCKER_IMAGE,
-    DEFAULT_TRITON_GRPC_ENDPOINT,
-    DEFAULT_TRITON_HTTP_ENDPOINT,
-    DEFAULT_TRITON_INSTALL_PATH,
-    DEFAULT_TRITON_LAUNCH_MODE,
-    DEFAULT_TRITON_METRICS_URL,
-    DEFAULT_TRITON_SERVER_PATH,
-)
 from .config_enum import ConfigEnum
 from .config_field import ConfigField
 from .config_list_generic import ConfigListGeneric
@@ -224,7 +169,7 @@ def _fill_config(self):
             ConfigField(
                 "checkpoint_directory",
                 flags=["-s", "--checkpoint-directory"],
-                default_value=DEFAULT_CHECKPOINT_DIRECTORY,
+                default_value=config_defaults.DEFAULT_CHECKPOINT_DIRECTORY,
                 field_type=ConfigPrimitive(str, validator=parent_path_validator),
                 description="Full path to directory to which to read and write checkpoints and profile data.",
             )
@@ -234,7 +179,7 @@ def _fill_config(self):
                 "monitoring_interval",
                 flags=["-i", "--monitoring-interval"],
                 field_type=ConfigPrimitive(float),
-                default_value=DEFAULT_MONITORING_INTERVAL,
+                default_value=config_defaults.DEFAULT_MONITORING_INTERVAL,
                 description="Interval of time between metrics measurements in seconds",
             )
         )
@@ -243,7 +188,7 @@ def _fill_config(self):
                 "duration_seconds",
                 field_type=ConfigPrimitive(int),
                 flags=["-d", "--duration-seconds"],
-                default_value=DEFAULT_DURATION_SECONDS,
+                default_value=config_defaults.DEFAULT_DURATION_SECONDS,
                 description="Specifies how long (seconds) to gather server-only metrics",
             )
         )
@@ -253,7 +198,7 @@ def _fill_config(self):
                 field_type=ConfigPrimitive(bool),
                 flags=["--collect-cpu-metrics"],
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_COLLECT_CPU_METRICS,
+                default_value=config_defaults.DEFAULT_COLLECT_CPU_METRICS,
                 description="Specify whether CPU metrics are collected or not",
             )
         )
@@ -262,7 +207,7 @@ def _fill_config(self):
                 "gpus",
                 flags=["--gpus"],
                 field_type=ConfigListString(),
-                default_value=DEFAULT_GPUS,
+                default_value=config_defaults.DEFAULT_GPUS,
                 description="List of GPU UUIDs to be used for the profiling. "
                 "Use 'all' to profile all the GPUs visible by CUDA.",
             )
@@ -273,7 +218,7 @@ def _fill_config(self):
                 flags=["--always-report-gpu-metrics"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_ALWAYS_REPORT_GPU_METRICS,
+                default_value=config_defaults.DEFAULT_ALWAYS_REPORT_GPU_METRICS,
                 description="Report GPU metrics, even when the model is `cpu_only`.",
             )
         )
@@ -283,7 +228,7 @@ def _fill_config(self):
                 flags=["--skip-summary-reports"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_SKIP_SUMMARY_REPORTS,
+                default_value=config_defaults.DEFAULT_SKIP_SUMMARY_REPORTS,
                 description="Skips the generation of analysis summary reports and tables.",
             )
         )
@@ -293,7 +238,7 @@ def _fill_config(self):
                 flags=["--skip-detailed-reports"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_SKIP_DETAILED_REPORTS,
+                default_value=config_defaults.DEFAULT_SKIP_DETAILED_REPORTS,
                 description="Skips the generation of detailed summary reports and tables.",
             )
         )
@@ -325,7 +270,7 @@ def _add_repository_configs(self):
             ConfigField(
                 "output_model_repository_path",
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_OUTPUT_MODEL_REPOSITORY,
+                default_value=config_defaults.DEFAULT_OUTPUT_MODEL_REPOSITORY,
                 flags=["--output-model-repository-path"],
                 description="Output model repository path used by Model Analyzer."
                 " This is the directory that will contain all the generated model configurations",
@@ -336,7 +281,7 @@ def _add_repository_configs(self):
                 "override_output_model_repository",
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG,
+                default_value=config_defaults.DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG,
                 flags=["--override-output-model-repository"],
                 description="Will override the contents of the output model repository"
                 " and replace it with the new results.",
@@ -520,7 +465,7 @@ def _add_profile_models_configs(self):
             ConfigField(
                 "objectives",
                 field_type=objectives_scheme,
-                default_value=DEFAULT_OFFLINE_OBJECTIVES,
+                default_value=config_defaults.DEFAULT_OFFLINE_OBJECTIVES,
                 description="Model Analyzer uses the objectives described here to find the best configuration for each model.",
             )
         )
@@ -602,7 +547,7 @@ def _add_profile_models_configs(self):
                 "batch_sizes",
                 flags=["-b", "--batch-sizes"],
                 field_type=ConfigListNumeric(int),
-                default_value=DEFAULT_BATCH_SIZES,
+                default_value=config_defaults.DEFAULT_BATCH_SIZES,
                 description="Comma-delimited list of batch sizes to use for the profiling",
             )
         )
@@ -624,6 +569,24 @@ def _add_profile_models_configs(self):
                 " to be used during profiling",
             )
         )
+        self._add_config(
+            ConfigField(
+                "prompt_length",
+                flags=["--prompt-length"],
+                field_type=ConfigListNumeric(int),
+                description="Comma-delimited list of prompt length values or ranges"
+                " to be used when profiling LLMs",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "max_token_count",
+                flags=["--max-token-count"],
+                field_type=ConfigListNumeric(int),
+                description="Comma-delimited list of max token count values or ranges"
+                " to be used when profiling LLMs",
+            )
+        )
         self._add_config(
             ConfigField(
                 "reload_model_disable",
@@ -685,7 +648,7 @@ def _add_client_configs(self):
                 "client_max_retries",
                 flags=["-r", "--client-max-retries"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_MAX_RETRIES,
+                default_value=config_defaults.DEFAULT_MAX_RETRIES,
                 description="Specifies the max number of retries for any requests to Triton server.",
             )
         )
@@ -695,7 +658,7 @@ def _add_client_configs(self):
                 "client_protocol",
                 flags=["--client-protocol"],
                 choices=["http", "grpc"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_CLIENT_PROTOCOL,
+                default_value=config_defaults.DEFAULT_CLIENT_PROTOCOL,
                 description="The protocol used to communicate with the Triton Inference Server",
             )
         )
@@ -721,7 +684,7 @@ def _add_run_search_configs(self):
                 "run_config_search_max_concurrency",
                 flags=["--run-config-search-max-concurrency"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MAX_CONCURRENCY,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY,
                 description="Max concurrency value that run config search should not go beyond that.",
             )
         )
@@ -730,7 +693,7 @@ def _add_run_search_configs(self):
                 "run_config_search_min_concurrency",
                 flags=["--run-config-search-min-concurrency"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
                 description="Min concurrency value that run config search should start with.",
             )
         )
@@ -739,7 +702,7 @@ def _add_run_search_configs(self):
                 "run_config_search_max_request_rate",
                 flags=["--run-config-search-max-request-rate"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE,
                 description="Max request rate value that run config search should not go beyond that.",
             )
         )
@@ -748,7 +711,7 @@ def _add_run_search_configs(self):
                 "run_config_search_min_request_rate",
                 flags=["--run-config-search-min-request-rate"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
                 description="Min request rate value that run config search should start with.",
             )
         )
@@ -757,7 +720,7 @@ def _add_run_search_configs(self):
                 "run_config_search_max_instance_count",
                 flags=["--run-config-search-max-instance-count"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT,
                 description="Max instance count value that run config search should not go beyond that.",
             )
         )
@@ -766,7 +729,7 @@ def _add_run_search_configs(self):
                 "run_config_search_min_instance_count",
                 flags=["--run-config-search-min-instance-count"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT,
                 description="Min instance count value that run config search should start with.",
             )
         )
@@ -775,7 +738,7 @@ def _add_run_search_configs(self):
                 "run_config_search_max_model_batch_size",
                 flags=["--run-config-search-max-model-batch-size"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE,
                 description="Value for the model's max_batch_size that run config search will not go beyond.",
             )
         )
@@ -784,7 +747,7 @@ def _add_run_search_configs(self):
                 "run_config_search_min_model_batch_size",
                 flags=["--run-config-search-min-model-batch-size"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE,
                 description="Value for the model's max_batch_size that run config search will start from.",
             )
         )
@@ -793,7 +756,7 @@ def _add_run_search_configs(self):
                 "run_config_search_max_binary_search_steps",
                 flags=["--run-config-search-max-binary-search-steps"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS,
                 description="Maximum number of steps take during the binary concurrency search.",
             )
         )
@@ -803,7 +766,7 @@ def _add_run_search_configs(self):
                 "run_config_search_mode",
                 flags=["--run-config-search-mode"],
                 choices=["brute", "quick"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_RUN_CONFIG_SEARCH_MODE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_SEARCH_MODE,
                 description="The search mode for Model Analyzer to find and evaluate"
                 " model configurations. 'brute' will brute force all combinations of"
                 " configuration options. 'quick' will attempt to find a near-optimal"
@@ -817,7 +780,7 @@ def _add_run_search_configs(self):
                 flags=["--run-config-search-disable"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_RUN_CONFIG_SEARCH_DISABLE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_SEARCH_DISABLE,
                 description="Disable run config search.",
             )
         )
@@ -827,7 +790,7 @@ def _add_run_search_configs(self):
                 flags=["--run-config-profile-models-concurrently-enable"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE,
                 description="Enable the profiling of all supplied models concurrently.",
             )
         )
@@ -837,10 +800,56 @@ def _add_run_search_configs(self):
                 flags=["--request-rate-search-enable"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_REQUEST_RATE_SEARCH_ENABLE,
+                default_value=config_defaults.DEFAULT_REQUEST_RATE_SEARCH_ENABLE,
                 description="Enables the searching of request rate (instead of concurrency).",
             )
         )
+        self._add_config(
+            ConfigField(
+                "llm_search_enable",
+                flags=["--llm-search-enable"],
+                field_type=ConfigPrimitive(bool),
+                parser_args={"action": "store_true"},
+                default_value=config_defaults.DEFAULT_LLM_SEARCH_ENABLE,
+                description="Enables searching over values that are important to LLMs: prompt length, max token count, etc.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_min_prompt_length",
+                flags=["--run-config-search-min-prompt-length"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH,
+                description="Min prompt length that run config search should start with.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_max_prompt_length",
+                flags=["--run-config-search-max-prompt-length"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH,
+                description="Max prompt length that run config search will not go beyond.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_min_token_count",
+                flags=["--run-config-search-min-token-count"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT,
+                description="Min token count that run config search should start with.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_max_token_count",
+                flags=["--run-config-search-max-token-count"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
+                description="Max token count that run config search will not go beyond.",
+            )
+        )
 
     def _add_triton_configs(self):
         """
@@ -853,7 +862,7 @@ def _add_triton_configs(self):
                 "triton_launch_mode",
                 field_type=ConfigPrimitive(str),
                 flags=["--triton-launch-mode"],
-                default_value=DEFAULT_TRITON_LAUNCH_MODE,
+                default_value=config_defaults.DEFAULT_TRITON_LAUNCH_MODE,
                 choices=["local", "docker", "remote", "c_api"],
                 description="The method by which to launch Triton Server. "
                 "'local' assumes tritonserver binary is available locally. "
@@ -869,7 +878,7 @@ def _add_triton_configs(self):
                 "triton_docker_image",
                 flags=["--triton-docker-image"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_TRITON_DOCKER_IMAGE,
+                default_value=config_defaults.DEFAULT_TRITON_DOCKER_IMAGE,
                 description="Triton Server Docker image tag",
             )
         )
@@ -878,7 +887,7 @@ def _add_triton_configs(self):
                 "triton_http_endpoint",
                 flags=["--triton-http-endpoint"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_TRITON_HTTP_ENDPOINT,
+                default_value=config_defaults.DEFAULT_TRITON_HTTP_ENDPOINT,
                 description="Triton Server HTTP endpoint url used by Model Analyzer client.",
             )
         )
@@ -887,7 +896,7 @@ def _add_triton_configs(self):
                 "triton_grpc_endpoint",
                 flags=["--triton-grpc-endpoint"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_TRITON_GRPC_ENDPOINT,
+                default_value=config_defaults.DEFAULT_TRITON_GRPC_ENDPOINT,
                 description="Triton Server HTTP endpoint url used by Model Analyzer client.",
             )
         )
@@ -896,7 +905,7 @@ def _add_triton_configs(self):
                 "triton_metrics_url",
                 field_type=ConfigPrimitive(str),
                 flags=["--triton-metrics-url"],
-                default_value=DEFAULT_TRITON_METRICS_URL,
+                default_value=config_defaults.DEFAULT_TRITON_METRICS_URL,
                 description="Triton Server Metrics endpoint url. ",
             )
         )
@@ -905,7 +914,7 @@ def _add_triton_configs(self):
                 "triton_server_path",
                 field_type=ConfigPrimitive(str),
                 flags=["--triton-server-path"],
-                default_value=DEFAULT_TRITON_SERVER_PATH,
+                default_value=config_defaults.DEFAULT_TRITON_SERVER_PATH,
                 description="The full path to the tritonserver binary executable",
             )
         )
@@ -953,7 +962,7 @@ def _add_triton_configs(self):
             ConfigField(
                 "triton_install_path",
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_TRITON_INSTALL_PATH,
+                default_value=config_defaults.DEFAULT_TRITON_INSTALL_PATH,
                 flags=["--triton-install-path"],
                 description=(
                     "Path to Triton install directory i.e. the parent directory of 'lib/libtritonserver.so'."
@@ -973,7 +982,7 @@ def _add_perf_analyzer_configs(self):
                 "perf_analyzer_timeout",
                 flags=["--perf-analyzer-timeout"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_PERF_ANALYZER_TIMEOUT,
+                default_value=config_defaults.DEFAULT_PERF_ANALYZER_TIMEOUT,
                 description="Perf analyzer timeout value in seconds.",
             )
         )
@@ -982,7 +991,8 @@ def _add_perf_analyzer_configs(self):
                 "perf_analyzer_cpu_util",
                 flags=["--perf-analyzer-cpu-util"],
                 field_type=ConfigPrimitive(float),
-                default_value=psutil.cpu_count() * DEFAULT_PERF_ANALYZER_CPU_UTIL,
+                default_value=psutil.cpu_count()
+                * config_defaults.DEFAULT_PERF_ANALYZER_CPU_UTIL,
                 description="Maximum CPU utilization value allowed for the perf_analyzer.",
             )
         )
@@ -991,7 +1001,7 @@ def _add_perf_analyzer_configs(self):
                 "perf_analyzer_path",
                 flags=["--perf-analyzer-path"],
                 field_type=ConfigPrimitive(str, validator=binary_path_validator),
-                default_value=DEFAULT_PERF_ANALYZER_PATH,
+                default_value=config_defaults.DEFAULT_PERF_ANALYZER_PATH,
                 description="The full path to the perf_analyzer binary executable",
             )
         )
@@ -1001,7 +1011,7 @@ def _add_perf_analyzer_configs(self):
                 flags=["--perf-output"],
                 parser_args={"action": "store_true"},
                 field_type=ConfigPrimitive(bool),
-                default_value=DEFAULT_PERF_OUTPUT_FLAG,
+                default_value=config_defaults.DEFAULT_PERF_OUTPUT_FLAG,
                 description="Enables the output from the perf_analyzer to a file specified by"
                 " perf_output_path. If perf_output_path is None, output will be"
                 " written to stdout.",
@@ -1020,7 +1030,7 @@ def _add_perf_analyzer_configs(self):
                 "perf_analyzer_max_auto_adjusts",
                 flags=["--perf-analyzer-max-auto-adjusts"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_PERF_MAX_AUTO_ADJUSTS,
+                default_value=config_defaults.DEFAULT_PERF_MAX_AUTO_ADJUSTS,
                 description="Maximum number of times perf_analyzer is "
                 "launched with auto adjusted parameters in an attempt to profile a model. ",
             )
         )
@@ -1034,7 +1044,7 @@ def _add_export_configs(self):
             ConfigField(
                 "export_path",
                 flags=["-e", "--export-path"],
-                default_value=DEFAULT_EXPORT_PATH,
+                default_value=config_defaults.DEFAULT_EXPORT_PATH,
                 field_type=ConfigPrimitive(str, validator=parent_path_validator),
                 description="Full path to directory in which to store the results",
             )
@@ -1043,7 +1053,7 @@ def _add_export_configs(self):
             ConfigField(
                 "filename_model_inference",
                 flags=["--filename-model-inference"],
-                default_value=DEFAULT_FILENAME_MODEL_INFERENCE,
+                default_value=config_defaults.DEFAULT_FILENAME_MODEL_INFERENCE,
                 field_type=ConfigPrimitive(str),
                 description="Specifies filename for storing model inference metrics",
             )
@@ -1053,7 +1063,7 @@ def _add_export_configs(self):
                 "filename_model_gpu",
                 flags=["--filename-model-gpu"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_FILENAME_MODEL_GPU,
+                default_value=config_defaults.DEFAULT_FILENAME_MODEL_GPU,
                 description="Specifies filename for storing model GPU metrics",
             )
         )
@@ -1062,7 +1072,7 @@ def _add_export_configs(self):
                 "filename_server_only",
                 flags=["--filename-server-only"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_FILENAME_SERVER_ONLY,
+                default_value=config_defaults.DEFAULT_FILENAME_SERVER_ONLY,
                 description="Specifies filename for server-only metrics",
             )
         )
@@ -1076,7 +1086,7 @@ def _add_report_configs(self):
                 "num_configs_per_model",
                 flags=["--num-configs-per-model"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_NUM_CONFIGS_PER_MODEL,
+                default_value=config_defaults.DEFAULT_NUM_CONFIGS_PER_MODEL,
                 description="The number of configurations to plot per model in the summary.",
             )
         )
@@ -1085,7 +1095,7 @@ def _add_report_configs(self):
                 "num_top_model_configs",
                 flags=["--num-top-model-configs"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_NUM_TOP_MODEL_CONFIGS,
+                default_value=config_defaults.DEFAULT_NUM_TOP_MODEL_CONFIGS,
                 description="Model Analyzer will compare this many of the top models configs across all models.",
             )
         )
@@ -1100,7 +1110,7 @@ def _add_table_configs(self):
                 "inference_output_fields",
                 flags=["--inference-output-fields"],
                 field_type=ConfigListString(),
-                default_value=DEFAULT_INFERENCE_OUTPUT_FIELDS,
+                default_value=config_defaults.DEFAULT_INFERENCE_OUTPUT_FIELDS,
                 description="Specifies column keys for model inference metrics table",
             )
         )
@@ -1109,7 +1119,7 @@ def _add_table_configs(self):
                 "gpu_output_fields",
                 flags=["--gpu-output-fields"],
                 field_type=ConfigListString(),
-                default_value=DEFAULT_GPU_OUTPUT_FIELDS,
+                default_value=config_defaults.DEFAULT_GPU_OUTPUT_FIELDS,
                 description="Specifies column keys for model gpu metrics table",
             )
         )
@@ -1118,7 +1128,7 @@ def _add_table_configs(self):
                 "server_output_fields",
                 flags=["--server-output-fields"],
                 field_type=ConfigListString(),
-                default_value=DEFAULT_SERVER_OUTPUT_FIELDS,
+                default_value=config_defaults.DEFAULT_SERVER_OUTPUT_FIELDS,
                 description="Specifies column keys for server-only metrics table",
             )
         )
@@ -1163,7 +1173,9 @@ def set_config_values(self, args: argparse.Namespace) -> None:
             this exception
         """
 
         if args.mode == "online" and "latency_budget" not in args:
-            self._fields["objectives"].set_default_value(DEFAULT_ONLINE_OBJECTIVES)
+            self._fields["objectives"].set_default_value(
+                config_defaults.DEFAULT_ONLINE_OBJECTIVES
+            )
 
         super().set_config_values(args)
@@ -1171,9 +1183,9 @@
         # able to edit these plots.
         self._add_plot_configs()
         if args.mode == "online":
-            self._fields["plots"].set_value(DEFAULT_ONLINE_PLOTS)
+            self._fields["plots"].set_value(config_defaults.DEFAULT_ONLINE_PLOTS)
         elif args.mode == "offline":
-            self._fields["plots"].set_value(DEFAULT_OFFLINE_PLOTS)
+            self._fields["plots"].set_value(config_defaults.DEFAULT_OFFLINE_PLOTS)
 
     def _add_plot_configs(self):
         """
@@ -1336,11 +1348,13 @@ def _autofill_values(self):
         if self._using_request_rate():
             if not self._fields["inference_output_fields"].is_set_by_user():
                 self.inference_output_fields = (
-                    DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS
+                    config_defaults.DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS
                 )
             if not self._fields["gpu_output_fields"].is_set_by_user():
-                self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS
+                self.gpu_output_fields = (
+                    config_defaults.DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS
+                )
 
         new_profile_models = {}
         for i, model in enumerate(self.profile_models):
@@ -1369,7 +1383,7 @@ def _autofill_values(self):
                         "Weighting can not be specified as a global parameter. Please make this a model parameter."
                     )
                 else:
-                    new_model["weighting"] = DEFAULT_MODEL_WEIGHTING
+                    new_model["weighting"] = config_defaults.DEFAULT_MODEL_WEIGHTING
             else:
                 new_model["weighting"] = model.weighting()
diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py
index 67c62dca9..c2edd6e91 100755
--- a/model_analyzer/config/input/config_defaults.py
+++ b/model_analyzer/config/input/config_defaults.py
@@ -51,10 +51,15 @@
 DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1
 DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128
 DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5
+DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH = 1
+DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1000
+DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1
+DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256
 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False
 DEFAULT_RUN_CONFIG_SEARCH_MODE = "brute"
 DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False
 DEFAULT_REQUEST_RATE_SEARCH_ENABLE = False
+DEFAULT_LLM_SEARCH_ENABLE = False
 DEFAULT_TRITON_LAUNCH_MODE = "local"
 DEFAULT_TRITON_DOCKER_IMAGE = "nvcr.io/nvidia/tritonserver:23.09-py3"
 DEFAULT_TRITON_HTTP_ENDPOINT = "localhost:8000"
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 98ec60237..75be15038 100755
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -30,10 +30,10 @@
 
 import psutil
 
+import model_analyzer.config.input.config_defaults as config_defaults
 from model_analyzer.cli.cli import CLI
 from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
 from model_analyzer.config.input.config_command_report import ConfigCommandReport
-from model_analyzer.config.input.config_defaults import DEFAULT_TRITON_DOCKER_IMAGE
 from model_analyzer.config.input.config_status import ConfigStatus
 from model_analyzer.constants import CONFIG_PARSER_SUCCESS
 from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
@@ -60,6 +60,7 @@ def get_test_options():
         OptionStruct("bool", "profile","--run-config-search-disable"),
         OptionStruct("bool", "profile","--run-config-profile-models-concurrently-enable"),
         OptionStruct("bool", "profile","--request-rate-search-enable"),
+        OptionStruct("bool", "profile","--llm-search-enable"),
         OptionStruct("bool", "profile","--reload-model-disable"),
         OptionStruct("bool", "profile","--early-exit-enable"),
         OptionStruct("bool", "profile","--skip-summary-reports"),
@@ -71,23 +72,27 @@
         # The following options can be None:
         # short_option
         # expected_default_value
-        OptionStruct("int", "profile", "--client-max-retries", "-r", "125", "50"),
-        OptionStruct("int", "profile", "--duration-seconds", "-d", "10", "3"),
-        OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", "600"),
-        OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", "10"),
-        OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", "1024"),
-        OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", "16"),
-        OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", "8192"),
-        OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", "128"),
-        OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", "5"),
-        OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", "5"),
-        OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", "1.0"),
-        OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * 80.0)),
-        OptionStruct("int", "profile", "--num-configs-per-model", None, "10", "3"),
-        OptionStruct("int", "profile", "--num-top-model-configs", None, "10", "0"),
+        OptionStruct("int", "profile", "--client-max-retries", "-r", "125", str(config_defaults.DEFAULT_MAX_RETRIES)),
+        OptionStruct("int", "profile", "--duration-seconds", "-d", "10", str(config_defaults.DEFAULT_DURATION_SECONDS)),
+        OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", str(config_defaults.DEFAULT_PERF_ANALYZER_TIMEOUT)),
+        OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", str(config_defaults.DEFAULT_PERF_MAX_AUTO_ADJUSTS)),
+        OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)),
+        OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)),
+        OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE)),
+        OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE)),
+        OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE)),
+        OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE)),
+        OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)),
+        OptionStruct("int", "profile", "--run-config-search-min-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH)),
+        OptionStruct("int", "profile", "--run-config-search-max-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH)),
+        OptionStruct("int", "profile", "--run-config-search-min-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT)),
+        OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(config_defaults.DEFAULT_MONITORING_INTERVAL)),
+        OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * config_defaults.DEFAULT_PERF_ANALYZER_CPU_UTIL)),
+        OptionStruct("int", "profile", "--num-configs-per-model", None, "10", str(config_defaults.DEFAULT_NUM_CONFIGS_PER_MODEL)),
+        OptionStruct("int", "profile", "--num-top-model-configs", None, "10", str(config_defaults.DEFAULT_NUM_TOP_MODEL_CONFIGS)),
 
         OptionStruct("int", "profile", "--latency-budget", None, "200", None),
         OptionStruct("int", "profile", "--min-throughput", None, "300", None),
@@ -105,7 +110,7 @@
         OptionStruct("string", "profile", "--client-protocol", None, ["http", "grpc"], "grpc", "SHOULD_FAIL"),
         OptionStruct("string", "profile", "--perf-analyzer-path", None, ".", "perf_analyzer", None),
         OptionStruct("string", "profile", "--perf-output-path", None, ".", None, None),
-        OptionStruct("string", "profile", "--triton-docker-image", None, "test_image", DEFAULT_TRITON_DOCKER_IMAGE, None),
+        OptionStruct("string", "profile", "--triton-docker-image", None, "test_image", config_defaults.DEFAULT_TRITON_DOCKER_IMAGE, None),
         OptionStruct("string", "profile", "--triton-http-endpoint", None, "localhost:4000", "localhost:8000", None),
         OptionStruct("string", "profile", "--triton-grpc-endpoint", None, "localhost:4001", "localhost:8001", None),
         OptionStruct("string", "profile", "--triton-metrics-url", None, "localhost:4002", "http://localhost:8002/metrics", None),
@@ -135,6 +140,8 @@
         OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"),
         OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None),
         OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None),
+        OptionStruct("intlist", "profile", "--prompt-length", None, "1, 2, 3", None),
+        OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 3", None),
         OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]),
         OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"),
         OptionStruct("stringlist", "profile", "--inference-output-fields", None, "a, b, c",
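
A minimal usage sketch for the options this patch introduces. "model-analyzer profile", --model-repository, and --profile-models are the tool's standard entry points; the model name, repository path, and value lists below are hypothetical, while the LLM flags are the ones added above:

    model-analyzer profile \
        --model-repository /path/to/model/repository \
        --profile-models my_llm \
        --llm-search-enable \
        --prompt-length 128,512,1024 \
        --max-token-count 32,256

If the explicit lists are omitted, run config search is presumably bounded by the new defaults instead: --run-config-search-min/max-prompt-length (1 to 1000) and --run-config-search-min/max-token-count (1 to 256).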