From c9d467f74717fd800b0ad6b60be0220bf9bcd74b Mon Sep 17 00:00:00 2001
From: Brian Raf <92820864+nv-braf@users.noreply.github.com>
Date: Tue, 3 Oct 2023 08:05:04 -0700
Subject: [PATCH] Adding new options for LLM (#768)

* Update README and versions for 23.09 branch (#761) (#767)

* Adding new options for LLM

* Fixing codeQL issues

* Fixing codeQL issue

---------

Co-authored-by: Misha Chornyi <99709299+mc-nv@users.noreply.github.com>
---
 .../config/input/config_command_profile.py | 234 ++++++++++--------
 .../config/input/config_defaults.py        |   5 +
 tests/test_cli.py                          |  45 ++--
 3 files changed, 155 insertions(+), 129 deletions(-)

diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py
index 02d6def28..9c40f16ef 100755
--- a/model_analyzer/config/input/config_command_profile.py
+++ b/model_analyzer/config/input/config_command_profile.py
@@ -23,6 +23,7 @@
 from google.protobuf.descriptor import FieldDescriptor
 from tritonclient.grpc.model_config_pb2 import ModelConfig
 
+import model_analyzer.config.input.config_defaults as config_defaults
 from model_analyzer.config.input.config_utils import (
     binary_path_validator,
     file_path_validator,
@@ -36,62 +37,6 @@
 from model_analyzer.triton.server.server_config import TritonServerConfig
 
 from .config_command import ConfigCommand
-from .config_defaults import (
-    DEFAULT_ALWAYS_REPORT_GPU_METRICS,
-    DEFAULT_BATCH_SIZES,
-    DEFAULT_CHECKPOINT_DIRECTORY,
-    DEFAULT_CLIENT_PROTOCOL,
-    DEFAULT_COLLECT_CPU_METRICS,
-    DEFAULT_DURATION_SECONDS,
-    DEFAULT_EXPORT_PATH,
-    DEFAULT_FILENAME_MODEL_GPU,
-    DEFAULT_FILENAME_MODEL_INFERENCE,
-    DEFAULT_FILENAME_SERVER_ONLY,
-    DEFAULT_GPU_OUTPUT_FIELDS,
-    DEFAULT_GPUS,
-    DEFAULT_INFERENCE_OUTPUT_FIELDS,
-    DEFAULT_MAX_RETRIES,
-    DEFAULT_MODEL_WEIGHTING,
-    DEFAULT_MONITORING_INTERVAL,
-    DEFAULT_NUM_CONFIGS_PER_MODEL,
-    DEFAULT_NUM_TOP_MODEL_CONFIGS,
-    DEFAULT_OFFLINE_OBJECTIVES,
-    DEFAULT_OFFLINE_PLOTS,
-    DEFAULT_ONLINE_OBJECTIVES,
-    DEFAULT_ONLINE_PLOTS,
-    DEFAULT_OUTPUT_MODEL_REPOSITORY,
-    DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG,
-    DEFAULT_PERF_ANALYZER_CPU_UTIL,
-    DEFAULT_PERF_ANALYZER_PATH,
-    DEFAULT_PERF_ANALYZER_TIMEOUT,
-    DEFAULT_PERF_MAX_AUTO_ADJUSTS,
-    DEFAULT_PERF_OUTPUT_FLAG,
-    DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS,
-    DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS,
-    DEFAULT_REQUEST_RATE_SEARCH_ENABLE,
-    DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS,
-    DEFAULT_RUN_CONFIG_MAX_CONCURRENCY,
-    DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT,
-    DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE,
-    DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE,
-    DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
-    DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT,
-    DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE,
-    DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
-    DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE,
-    DEFAULT_RUN_CONFIG_SEARCH_DISABLE,
-    DEFAULT_RUN_CONFIG_SEARCH_MODE,
-    DEFAULT_SERVER_OUTPUT_FIELDS,
-    DEFAULT_SKIP_DETAILED_REPORTS,
-    DEFAULT_SKIP_SUMMARY_REPORTS,
-    DEFAULT_TRITON_DOCKER_IMAGE,
-    DEFAULT_TRITON_GRPC_ENDPOINT,
-    DEFAULT_TRITON_HTTP_ENDPOINT,
-    DEFAULT_TRITON_INSTALL_PATH,
-    DEFAULT_TRITON_LAUNCH_MODE,
-    DEFAULT_TRITON_METRICS_URL,
-    DEFAULT_TRITON_SERVER_PATH,
-)
 from .config_enum import ConfigEnum
 from .config_field import ConfigField
 from .config_list_generic import ConfigListGeneric
@@ -224,7 +169,7 @@ def _fill_config(self):
             ConfigField(
                 "checkpoint_directory",
                 flags=["-s", "--checkpoint-directory"],
-                default_value=DEFAULT_CHECKPOINT_DIRECTORY,
+                default_value=config_defaults.DEFAULT_CHECKPOINT_DIRECTORY,
                 field_type=ConfigPrimitive(str, validator=parent_path_validator),
                 description="Full path to directory to which to read and write checkpoints and profile data.",
             )
@@ -234,7 +179,7 @@ def _fill_config(self):
                 "monitoring_interval",
                 flags=["-i", "--monitoring-interval"],
                 field_type=ConfigPrimitive(float),
-                default_value=DEFAULT_MONITORING_INTERVAL,
+                default_value=config_defaults.DEFAULT_MONITORING_INTERVAL,
                 description="Interval of time between metrics measurements in seconds",
             )
         )
@@ -243,7 +188,7 @@ def _fill_config(self):
                 "duration_seconds",
                 field_type=ConfigPrimitive(int),
                 flags=["-d", "--duration-seconds"],
-                default_value=DEFAULT_DURATION_SECONDS,
+                default_value=config_defaults.DEFAULT_DURATION_SECONDS,
                 description="Specifies how long (seconds) to gather server-only metrics",
             )
         )
@@ -253,7 +198,7 @@ def _fill_config(self):
                 field_type=ConfigPrimitive(bool),
                 flags=["--collect-cpu-metrics"],
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_COLLECT_CPU_METRICS,
+                default_value=config_defaults.DEFAULT_COLLECT_CPU_METRICS,
                 description="Specify whether CPU metrics are collected or not",
             )
         )
@@ -262,7 +207,7 @@ def _fill_config(self):
                 "gpus",
                 flags=["--gpus"],
                 field_type=ConfigListString(),
-                default_value=DEFAULT_GPUS,
+                default_value=config_defaults.DEFAULT_GPUS,
                 description="List of GPU UUIDs to be used for the profiling. "
                 "Use 'all' to profile all the GPUs visible by CUDA.",
             )
@@ -273,7 +218,7 @@ def _fill_config(self):
                 flags=["--always-report-gpu-metrics"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_ALWAYS_REPORT_GPU_METRICS,
+                default_value=config_defaults.DEFAULT_ALWAYS_REPORT_GPU_METRICS,
                 description="Report GPU metrics, even when the model is `cpu_only`.",
             )
         )
@@ -283,7 +228,7 @@ def _fill_config(self):
                 flags=["--skip-summary-reports"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_SKIP_SUMMARY_REPORTS,
+                default_value=config_defaults.DEFAULT_SKIP_SUMMARY_REPORTS,
                 description="Skips the generation of analysis summary reports and tables.",
             )
         )
@@ -293,7 +238,7 @@ def _fill_config(self):
                 flags=["--skip-detailed-reports"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_SKIP_DETAILED_REPORTS,
+                default_value=config_defaults.DEFAULT_SKIP_DETAILED_REPORTS,
                 description="Skips the generation of detailed summary reports and tables.",
             )
         )
@@ -325,7 +270,7 @@ def _add_repository_configs(self):
             ConfigField(
                 "output_model_repository_path",
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_OUTPUT_MODEL_REPOSITORY,
+                default_value=config_defaults.DEFAULT_OUTPUT_MODEL_REPOSITORY,
                 flags=["--output-model-repository-path"],
                 description="Output model repository path used by Model Analyzer."
                 " This is the directory that will contain all the generated model configurations",
@@ -336,7 +281,7 @@ def _add_repository_configs(self):
                 "override_output_model_repository",
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG,
+                default_value=config_defaults.DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG,
                 flags=["--override-output-model-repository"],
                 description="Will override the contents of the output model repository"
                 " and replace it with the new results.",
@@ -520,7 +465,7 @@ def _add_profile_models_configs(self):
             ConfigField(
                 "objectives",
                 field_type=objectives_scheme,
-                default_value=DEFAULT_OFFLINE_OBJECTIVES,
+                default_value=config_defaults.DEFAULT_OFFLINE_OBJECTIVES,
                 description="Model Analyzer uses the objectives described here to find the best configuration for each model.",
             )
         )
@@ -602,7 +547,7 @@ def _add_profile_models_configs(self):
                 "batch_sizes",
                 flags=["-b", "--batch-sizes"],
                 field_type=ConfigListNumeric(int),
-                default_value=DEFAULT_BATCH_SIZES,
+                default_value=config_defaults.DEFAULT_BATCH_SIZES,
                 description="Comma-delimited list of batch sizes to use for the profiling",
             )
         )
@@ -624,6 +569,24 @@ def _add_profile_models_configs(self):
                 " to be used during profiling",
             )
         )
+        self._add_config(
+            ConfigField(
+                "prompt_length",
+                flags=["--prompt-length"],
+                field_type=ConfigListNumeric(int),
+                description="Comma-delimited list of prompt length values or ranges"
+                " to be used when profiling LLMs",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "max_token_count",
+                flags=["--max-token-count"],
+                field_type=ConfigListNumeric(int),
+                description="Comma-delimited list of max token count values or ranges"
+                " to be used when profiling LLMs",
+            )
+        )
         self._add_config(
             ConfigField(
                 "reload_model_disable",
@@ -685,7 +648,7 @@ def _add_client_configs(self):
                 "client_max_retries",
                 flags=["-r", "--client-max-retries"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_MAX_RETRIES,
+                default_value=config_defaults.DEFAULT_MAX_RETRIES,
                 description="Specifies the max number of retries for any requests to Triton server.",
             )
         )
@@ -695,7 +658,7 @@ def _add_client_configs(self):
                 "client_protocol",
                 flags=["--client-protocol"],
                 choices=["http", "grpc"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_CLIENT_PROTOCOL,
+                default_value=config_defaults.DEFAULT_CLIENT_PROTOCOL,
                 description="The protocol used to communicate with the Triton Inference Server",
             )
         )
@@ -721,7 +684,7 @@ def _add_run_search_configs(self):
                 "run_config_search_max_concurrency",
                 flags=["--run-config-search-max-concurrency"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MAX_CONCURRENCY,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY,
                 description="Max concurrency value that run config search should not go beyond that.",
             )
         )
@@ -730,7 +693,7 @@ def _add_run_search_configs(self):
                 "run_config_search_min_concurrency",
                 flags=["--run-config-search-min-concurrency"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
                 description="Min concurrency value that run config search should start with.",
             )
         )
@@ -739,7 +702,7 @@ def _add_run_search_configs(self):
                 "run_config_search_max_request_rate",
                 flags=["--run-config-search-max-request-rate"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE,
                 description="Max request rate value that run config search should not go beyond that.",
             )
         )
@@ -748,7 +711,7 @@ def _add_run_search_configs(self):
                 "run_config_search_min_request_rate",
                 flags=["--run-config-search-min-request-rate"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
                 description="Min request rate value that run config search should start with.",
             )
         )
@@ -757,7 +720,7 @@ def _add_run_search_configs(self):
                 "run_config_search_max_instance_count",
                 flags=["--run-config-search-max-instance-count"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT,
                 description="Max instance count value that run config search should not go beyond that.",
             )
         )
@@ -766,7 +729,7 @@ def _add_run_search_configs(self):
                 "run_config_search_min_instance_count",
                 flags=["--run-config-search-min-instance-count"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT,
                 description="Min instance count value that run config search should start with.",
             )
         )
@@ -775,7 +738,7 @@ def _add_run_search_configs(self):
                 "run_config_search_max_model_batch_size",
                 flags=["--run-config-search-max-model-batch-size"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE,
                 description="Value for the model's max_batch_size that run config search will not go beyond.",
             )
         )
@@ -784,7 +747,7 @@ def _add_run_search_configs(self):
                 "run_config_search_min_model_batch_size",
                 flags=["--run-config-search-min-model-batch-size"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE,
                 description="Value for the model's max_batch_size that run config search will start from.",
             )
         )
@@ -793,7 +756,7 @@ def _add_run_search_configs(self):
                 "run_config_search_max_binary_search_steps",
                 flags=["--run-config-search-max-binary-search-steps"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS,
                 description="Maximum number of steps take during the binary concurrency search.",
             )
         )
@@ -803,7 +766,7 @@ def _add_run_search_configs(self):
                 "run_config_search_mode",
                 flags=["--run-config-search-mode"],
                 choices=["brute", "quick"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_RUN_CONFIG_SEARCH_MODE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_SEARCH_MODE,
                 description="The search mode for Model Analyzer to find and evaluate"
                 " model configurations. 'brute' will brute force all combinations of"
                 " configuration options. 'quick' will attempt to find a near-optimal"
@@ -817,7 +780,7 @@ def _add_run_search_configs(self):
                 flags=["--run-config-search-disable"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_RUN_CONFIG_SEARCH_DISABLE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_SEARCH_DISABLE,
                 description="Disable run config search.",
             )
         )
@@ -827,7 +790,7 @@ def _add_run_search_configs(self):
                 flags=["--run-config-profile-models-concurrently-enable"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE,
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE,
                 description="Enable the profiling of all supplied models concurrently.",
             )
         )
@@ -837,10 +800,56 @@ def _add_run_search_configs(self):
                 flags=["--request-rate-search-enable"],
                 field_type=ConfigPrimitive(bool),
                 parser_args={"action": "store_true"},
-                default_value=DEFAULT_REQUEST_RATE_SEARCH_ENABLE,
+                default_value=config_defaults.DEFAULT_REQUEST_RATE_SEARCH_ENABLE,
                 description="Enables the searching of request rate (instead of concurrency).",
             )
         )
+        self._add_config(
+            ConfigField(
+                "llm_search_enable",
+                flags=["--llm-search-enable"],
+                field_type=ConfigPrimitive(bool),
+                parser_args={"action": "store_true"},
+                default_value=config_defaults.DEFAULT_LLM_SEARCH_ENABLE,
+                description="Enables searching over values that are important to LLMs: prompt length, max token count, etc.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_min_prompt_length",
+                flags=["--run-config-search-min-prompt-length"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH,
+                description="Min prompt length that run config search should start with.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_max_prompt_length",
+                flags=["--run-config-search-max-prompt-length"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH,
+                description="Max prompt length that run config search will not go beyond.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_min_token_count",
+                flags=["--run-config-search-min-token-count"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT,
+                description="Min token count that run config search should start with.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_max_token_count",
+                flags=["--run-config-search-max-token-count"],
+                field_type=ConfigPrimitive(int),
+                default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
+                description="Max token count that run config search will not go beyond.",
+            )
+        )
 
     def _add_triton_configs(self):
         """
@@ -853,7 +862,7 @@ def _add_triton_configs(self):
                 "triton_launch_mode",
                 field_type=ConfigPrimitive(str),
                 flags=["--triton-launch-mode"],
-                default_value=DEFAULT_TRITON_LAUNCH_MODE,
+                default_value=config_defaults.DEFAULT_TRITON_LAUNCH_MODE,
                 choices=["local", "docker", "remote", "c_api"],
                 description="The method by which to launch Triton Server. "
                 "'local' assumes tritonserver binary is available locally. "
@@ -869,7 +878,7 @@ def _add_triton_configs(self):
                 "triton_docker_image",
                 flags=["--triton-docker-image"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_TRITON_DOCKER_IMAGE,
+                default_value=config_defaults.DEFAULT_TRITON_DOCKER_IMAGE,
                 description="Triton Server Docker image tag",
             )
         )
@@ -878,7 +887,7 @@ def _add_triton_configs(self):
                 "triton_http_endpoint",
                 flags=["--triton-http-endpoint"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_TRITON_HTTP_ENDPOINT,
+                default_value=config_defaults.DEFAULT_TRITON_HTTP_ENDPOINT,
                 description="Triton Server HTTP endpoint url used by Model Analyzer client.",
             )
         )
@@ -887,7 +896,7 @@ def _add_triton_configs(self):
                 "triton_grpc_endpoint",
                 flags=["--triton-grpc-endpoint"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_TRITON_GRPC_ENDPOINT,
+                default_value=config_defaults.DEFAULT_TRITON_GRPC_ENDPOINT,
                 description="Triton Server HTTP endpoint url used by Model Analyzer client.",
             )
         )
@@ -896,7 +905,7 @@ def _add_triton_configs(self):
                 "triton_metrics_url",
                 field_type=ConfigPrimitive(str),
                 flags=["--triton-metrics-url"],
-                default_value=DEFAULT_TRITON_METRICS_URL,
+                default_value=config_defaults.DEFAULT_TRITON_METRICS_URL,
                 description="Triton Server Metrics endpoint url. ",
             )
         )
@@ -905,7 +914,7 @@ def _add_triton_configs(self):
                 "triton_server_path",
                 field_type=ConfigPrimitive(str),
                 flags=["--triton-server-path"],
-                default_value=DEFAULT_TRITON_SERVER_PATH,
+                default_value=config_defaults.DEFAULT_TRITON_SERVER_PATH,
                 description="The full path to the tritonserver binary executable",
             )
         )
@@ -953,7 +962,7 @@ def _add_triton_configs(self):
             ConfigField(
                 "triton_install_path",
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_TRITON_INSTALL_PATH,
+                default_value=config_defaults.DEFAULT_TRITON_INSTALL_PATH,
                 flags=["--triton-install-path"],
                 description=(
                     "Path to Triton install directory i.e. the parent directory of 'lib/libtritonserver.so'."
@@ -973,7 +982,7 @@ def _add_perf_analyzer_configs(self):
                 "perf_analyzer_timeout",
                 flags=["--perf-analyzer-timeout"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_PERF_ANALYZER_TIMEOUT,
+                default_value=config_defaults.DEFAULT_PERF_ANALYZER_TIMEOUT,
                 description="Perf analyzer timeout value in seconds.",
             )
         )
@@ -982,7 +991,8 @@ def _add_perf_analyzer_configs(self):
                 "perf_analyzer_cpu_util",
                 flags=["--perf-analyzer-cpu-util"],
                 field_type=ConfigPrimitive(float),
-                default_value=psutil.cpu_count() * DEFAULT_PERF_ANALYZER_CPU_UTIL,
+                default_value=psutil.cpu_count()
+                * config_defaults.DEFAULT_PERF_ANALYZER_CPU_UTIL,
                 description="Maximum CPU utilization value allowed for the perf_analyzer.",
             )
         )
@@ -991,7 +1001,7 @@ def _add_perf_analyzer_configs(self):
                 "perf_analyzer_path",
                 flags=["--perf-analyzer-path"],
                 field_type=ConfigPrimitive(str, validator=binary_path_validator),
-                default_value=DEFAULT_PERF_ANALYZER_PATH,
+                default_value=config_defaults.DEFAULT_PERF_ANALYZER_PATH,
                 description="The full path to the perf_analyzer binary executable",
             )
         )
@@ -1001,7 +1011,7 @@ def _add_perf_analyzer_configs(self):
                 flags=["--perf-output"],
                 parser_args={"action": "store_true"},
                 field_type=ConfigPrimitive(bool),
-                default_value=DEFAULT_PERF_OUTPUT_FLAG,
+                default_value=config_defaults.DEFAULT_PERF_OUTPUT_FLAG,
                 description="Enables the output from the perf_analyzer to a file specified by"
                 " perf_output_path. If perf_output_path is None, output will be"
                 " written to stdout.",
@@ -1020,7 +1030,7 @@ def _add_perf_analyzer_configs(self):
                 "perf_analyzer_max_auto_adjusts",
                 flags=["--perf-analyzer-max-auto-adjusts"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_PERF_MAX_AUTO_ADJUSTS,
+                default_value=config_defaults.DEFAULT_PERF_MAX_AUTO_ADJUSTS,
                 description="Maximum number of times perf_analyzer is "
                 "launched with auto adjusted parameters in an attempt to profile a model. ",
             )
         )
@@ -1034,7 +1044,7 @@ def _add_export_configs(self):
             ConfigField(
                 "export_path",
                 flags=["-e", "--export-path"],
-                default_value=DEFAULT_EXPORT_PATH,
+                default_value=config_defaults.DEFAULT_EXPORT_PATH,
                 field_type=ConfigPrimitive(str, validator=parent_path_validator),
                 description="Full path to directory in which to store the results",
             )
@@ -1043,7 +1053,7 @@ def _add_export_configs(self):
             ConfigField(
                 "filename_model_inference",
                 flags=["--filename-model-inference"],
-                default_value=DEFAULT_FILENAME_MODEL_INFERENCE,
+                default_value=config_defaults.DEFAULT_FILENAME_MODEL_INFERENCE,
                 field_type=ConfigPrimitive(str),
                 description="Specifies filename for storing model inference metrics",
             )
@@ -1053,7 +1063,7 @@ def _add_export_configs(self):
                 "filename_model_gpu",
                 flags=["--filename-model-gpu"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_FILENAME_MODEL_GPU,
+                default_value=config_defaults.DEFAULT_FILENAME_MODEL_GPU,
                 description="Specifies filename for storing model GPU metrics",
             )
         )
@@ -1062,7 +1072,7 @@ def _add_export_configs(self):
                 "filename_server_only",
                 flags=["--filename-server-only"],
                 field_type=ConfigPrimitive(str),
-                default_value=DEFAULT_FILENAME_SERVER_ONLY,
+                default_value=config_defaults.DEFAULT_FILENAME_SERVER_ONLY,
                 description="Specifies filename for server-only metrics",
             )
         )
@@ -1076,7 +1086,7 @@ def _add_report_configs(self):
                 "num_configs_per_model",
                 flags=["--num-configs-per-model"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_NUM_CONFIGS_PER_MODEL,
+                default_value=config_defaults.DEFAULT_NUM_CONFIGS_PER_MODEL,
                 description="The number of configurations to plot per model in the summary.",
             )
         )
@@ -1085,7 +1095,7 @@ def _add_report_configs(self):
                 "num_top_model_configs",
                 flags=["--num-top-model-configs"],
                 field_type=ConfigPrimitive(int),
-                default_value=DEFAULT_NUM_TOP_MODEL_CONFIGS,
+                default_value=config_defaults.DEFAULT_NUM_TOP_MODEL_CONFIGS,
                 description="Model Analyzer will compare this many of the top models configs across all models.",
             )
         )
@@ -1100,7 +1110,7 @@ def _add_table_configs(self):
                 "inference_output_fields",
                 flags=["--inference-output-fields"],
                 field_type=ConfigListString(),
-                default_value=DEFAULT_INFERENCE_OUTPUT_FIELDS,
+                default_value=config_defaults.DEFAULT_INFERENCE_OUTPUT_FIELDS,
                 description="Specifies column keys for model inference metrics table",
             )
         )
@@ -1109,7 +1119,7 @@ def _add_table_configs(self):
                 "gpu_output_fields",
                 flags=["--gpu-output-fields"],
                 field_type=ConfigListString(),
-                default_value=DEFAULT_GPU_OUTPUT_FIELDS,
+                default_value=config_defaults.DEFAULT_GPU_OUTPUT_FIELDS,
                 description="Specifies column keys for model gpu metrics table",
             )
         )
@@ -1118,7 +1128,7 @@ def _add_table_configs(self):
                 "server_output_fields",
                 flags=["--server-output-fields"],
                 field_type=ConfigListString(),
-                default_value=DEFAULT_SERVER_OUTPUT_FIELDS,
+                default_value=config_defaults.DEFAULT_SERVER_OUTPUT_FIELDS,
                 description="Specifies column keys for server-only metrics table",
             )
         )
@@ -1163,7 +1173,9 @@ def set_config_values(self, args: argparse.Namespace) -> None:
             this exception
         """
 
         if args.mode == "online" and "latency_budget" not in args:
-            self._fields["objectives"].set_default_value(DEFAULT_ONLINE_OBJECTIVES)
+            self._fields["objectives"].set_default_value(
+                config_defaults.DEFAULT_ONLINE_OBJECTIVES
+            )
 
         super().set_config_values(args)
@@ -1171,9 +1183,9 @@
         # able to edit these plots.
         self._add_plot_configs()
         if args.mode == "online":
-            self._fields["plots"].set_value(DEFAULT_ONLINE_PLOTS)
+            self._fields["plots"].set_value(config_defaults.DEFAULT_ONLINE_PLOTS)
         elif args.mode == "offline":
-            self._fields["plots"].set_value(DEFAULT_OFFLINE_PLOTS)
+            self._fields["plots"].set_value(config_defaults.DEFAULT_OFFLINE_PLOTS)
 
     def _add_plot_configs(self):
         """
@@ -1336,11 +1348,13 @@ def _autofill_values(self):
         if self._using_request_rate():
             if not self._fields["inference_output_fields"].is_set_by_user():
                 self.inference_output_fields = (
-                    DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS
+                    config_defaults.DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS
                 )
             if not self._fields["gpu_output_fields"].is_set_by_user():
-                self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS
+                self.gpu_output_fields = (
+                    config_defaults.DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS
+                )
 
         new_profile_models = {}
         for i, model in enumerate(self.profile_models):
@@ -1369,7 +1383,7 @@ def _autofill_values(self):
                         "Weighting can not be specified as a global parameter. Please make this a model parameter."
                     )
                 else:
-                    new_model["weighting"] = DEFAULT_MODEL_WEIGHTING
+                    new_model["weighting"] = config_defaults.DEFAULT_MODEL_WEIGHTING
             else:
                 new_model["weighting"] = model.weighting()
diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py
index 67c62dca9..c2edd6e91 100755
--- a/model_analyzer/config/input/config_defaults.py
+++ b/model_analyzer/config/input/config_defaults.py
@@ -51,10 +51,15 @@
 DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1
 DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128
 DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5
+DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH = 1
+DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1000
+DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1
+DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256
 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False
 DEFAULT_RUN_CONFIG_SEARCH_MODE = "brute"
 DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False
 DEFAULT_REQUEST_RATE_SEARCH_ENABLE = False
+DEFAULT_LLM_SEARCH_ENABLE = False
 DEFAULT_TRITON_LAUNCH_MODE = "local"
 DEFAULT_TRITON_DOCKER_IMAGE = "nvcr.io/nvidia/tritonserver:23.09-py3"
 DEFAULT_TRITON_HTTP_ENDPOINT = "localhost:8000"
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 98ec60237..75be15038 100755
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -30,10 +30,10 @@
 
 import psutil
 
+import model_analyzer.config.input.config_defaults as config_defaults
 from model_analyzer.cli.cli import CLI
 from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
 from model_analyzer.config.input.config_command_report import ConfigCommandReport
-from model_analyzer.config.input.config_defaults import DEFAULT_TRITON_DOCKER_IMAGE
 from model_analyzer.config.input.config_status import ConfigStatus
 from model_analyzer.constants import CONFIG_PARSER_SUCCESS
 from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
@@ -60,6 +60,7 @@ def get_test_options():
         OptionStruct("bool", "profile","--run-config-search-disable"),
         OptionStruct("bool", "profile","--run-config-profile-models-concurrently-enable"),
         OptionStruct("bool", "profile","--request-rate-search-enable"),
+        OptionStruct("bool", "profile","--llm-search-enable"),
         OptionStruct("bool", "profile","--reload-model-disable"),
         OptionStruct("bool", "profile","--early-exit-enable"),
         OptionStruct("bool", "profile","--skip-summary-reports"),
@@ -71,23 +72,27 @@
         # The following options can be None:
         # short_option
         # expected_default_value
-        OptionStruct("int", "profile", "--client-max-retries", "-r", "125", "50"),
-        OptionStruct("int", "profile", "--duration-seconds", "-d", "10", "3"),
-        OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", "600"),
-        OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", "10"),
-        OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", "1024"),
-        OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", "16"),
-        OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", "8192"),
-        OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", "128"),
-        OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", "5"),
-        OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", "5"),
-        OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", "1.0"),
-        OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * 80.0)),
-        OptionStruct("int", "profile", "--num-configs-per-model", None, "10", "3"),
-        OptionStruct("int", "profile", "--num-top-model-configs", None, "10", "0"),
+        OptionStruct("int", "profile", "--client-max-retries", "-r", "125", str(config_defaults.DEFAULT_MAX_RETRIES)),
+        OptionStruct("int", "profile", "--duration-seconds", "-d", "10", str(config_defaults.DEFAULT_DURATION_SECONDS)),
+        OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", str(config_defaults.DEFAULT_PERF_ANALYZER_TIMEOUT)),
+        OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", str(config_defaults.DEFAULT_PERF_MAX_AUTO_ADJUSTS)),
+        OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)),
+        OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)),
+        OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE)),
+        OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE)),
+        OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE)),
+        OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE)),
+        OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)),
+        OptionStruct("int", "profile", "--run-config-search-min-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH)),
+        OptionStruct("int", "profile", "--run-config-search-max-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH)),
+        OptionStruct("int", "profile", "--run-config-search-min-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT)),
+        OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(config_defaults.DEFAULT_MONITORING_INTERVAL)),
+        OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * config_defaults.DEFAULT_PERF_ANALYZER_CPU_UTIL)),
+        OptionStruct("int", "profile", "--num-configs-per-model", None, "10", str(config_defaults.DEFAULT_NUM_CONFIGS_PER_MODEL)),
+        OptionStruct("int", "profile", "--num-top-model-configs", None, "10", str(config_defaults.DEFAULT_NUM_TOP_MODEL_CONFIGS)),
 
         OptionStruct("int", "profile", "--latency-budget", None, "200", None),
         OptionStruct("int", "profile", "--min-throughput", None, "300", None),
@@ -105,7 +110,7 @@
         OptionStruct("string", "profile", "--client-protocol", None, ["http", "grpc"], "grpc", "SHOULD_FAIL"),
         OptionStruct("string", "profile", "--perf-analyzer-path", None, ".", "perf_analyzer", None),
         OptionStruct("string", "profile", "--perf-output-path", None, ".", None, None),
-        OptionStruct("string", "profile", "--triton-docker-image", None, "test_image", DEFAULT_TRITON_DOCKER_IMAGE, None),
+        OptionStruct("string", "profile", "--triton-docker-image", None, "test_image", config_defaults.DEFAULT_TRITON_DOCKER_IMAGE, None),
         OptionStruct("string", "profile", "--triton-http-endpoint", None, "localhost:4000", "localhost:8000", None),
         OptionStruct("string", "profile", "--triton-grpc-endpoint", None, "localhost:4001", "localhost:8001", None),
         OptionStruct("string", "profile", "--triton-metrics-url", None, "localhost:4002", "http://localhost:8002/metrics", None),
@@ -135,6 +140,8 @@
         OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"),
         OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None),
         OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None),
+        OptionStruct("intlist", "profile", "--prompt-length", None, "1, 2, 3", None),
+        OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 3", None),
         OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]),
         OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"),
         OptionStruct("stringlist", "profile", "--inference-output-fields", None, "a, b, c",
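
A minimal usage sketch for the options this patch introduces. "model-analyzer profile", --model-repository, and --profile-models are the tool's standard entry points; the model name, repository path, and value lists below are hypothetical, while the LLM flags are the ones added above:

    model-analyzer profile \
        --model-repository /path/to/model/repository \
        --profile-models my_llm \
        --llm-search-enable \
        --prompt-length 128,512,1024 \
        --max-token-count 32,256

If the explicit lists are omitted, run config search is presumably bounded by the new defaults instead: --run-config-search-min/max-prompt-length (1 to 1000) and --run-config-search-min/max-token-count (1 to 256).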