Skip to content

Commit

Permalink
Add MVP LLM support to MA (#783)
Browse files Browse the repository at this point in the history
* Adding new options for LLM (#768)

* Update README and versions for 23.09 branch (#761) (#767)

* Adding new options for LLM

* Fixing codeQL issues

* Fixing codeQL issue

---------

Co-authored-by: Misha Chornyi <[email protected]>

* Add LLM support to Brute Search (#769)

* Initial coding complete

* First unit test passing

* Adding test for prompt length

* Refactor PACG methods

* Further refactoring

* Ensure early exit isn't enabled for LLM models

* Fix type checking errors

* Attempt at fixing codeql issue

* Revert "Attempt at fixing codeql issue"

This reverts commit 2619b83.

* Attempt at codeQL fix

* Adding deepcopy back in

* Removing deepcopy in an attempt to fix codeQL errors

* Update model_analyzer/config/input/config_command_profile.py

Co-authored-by: Hyunjae Woo <[email protected]>

* Update model_analyzer/config/generate/perf_analyzer_config_generator.py

Co-authored-by: Hyunjae Woo <[email protected]>

* Update model_analyzer/config/generate/perf_analyzer_config_generator.py

Co-authored-by: Hyunjae Woo <[email protected]>

* Update model_analyzer/config/generate/perf_analyzer_config_generator.py

Co-authored-by: Hyunjae Woo <[email protected]>

* Moving location of method

* Changing parameter to inference load

* Changing parameter to inference load

* Changing prompt length to text input length

* Changing max_tokens to use request-parameter

* Fix input-data typo

* Changing non-parameter to parameter

---------

Co-authored-by: Hyunjae Woo <[email protected]>

* New LLM record types (#770)

* New measurement fields created.

* Fixing omission in llm_metric_table

* Changing name to be avg_token_to_token...

* New config options based on live run (#775)

* Added new config options and modified existing options

* Refactoring model parameter setting

* Removing magic numbers

* Capture LLM metrics from PA (#774)

* Initial code for aggregation of new LLM metrics

* New measurement fields created.

* Fixing PA unit tests

* Adding hooks in metrics to capture new LLM fields

* Fixing codeQL errors

* Fixing type checking errors

* Changes needed post-merge from other branches

* Revert naming mistake (due to merge).

* Changes uncovered during live testing

* Fixes based on hwoo review

* Fixing typo

* Change to use lists and mean()

* Changes based on hwoo review

* Correct how periodic concurrency works in PACG (#777)

* Created a new class ConfigRangeNumeric and using it for periodic-concurrency

* Fixes and defaults for periodic concurrency

* First unit test passing

* PACG changes complete. Unit tests updated and passing

* Removing unneeded class

* Fixing codeQL and hwoo's review suggestions

* Adding missing else

* Llm testing live run (#778)

* Created a new class ConfigRangeNumeric and using it for periodic-concurrency

* Fixes and defaults for periodic concurrency

* First unit test passing

* PACG changes complete. Unit tests updated and passing

* Removing unneeded class

* Changes to fix live run

* Minor refactor and cleanup

* Removing json files

* Changing to use f-string

* More cleanup from hwoo CR

* Removing stale code for request period

* Fix nit

* Changes to get LLM summary reports working (#779)

* Changes to get LLM summary reports working

* Addressing hwoo's CR

* Adding illegal LLM checks w/ unit testing + some minor cleanup (#781)

* Adding illegal LLM checks w/ unit testing + some minor cleanup

* Updated with TMA

* Misc LLM cleanup (#782)

* General cleanup

* Add ticket nums to todos

* Fix for non-LLM breaking bug introduced.

* summary table in progress

---------

Co-authored-by: Misha Chornyi <[email protected]>
Co-authored-by: Hyunjae Woo <[email protected]>
  • Loading branch information
3 people authored Nov 3, 2023
1 parent 32389de commit f15427e
Show file tree
Hide file tree
Showing 32 changed files with 1,828 additions and 462 deletions.
10 changes: 8 additions & 2 deletions model_analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,14 @@ def profile(

if not self._config.skip_summary_reports:
self._create_summary_tables(verbose)
self._create_summary_reports(mode)
self._create_detailed_reports(mode)

# TODO TMA-1401: need to figure out summary reporting for LLMs
if not self._config.is_llm_model():
self._create_summary_reports(mode)

# TODO TMA-1443: need to figure out detailed reporting for LLMs
if not self._config.is_llm_model():
self._create_detailed_reports(mode)

self._check_for_perf_analyzer_errors()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,7 @@ def __init__(
logger.info("")
AutomaticModelConfigGenerator._log_first_run = True

self._max_instance_count = config.run_config_search_max_instance_count
self._min_instance_count = config.run_config_search_min_instance_count
self._max_model_batch_size = config.run_config_search_max_model_batch_size
self._min_model_batch_size = config.run_config_search_min_model_batch_size
self._set_min_max_search_values(config)

self._instance_kind = "KIND_CPU" if self._cpu_only else "KIND_GPU"

Expand All @@ -91,7 +88,7 @@ def __init__(

self._reset_max_batch_size()

if not self._early_exit_enable:
if not self._early_exit_enable and not self._config.is_llm_model():
raise TritonModelAnalyzerException(
"Early exit disable is not supported in automatic model config generator"
)
Expand Down Expand Up @@ -162,3 +159,9 @@ def _get_curr_param_combo(self) -> Dict:
config["dynamic_batching"] = {}

return config

def _set_min_max_search_values(self, config: ConfigCommandProfile) -> None:
    """
    Cache the run-config search bounds (instance count and model batch
    size) from the profile config onto this generator.
    """
    # Each bound maps self._<name> <- config.run_config_search_<name>
    for bound in (
        "max_instance_count",
        "min_instance_count",
        "max_model_batch_size",
        "min_model_batch_size",
    ):
        setattr(self, f"_{bound}", getattr(config, f"run_config_search_{bound}"))
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from model_analyzer.config.run.run_config import RunConfig
from model_analyzer.constants import LOGGER_NAME
from model_analyzer.device.gpu_device import GPUDevice
from model_analyzer.result.parameter_search import ParameterSearch
from model_analyzer.result.inference_load_search import InferenceLoadSearch
from model_analyzer.result.result_manager import ResultManager
from model_analyzer.result.run_config_measurement import RunConfigMeasurement
from model_analyzer.triton.client.client import TritonClient
Expand All @@ -39,10 +39,10 @@
logger = logging.getLogger(LOGGER_NAME)


class BrutePlusBinaryParameterSearchRunConfigGenerator(ConfigGeneratorInterface):
class BrutePlusBinarySearchRunConfigGenerator(ConfigGeneratorInterface):
"""
First run BruteRunConfigGenerator for a brute search, then for
automatic searches use ParameterSearch to perform a binary search
automatic searches use InferenceLoadSearch to perform a binary search
"""

def __init__(
Expand Down Expand Up @@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator:

def _can_binary_search_top_results(self) -> bool:
    """
    Return True only when a binary search over the top results is
    possible: no model has an explicit concurrency or request-rate
    sweep, and this is not an LLM profile.
    """
    # NOTE: the LLM check is evaluated per model (matching the original
    # loop), so an empty model list still yields True.
    return not any(
        model.parameters()["concurrency"]
        or model.parameters()["request_rate"]
        or self._config.is_llm_model()
        for model in self._models
    )
Expand All @@ -132,17 +136,19 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]:
for result in top_results:
run_config = deepcopy(result.run_config())
model_parameters = self._get_model_parameters(model_name)
parameter_search = ParameterSearch(
inference_load_search = InferenceLoadSearch(
config=self._config,
model_parameters=model_parameters,
skip_parameter_sweep=True,
skip_inference_load_sweep=True,
)
for parameter in parameter_search.search_parameters():
run_config = self._set_parameter(
run_config, model_parameters, parameter
for inference_load in inference_load_search.search_inference_loads():
run_config = self._set_inference_load(
run_config, model_parameters, inference_load
)
yield run_config
parameter_search.add_run_config_measurement(self._last_measurement)
inference_load_search.add_run_config_measurement(
self._last_measurement
)

def _get_model_parameters(self, model_name: str) -> Dict:
for model in self._models:
Expand All @@ -151,14 +157,14 @@ def _get_model_parameters(self, model_name: str) -> Dict:

return {}

def _set_inference_load(
    self, run_config: RunConfig, model_parameters: Dict, inference_load: int
) -> RunConfig:
    """
    Write *inference_load* into every model run config's perf-analyzer
    config, as a request rate if one was specified for the model, else
    as a concurrency, and return the mutated run_config.
    """
    for model_run_config in run_config.model_run_configs():
        # Pick the PA option that carries the load for this model.
        if self._config.is_request_rate_specified(model_parameters):
            load_option = "request-rate-range"
        else:
            load_option = "concurrency-range"
        model_run_config.perf_config().update_config({load_option: inference_load})

    return run_config
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def __init__(
self._curr_results: List = [[] for n in range(self._num_models)]
self._curr_generators: Dict[int, ConfigGeneratorInterface] = {}

self._skip_default_config = skip_default_config
self._skip_default_config = skip_default_config or config.is_llm_model()

def set_last_results(
self, measurements: List[Optional[RunConfigMeasurement]]
Expand Down
42 changes: 39 additions & 3 deletions model_analyzer/config/generate/generator_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# limitations under the License.

from itertools import product
from typing import Dict, List
from typing import Dict, List, Optional


class GeneratorUtils:
Expand Down Expand Up @@ -80,8 +80,8 @@ def generate_combinations(value: object) -> List:
@staticmethod
def generate_parameter_combinations(params: Dict) -> List[Dict]:
"""
Generate a list of all possible subdictionaries
from given dictionary. The subdictionaries will
Generate a list of all possible sub-dictionaries
from given dictionary. The sub-dictionaries will
have all the same keys, but only one value from
each key.
Expand All @@ -108,9 +108,45 @@ def generate_doubled_list(min_value: int, max_value: int) -> List[int]:
The value that the generated list will not exceed
"""

assert min_value <= max_value

list = []
val = 1 if min_value == 0 else min_value
while val <= max_value:
list.append(val)
val *= 2
return list

@staticmethod
def extract_value_from_request_parameter(request_parameter: Optional[str]) -> int:
if not request_parameter:
return 0

# Format is: <parameter>:<value>:<type>
# Example: max_tokens:10:int
_, value, _ = request_parameter.split(":")

# this catches the case for non-LLM models where the user has specified request parameters
try:
int(value)
except ValueError as _:
return 0

return int(value)

@staticmethod
def extract_text_input_length_from_input_data(input_data: Optional[str]) -> int:
if not input_data:
return 0

# format is input-data-<num>.json
_, _, text_input_length = input_data.split("-")
text_input_length, _ = text_input_length.split(".")

# this catches the case for non-LLM models where the user has specified input data
try:
int(text_input_length)
except ValueError as _:
return 0

return int(text_input_length)
12 changes: 10 additions & 2 deletions model_analyzer/config/generate/model_run_config_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,5 +150,13 @@ def _determine_early_exit_enables(
concurrency_specified = model.parameters()["concurrency"]
config_parameters_exist = model.model_config_parameters()

self._pacg_early_exit_enable = early_exit_enable or not concurrency_specified
self._mcg_early_exit_enable = early_exit_enable or not config_parameters_exist
if config.is_llm_model():
self._pacg_early_exit_enable = False
self._mcg_early_exit_enable = False
else:
self._pacg_early_exit_enable = (
early_exit_enable or not concurrency_specified
)
self._mcg_early_exit_enable = (
early_exit_enable or not config_parameters_exist
)
Loading

0 comments on commit f15427e

Please sign in to comment.