Skip to content

Commit

Permalink
Add MVP LLM support to MA (#783)
Browse files Browse the repository at this point in the history
* Adding new options for LLM (#768)

* Update README and versions for 23.09 branch (#761) (#767)

* Adding new options for LLM

* Fixing codeQL issues

* Fixing codeQL issue

---------

Co-authored-by: Misha Chornyi <[email protected]>

* Add LLM support to Brute Search (#769)

* Initial coding complete

* First unit test passing

* Adding test for prompt length

* Refactor PACG methods

* Further refactoring

* Ensure early exit isn't enabled for LLM models

* Fix type checking errors

* Attempt at fixing codeql issue

* Revert "Attempt at fixing codeql issue"

This reverts commit 2619b83.

* Attempt at codeQL fix

* Adding deepcopy back in

* Removing deepcopy in an attempt to fix codeQL errors

* Update model_analyzer/config/input/config_command_profile.py

Co-authored-by: Hyunjae Woo <[email protected]>

* Update model_analyzer/config/generate/perf_analyzer_config_generator.py

Co-authored-by: Hyunjae Woo <[email protected]>

* Update model_analyzer/config/generate/perf_analyzer_config_generator.py

Co-authored-by: Hyunjae Woo <[email protected]>

* Update model_analyzer/config/generate/perf_analyzer_config_generator.py

Co-authored-by: Hyunjae Woo <[email protected]>

* Moving location of method

* Changing parameter to inference load

* Changing parameter to inference load

* Changing prompt length to text input length

* Changing max_tokens to use request-parameter

* Fix input-data typo

* Changing non-parameter to parameter

---------

Co-authored-by: Hyunjae Woo <[email protected]>

* New LLM record types (#770)

* New measurement fields created.

* Fixing omission in llm_metric_table

* Changing name to be avg_token_to_token...

* New config options based on live run (#775)

* Added new config options and modified existing options

* Refactoring model parameter setting

* Removing magic numbers

* Capture LLM metrics from PA (#774)

* Initial code for aggregation of new LLM metrics

* New measurement fields created.

* Fixing PA unit tests

* Adding hooks in metrics to capture new LLM fields

* Fixing codeQL errors

* Fixing type checking errors

* Changes needed post-merge from other branches

* Revert naming mistake (due to merge).

* Changes uncovered during live testing

* Fixes based on hwoo review

* Fixing typo

* Change to use lists and mean()

* Changes based on hwoo review

* Correct how periodic concurrency works in PACG (#777)

* Created a new class ConfigRangeNumeric and using it for periodic-concurrency

* Fixes and defaults for periodic concurrency

* First unit test passing

* PACG changes complete. Unit tests updated and passing

* Removing unneeded class

* Fixing codeQL and hwoo's review suggestions

* Adding missing else

* Llm testing live run (#778)

* Created a new class ConfigRangeNumeric and using it for periodic-concurrency

* Fixes and defaults for periodic concurrency

* First unit test passing

* PACG changes complete. Unit tests updated and passing

* Removing unneeded class

* Changes to fix live run

* Minor refactor and cleanup

* Removing json files

* Changing to use f-string

* More cleanup from hwoo CR

* Removing stale code for request period

* Fix nit

* Changes to get LLM summary reports working (#779)

* Changes to get LLM summary reports working

* Addressing hwoo's CR

* Adding illegal LLM checks w/ unit testing + some minor cleanup (#781)

* Adding illegal LLM checks w/ unit testing + some minor cleanup

* Updated with TMA

* Misc LLM cleanup (#782)

* General cleanup

* Add ticket nums to todos

* Fix for non-LLM breaking bug introduced.

* summary table in progress

---------

Co-authored-by: Misha Chornyi <[email protected]>
Co-authored-by: Hyunjae Woo <[email protected]>
  • Loading branch information
3 people authored Nov 3, 2023
1 parent 32389de commit f15427e
Show file tree
Hide file tree
Showing 32 changed files with 1,828 additions and 462 deletions.
10 changes: 8 additions & 2 deletions model_analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,14 @@ def profile(

if not self._config.skip_summary_reports:
self._create_summary_tables(verbose)
self._create_summary_reports(mode)
self._create_detailed_reports(mode)

# TODO TMA-1401: need to figure out summary reporting for LLMs
if not self._config.is_llm_model():
self._create_summary_reports(mode)

# TODO TMA-1443: need to figure out detailed reporting for LLMs
if not self._config.is_llm_model():
self._create_detailed_reports(mode)

self._check_for_perf_analyzer_errors()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,7 @@ def __init__(
logger.info("")
AutomaticModelConfigGenerator._log_first_run = True

self._max_instance_count = config.run_config_search_max_instance_count
self._min_instance_count = config.run_config_search_min_instance_count
self._max_model_batch_size = config.run_config_search_max_model_batch_size
self._min_model_batch_size = config.run_config_search_min_model_batch_size
self._set_min_max_search_values(config)

self._instance_kind = "KIND_CPU" if self._cpu_only else "KIND_GPU"

Expand All @@ -91,7 +88,7 @@ def __init__(

self._reset_max_batch_size()

if not self._early_exit_enable:
if not self._early_exit_enable and not self._config.is_llm_model():
raise TritonModelAnalyzerException(
"Early exit disable is not supported in automatic model config generator"
)
Expand Down Expand Up @@ -162,3 +159,9 @@ def _get_curr_param_combo(self) -> Dict:
config["dynamic_batching"] = {}

return config

def _set_min_max_search_values(self, config: ConfigCommandProfile) -> None:
    """
    Cache the run-config search bounds (instance count and model batch
    size) from the profile config onto this generator.
    """
    # Each bound maps self._<name> <- config.run_config_search_<name>
    for bound in (
        "max_instance_count",
        "min_instance_count",
        "max_model_batch_size",
        "min_model_batch_size",
    ):
        setattr(self, f"_{bound}", getattr(config, f"run_config_search_{bound}"))
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from model_analyzer.config.run.run_config import RunConfig
from model_analyzer.constants import LOGGER_NAME
from model_analyzer.device.gpu_device import GPUDevice
from model_analyzer.result.parameter_search import ParameterSearch
from model_analyzer.result.inference_load_search import InferenceLoadSearch
from model_analyzer.result.result_manager import ResultManager
from model_analyzer.result.run_config_measurement import RunConfigMeasurement
from model_analyzer.triton.client.client import TritonClient
Expand All @@ -39,10 +39,10 @@
logger = logging.getLogger(LOGGER_NAME)


class BrutePlusBinaryParameterSearchRunConfigGenerator(ConfigGeneratorInterface):
class BrutePlusBinarySearchRunConfigGenerator(ConfigGeneratorInterface):
"""
First run BruteRunConfigGenerator for a brute search, then for
automatic searches use ParameterSearch to perform a binary search
automatic searches use InferenceLoadSearch to perform a binary search
"""

def __init__(
Expand Down Expand Up @@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator:

def _can_binary_search_top_results(self) -> bool:
    """
    Return True only when a binary search over the top results is
    possible: no model has an explicit concurrency or request-rate
    sweep, and this is not an LLM profile.
    """
    # NOTE: the LLM check is evaluated per model (matching the original
    # loop), so an empty model list still yields True.
    return not any(
        model.parameters()["concurrency"]
        or model.parameters()["request_rate"]
        or self._config.is_llm_model()
        for model in self._models
    )
Expand All @@ -132,17 +136,19 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]:
for result in top_results:
run_config = deepcopy(result.run_config())
model_parameters = self._get_model_parameters(model_name)
parameter_search = ParameterSearch(
inference_load_search = InferenceLoadSearch(
config=self._config,
model_parameters=model_parameters,
skip_parameter_sweep=True,
skip_inference_load_sweep=True,
)
for parameter in parameter_search.search_parameters():
run_config = self._set_parameter(
run_config, model_parameters, parameter
for inference_load in inference_load_search.search_inference_loads():
run_config = self._set_inference_load(
run_config, model_parameters, inference_load
)
yield run_config
parameter_search.add_run_config_measurement(self._last_measurement)
inference_load_search.add_run_config_measurement(
self._last_measurement
)

def _get_model_parameters(self, model_name: str) -> Dict:
for model in self._models:
Expand All @@ -151,14 +157,14 @@ def _get_model_parameters(self, model_name: str) -> Dict:

return {}

def _set_inference_load(
    self, run_config: RunConfig, model_parameters: Dict, inference_load: int
) -> RunConfig:
    """
    Write *inference_load* into every model run config's perf-analyzer
    config, as a request rate if one was specified for the model, else
    as a concurrency, and return the mutated run_config.
    """
    for model_run_config in run_config.model_run_configs():
        # Pick the PA option that carries the load for this model.
        if self._config.is_request_rate_specified(model_parameters):
            load_option = "request-rate-range"
        else:
            load_option = "concurrency-range"
        model_run_config.perf_config().update_config({load_option: inference_load})

    return run_config
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def __init__(
self._curr_results: List = [[] for n in range(self._num_models)]
self._curr_generators: Dict[int, ConfigGeneratorInterface] = {}

self._skip_default_config = skip_default_config
self._skip_default_config = skip_default_config or config.is_llm_model()

def set_last_results(
self, measurements: List[Optional[RunConfigMeasurement]]
Expand Down
42 changes: 39 additions & 3 deletions model_analyzer/config/generate/generator_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# limitations under the License.

from itertools import product
from typing import Dict, List
from typing import Dict, List, Optional


class GeneratorUtils:
Expand Down Expand Up @@ -80,8 +80,8 @@ def generate_combinations(value: object) -> List:
@staticmethod
def generate_parameter_combinations(params: Dict) -> List[Dict]:
"""
Generate a list of all possible subdictionaries
from given dictionary. The subdictionaries will
Generate a list of all possible sub-dictionaries
from given dictionary. The sub-dictionaries will
have all the same keys, but only one value from
each key.
Expand All @@ -108,9 +108,45 @@ def generate_doubled_list(min_value: int, max_value: int) -> List[int]:
The value that the generated list will not exceed
"""

assert min_value <= max_value

list = []
val = 1 if min_value == 0 else min_value
while val <= max_value:
list.append(val)
val *= 2
return list

@staticmethod
def extract_value_from_request_parameter(request_parameter: Optional[str]) -> int:
if not request_parameter:
return 0

# Format is: <parameter>:<value>:<type>
# Example: max_tokens:10:int
_, value, _ = request_parameter.split(":")

# this catches the case for non-LLM models where the user has specified request parameters
try:
int(value)
except ValueError as _:
return 0

return int(value)

@staticmethod
def extract_text_input_length_from_input_data(input_data: Optional[str]) -> int:
if not input_data:
return 0

# format is input-data-<num>.json
_, _, text_input_length = input_data.split("-")
text_input_length, _ = text_input_length.split(".")

# this catches the case for non-LLM models where the user has specified input data
try:
int(text_input_length)
except ValueError as _:
return 0

return int(text_input_length)
12 changes: 10 additions & 2 deletions model_analyzer/config/generate/model_run_config_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,5 +150,13 @@ def _determine_early_exit_enables(
concurrency_specified = model.parameters()["concurrency"]
config_parameters_exist = model.model_config_parameters()

self._pacg_early_exit_enable = early_exit_enable or not concurrency_specified
self._mcg_early_exit_enable = early_exit_enable or not config_parameters_exist
if config.is_llm_model():
self._pacg_early_exit_enable = False
self._mcg_early_exit_enable = False
else:
self._pacg_early_exit_enable = (
early_exit_enable or not concurrency_specified
)
self._mcg_early_exit_enable = (
early_exit_enable or not config_parameters_exist
)
Loading

0 comments on commit f15427e

Please sign in to comment.