Capture LLM metrics from PA (#774)
* Initial code for aggregation of new LLM metrics

* New measurement fields created.

* Fixing PA unit tests

* Adding hooks in metrics to capture new LLM fields

* Fixing codeQL errors

* Fixing type checking errors

* Changes needed post-merge from other branches

* Revert naming mistake (due to merge).

* Changes uncovered during live testing

* Fixes based on hwoo review

* Fixing typo

* Change to use lists and mean()

* Changes based on hwoo review
nv-braf authored Oct 17, 2023
1 parent 6e0fc24 commit e81a369
Showing 7 changed files with 235 additions and 49 deletions.
103 changes: 94 additions & 9 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -16,6 +16,7 @@

import csv
import glob
import json
import logging
import os
import re
@@ -25,6 +26,7 @@
from typing import Dict, List

import psutil
from numpy import mean

from model_analyzer.constants import (
INTERVAL_SLEEP_TIME,
@@ -118,6 +120,14 @@ def get_gpu_metrics():
]
return gpu_metrics

@staticmethod
def get_llm_metrics():
llm_metrics = [
llm_metric[PerfAnalyzer.RECORD_CLASS]
for llm_metric in PerfAnalyzer.llm_metric_table
]
return llm_metrics

def __init__(self, path, config, max_retries, timeout, max_cpu_util):
"""
Parameters
@@ -143,6 +153,7 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util):
self._output = ""
self._perf_records = {}
self._gpu_records = []
self._llm_records = {}
self._max_cpu_util = max_cpu_util

def run(self, metrics, env=None):
@@ -216,6 +227,19 @@ def get_gpu_records(self):

return self._gpu_records

def get_llm_records(self):
"""
Returns
-------
The LLM records from the last perf_analyzer run
"""

if self._llm_records:
return self._llm_records
raise TritonModelAnalyzerException(
"Attempted to get perf_analyzer results without calling run first."
)

def output(self):
"""
Returns
@@ -457,21 +481,82 @@ def _parse_outputs(self, metrics):
logger.debug(
f"Reading PA results from {perf_config['latency-report-file']}"
)
with open(perf_config["latency-report-file"], mode="r") as f:
csv_reader = csv.DictReader(f, delimiter=",")

for row in csv_reader:
self._perf_records[
perf_config["model-name"]
] = self._extract_perf_records_from_row(metrics, row)
self._gpu_records = self._extract_gpu_records_from_row(metrics, row)
self._extract_gpu_records(perf_config, metrics)
self._extract_llm_records(perf_config, metrics)

for perf_config in [
mrc.perf_config() for mrc in self._config.model_run_configs()
]:
# Remove the latency file and all associated composing model latency files
# Remove the latency/profile export files and all associated composing model latency files
for f in glob.glob(f"*{perf_config['latency-report-file']}"):
os.remove(f)
for f in glob.glob(f"*{perf_config['profile-export-file']}"):
os.remove(f)

def _extract_gpu_records(self, perf_config, metrics):
if perf_config["profile-export-file"]:
return

with open(perf_config["latency-report-file"], mode="r") as f:
csv_reader = csv.DictReader(f, delimiter=",")

for row in csv_reader:
self._perf_records[
perf_config["model-name"]
] = self._extract_perf_records_from_row(metrics, row)
self._gpu_records = self._extract_gpu_records_from_row(metrics, row)

def _extract_llm_records(self, perf_config, metrics):
if not perf_config["profile-export-file"]:
return

self._llm_records[perf_config["model-name"]] = []

with open(perf_config["profile-export-file"], mode="r") as f:
llm_output = json.load(f)

avg_first_token_latency = self._calculate_avg_first_token_latency(
llm_output
)
record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS](
value=avg_first_token_latency
) # type: ignore

self._llm_records[perf_config["model-name"]].append(record)

avg_token_to_token_latency = self._calculate_avg_token_to_token_latency(
llm_output
)
record = PerfAnalyzer.llm_metric_table[1][PerfAnalyzer.RECORD_CLASS](
value=avg_token_to_token_latency
) # type: ignore
self._llm_records[perf_config["model-name"]].append(record)

def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
total_first_token_latencies = []
for request in llm_output["experiments"][0]["requests"]:
total_first_token_latencies.append(
request["response_timestamps"][0] - request["timestamp"]
)

avg_first_token_latency = mean(total_first_token_latencies)

return avg_first_token_latency

def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:
token_to_token_latencies = []
for request in llm_output["experiments"][0]["requests"]:
response_to_response_latencies = []
prev_response = request["response_timestamps"][0]
for response in request["response_timestamps"][1:]:
response_to_response_latencies.append(response - prev_response)
prev_response = response

token_to_token_latencies.append(mean(response_to_response_latencies))

avg_token_to_token_latency = mean(token_to_token_latencies)

return avg_token_to_token_latency

def _extract_perf_records_from_row(
self, requested_metrics: List[Record], row_metrics: Dict[str, str]
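
For reference, the two helper methods above boil down to the computation below. This is a minimal sketch, not part of the commit: the field names ("experiments", "requests", "timestamp", "response_timestamps") come from the parsing code above, while the sample timestamps and their unit are hypothetical.

from numpy import mean

# Assumed shape of the profile-export JSON read by _extract_llm_records
# (field names taken from the parsing code above; the timestamp values
# here are made up for illustration).
llm_output = {
    "experiments": [
        {
            "requests": [
                {"timestamp": 0, "response_timestamps": [5, 9, 14]},
                {"timestamp": 2, "response_timestamps": [6, 11]},
            ]
        }
    ]
}

requests = llm_output["experiments"][0]["requests"]

# Average first-token latency: first response timestamp minus request timestamp.
first_token_latencies = [r["response_timestamps"][0] - r["timestamp"] for r in requests]
avg_first_token_latency = mean(first_token_latencies)  # (5 + 4) / 2 = 4.5

# Average token-to-token latency: mean gap between consecutive responses per
# request, then the mean of those per-request means.
token_to_token_latencies = [
    mean([b - a for a, b in zip(r["response_timestamps"], r["response_timestamps"][1:])])
    for r in requests
]
avg_token_to_token_latency = mean(token_to_token_latencies)  # ((4 + 5) / 2 + 5) / 2 = 4.75
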
9 changes: 8 additions & 1 deletion model_analyzer/perf_analyzer/perf_config.py
@@ -73,6 +73,7 @@ class PerfAnalyzerConfig:
"metrics-interval",
"bls-composing-models",
"request-parameter",
"request-period",
]

input_to_options = [
@@ -82,6 +83,7 @@
"url",
"protocol",
"latency-report-file",
"profile-export-file",
"http-header",
]

@@ -112,6 +114,7 @@ def __init__(self):
"-u": None,
"-i": None,
"-f": None,
"--profile-export-file": None,
"-H": None,
}
self._verbose = {"-v": None, "-v -v": None, "--verbose-csv": None}
@@ -123,6 +126,7 @@
"url": "-u",
"protocol": "-i",
"latency-report-file": "-f",
"profile-export-file": "--profile-export-file",
"http-header": "-H",
}

@@ -193,6 +197,9 @@ def update_config_from_profile_config(self, model_name, profile_config):
"verbose-csv": "--verbose-csv",
}

if profile_config.is_llm_model():
params.update({"profile-export-file": model_name + "-results.json"})

if profile_config.triton_launch_mode == "c_api":
params.update(
{
@@ -307,7 +314,7 @@ def remove_url_from_cli_string(cls, cli_string):
@classmethod
def remove_mrc_from_cli_string(cls, cli_string):
"""
utility function strips the measruement request count
utility function strips the measurement request count
from a cli string representation
Parameters
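
As an illustration of the option plumbing added above, here is a hypothetical, self-contained sketch of how "profile-export-file" maps onto the --profile-export-file CLI flag. The build_cli_args helper and its output are illustrative only; the real mapping lives in PerfAnalyzerConfig.

# Hypothetical helper mirroring the input_to_options / _input_to_options
# mapping shown above; not part of the commit.
input_to_cli_flag = {
    "latency-report-file": "-f",
    "profile-export-file": "--profile-export-file",
}

def build_cli_args(model_name: str, is_llm_model: bool) -> list:
    options = {"latency-report-file": f"{model_name}-results.csv"}
    if is_llm_model:
        # Mirrors update_config_from_profile_config(): LLM models also get a
        # "<model>-results.json" profile export.
        options["profile-export-file"] = f"{model_name}-results.json"
    args = []
    for key, value in options.items():
        args.extend([input_to_cli_flag[key], value])
    return args

# build_cli_args("my-model", is_llm_model=True)
# -> ["-f", "my-model-results.csv", "--profile-export-file", "my-model-results.json"]
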
20 changes: 15 additions & 5 deletions model_analyzer/record/metrics_manager.py
@@ -69,6 +69,8 @@ class MetricsManager:
"gpu_power_usage",
"cpu_available_ram",
"cpu_used_ram",
"avg_first_token_latency",
"avg_token_to_token_latency",
]

def __init__(self, config, client, server, gpus, result_manager, state_manager):
@@ -116,6 +118,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager):
self._gpu_metrics,
self._perf_metrics,
self._cpu_metrics,
self._llm_metrics,
) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics)
self._gpus = gpus
self._init_state()
@@ -160,21 +163,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False):
Returns
-------
(list,list,list)
tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics
(list,list,list,list)
tuple of four lists (DCGM, PerfAnalyzer, CPU, LLM) metrics
"""

gpu_metrics, perf_metrics, cpu_metrics = [], [], []
gpu_metrics, perf_metrics, cpu_metrics, llm_metrics = [], [], [], []
# Separates metrics and objectives into related lists
for metric in MetricsManager.get_metric_types(metric_tags):
if metric in PerfAnalyzer.get_gpu_metrics():
gpu_metrics.append(metric)
elif metric in PerfAnalyzer.get_perf_metrics():
perf_metrics.append(metric)
elif metric in PerfAnalyzer.get_llm_metrics():
llm_metrics.append(metric)
elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics):
cpu_metrics.append(metric)

return gpu_metrics, perf_metrics, cpu_metrics
return gpu_metrics, perf_metrics, cpu_metrics, llm_metrics

def profile_server(self):
"""
@@ -556,6 +561,9 @@ def _run_perf_analyzer(
)

metrics_to_gather = self._perf_metrics + self._gpu_metrics
if self._config.is_llm_model():
metrics_to_gather += self._llm_metrics

status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)

self._write_perf_analyzer_output(perf_output_writer, perf_analyzer)
@@ -564,7 +572,9 @@
self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
return (None, None)

perf_records = perf_analyzer.get_perf_records()
perf_records = (
perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records()
)
gpu_records = perf_analyzer.get_gpu_records()

aggregated_perf_records = self._aggregate_perf_records(perf_records)
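
The change to _categorize_metrics above amounts to a four-way split. The sketch below uses stand-in tag sets rather than the real Record classes returned by PerfAnalyzer.get_gpu_metrics() / get_perf_metrics() / get_llm_metrics(), so take it as an approximation of the logic, not the actual implementation.

# Simplified, stand-alone sketch of the four-way categorization; the tag sets
# are stand-ins for the Record classes compared in the real code.
GPU_TAGS = {"gpu_used_memory", "gpu_utilization", "gpu_power_usage"}
PERF_TAGS = {"perf_throughput", "perf_latency_p99"}
LLM_TAGS = {"avg_first_token_latency", "avg_token_to_token_latency"}
CPU_TAGS = {"cpu_available_ram", "cpu_used_ram"}

def categorize(metric_tags, collect_cpu_metrics=False):
    gpu, perf, cpu, llm = [], [], [], []
    for tag in metric_tags:
        if tag in GPU_TAGS:
            gpu.append(tag)
        elif tag in PERF_TAGS:
            perf.append(tag)
        elif tag in LLM_TAGS:
            llm.append(tag)
        elif collect_cpu_metrics and tag in CPU_TAGS:
            cpu.append(tag)
    return gpu, perf, cpu, llm

# categorize(["perf_throughput", "avg_first_token_latency"])
# -> ([], ["perf_throughput"], [], ["avg_first_token_latency"])
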
2 changes: 1 addition & 1 deletion model_analyzer/record/types/avg_first_token_latency.py
@@ -22,7 +22,7 @@
@total_ordering
class AvgFirstTokenLatency(DecreasingRecord):
"""
A record for perf_analyzer avg first token to token latency metric
A record for perf_analyzer average first token latency metric
"""

tag = "avg_first_token_latency"
2 changes: 1 addition & 1 deletion model_analyzer/record/types/avg_token_to_token_latency.py
@@ -22,7 +22,7 @@
@total_ordering
class AvgTokenToTokenLatency(DecreasingRecord):
"""
A record for perf_analyzer avg token-to-token latency metric
A record for perf_analyzer average token-to-token latency metric
"""

tag = "avg_token_to_token_latency"
6 changes: 6 additions & 0 deletions tests/common/test_utils.py
@@ -238,6 +238,7 @@ def convert_avg_gpu_metrics_to_data(avg_gpu_metric_values):
def construct_perf_analyzer_config(
model_name="my-model",
output_file_name="my-model-results.csv",
export_file_name="my-model-results.json",
batch_size=DEFAULT_BATCH_SIZES,
concurrency=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
periodic_concurrency=DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY,
@@ -257,6 +258,8 @@
The name of the model
output_file_name: str
The name of the output file
export_file_name: str
The name of the export file
batch_size: int
The batch size for this PA configuration
concurrency: int
@@ -285,6 +288,9 @@
pa_config._options["-f"] = output_file_name
pa_config._options["-b"] = batch_size

if llm_search_mode:
pa_config._options["--profile-export-file"] = export_file_name

if request_rate:
pa_config._args["request-rate-range"] = request_rate
elif llm_search_mode:
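
A hypothetical use of the updated test helper, assuming llm_search_mode is a keyword argument of construct_perf_analyzer_config, as its use in the snippet above suggests:

# Hypothetical call; with llm_search_mode enabled the returned config should
# carry the profile-export option in addition to the latency report file.
pa_config = construct_perf_analyzer_config(
    model_name="my-model",
    output_file_name="my-model-results.csv",
    export_file_name="my-model-results.json",
    llm_search_mode=True,
)
assert pa_config._options["--profile-export-file"] == "my-model-results.json"
assert pa_config._options["-f"] == "my-model-results.csv"
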