Commit d8e99cc (parent cbbf81e)
Showing 6 changed files with 278 additions and 82 deletions.
@@ -1,83 +1,12 @@ (the leaderboard subcommand module)

The module shrinks from an 83-line implementation to a 12-line docstring stub: the constants and report-directory walk move into the new config module, and the report generation moves into the new prebuild subcommand, both shown below. The new contents:

"""Leaderboard subcommand.

```
usage: home-assistant-datasets leaderboard [-h] {prebuild,build} ...

positional arguments:
  {prebuild,build}  Sub Action

options:
  -h, --help        show this help message and exit
```
"""

The removed implementation:

"""Build the llm leaderboard based on eval results."""

import argparse
import logging
import pathlib
import subprocess
from typing import Any

import yaml


__all__ = []

_LOGGER = logging.getLogger(__name__)

REPORT_DIR = "reports"
DATASETS = [
    "assist",
    "assist-mini",
    "intents",
]
IGNORE_REPORTS = {
    "reports/assist/2024.6.0dev-baseline-2024-05-27",
    "reports/assist/2024.6.0dev-v1-2024-05-27",
    "reports/assist/2024.6.0dev-v2-2024-05-29",
    "reports/assist/2024.6.0dev-v3-2024-05-31",
}
REPORT_FILE = "reports.yaml"


EVAL_CMD = [
    "home-assistant-datasets",
    "assist",
    "eval",
    "--output_type=report",
]


def create_arguments(args: argparse.ArgumentParser) -> None:
    """Get parsed passed in arguments."""
    args.add_argument(
        "--report-dir",
        type=str,
        default=REPORT_DIR,
        help="Specifies the report dataset directory created by `eval` commands",
    )


def run(args: argparse.Namespace) -> int:
    """Run the command line action."""
    report_dir = pathlib.Path(args.report_dir)

    for dataset in DATASETS:
        dataset_dir = report_dir / dataset
        for filename in dataset_dir.iterdir():
            if not filename.is_dir():
                continue
            if str(filename) in IGNORE_REPORTS:
                _LOGGER.debug("Ignoring report directory %s", filename)
                continue

            print(f"Generating report for outputs in {filename}")

            filename_parts = str(filename).split("/")
            assert filename_parts[0] == REPORT_DIR
            assert len(filename_parts) >= 3, filename_parts
            assert dataset == filename_parts[1], filename_parts
            dataset_label = filename_parts[2]
            print(f"Generating report for {dataset} {dataset_label}")

            cmds = EVAL_CMD + [f"--model_output_dir={filename}"]
            p = subprocess.Popen(cmds, stdout=subprocess.PIPE)
            (report_output, _) = p.communicate()
            if p.returncode:
                return p.returncode

            output_file = filename / REPORT_FILE
            output_file.write_bytes(report_output)
            print(f"Writing {output_file}")

    return 0
@@ -0,0 +1,131 @@ (new file: the build subcommand)
"""Build the llm leaderboard based on the pre-build eval results. | ||
``` | ||
usage: home-assistant-datasets leaderboard build [-h] [--report-dir REPORT_DIR] | ||
options: | ||
-h, --help show this help message and exit | ||
--report-dir REPORT_DIR | ||
Specifies the report dataset directory created by `eval` commands | ||
``` | ||
""" | ||
|
||
import argparse | ||
import logging | ||
from dataclasses import dataclass | ||
import math | ||
import pathlib | ||
import subprocess | ||
from typing import Any | ||
|
||
import yaml | ||
|
||
from .config import REPORT_DIR, DATASETS, IGNORE_REPORTS, REPORT_FILE, eval_reports, EvalReport | ||
|
||
|
||
__all__ = [] | ||
|
||
_LOGGER = logging.getLogger(__name__) | ||
|
||
LEADERBOARD_FILE = "leaderboard.md" | ||
|
||
|
||
def create_arguments(args: argparse.ArgumentParser) -> None: | ||
"""Get parsed passed in arguments.""" | ||
args.add_argument( | ||
"--report-dir", | ||
type=str, | ||
default=REPORT_DIR, | ||
help="Specifies the report dataset directory created by `eval` commands", | ||
) | ||
|
||
|
||
@dataclass | ||
class ModelRecord: | ||
model_id: str | ||
dataset: str | ||
dataset_label: str | ||
good: int | ||
total: int | ||
good_percent: str | ||
|
||
def good_percent_value(self) -> float: | ||
return self.good / self.total | ||
|
||
@property | ||
def stddev(self) -> float: | ||
"""Compute the stddev of the score.""" | ||
p = self.good_percent_value() | ||
return math.sqrt((p * (1 - p)) / self.total) | ||
|
||
|
||
|
||
def run(args: argparse.Namespace) -> int: | ||
"""Run the command line action.""" | ||
report_dir = pathlib.Path(args.report_dir) | ||
|
||
model_scores: dict[str, dict[str, list[ModelRecord]]] = {} | ||
for eval_report in eval_reports(report_dir): | ||
report_file = eval_report.report_file | ||
if not report_file.exists: | ||
raise ValueError(f"Report file {report_file} does not exist, run `prebuild` first") | ||
|
||
report = yaml.load(eval_report.report_file.read_text(), Loader=yaml.CSafeLoader) | ||
for model_data in report: | ||
model_id = model_data["model_id"] | ||
if model_id not in model_scores: | ||
model_scores[model_id] = {} | ||
if eval_report.dataset not in model_scores[model_id]: | ||
model_scores[model_id][eval_report.dataset] = [] | ||
|
||
model_scores[model_id][eval_report.dataset].append( | ||
ModelRecord( | ||
**model_data, | ||
dataset=eval_report.dataset, | ||
dataset_label=eval_report.dataset_label, | ||
) | ||
) | ||
|
||
|
||
# Sort reports by their best scores | ||
for model_id in model_scores: | ||
for dataset in DATASETS: | ||
if dataset not in model_scores[model_id]: | ||
model_scores[model_id][dataset] = [] | ||
records = model_scores[model_id][dataset] | ||
records = sorted(records, key=ModelRecord.good_percent_value, reverse=True) | ||
model_scores[model_id][eval_report.dataset] = records | ||
|
||
# Build leaderboard sorted by the first dataset score | ||
def best_score(model_id: str) -> float: | ||
records = model_scores[model_id][DATASETS[0]] | ||
return records[0].good_percent_value() if records else 0 | ||
|
||
sorted_model_ids = sorted(model_scores.keys(), key=best_score, reverse=True) | ||
|
||
|
||
results = [ | ||
["| Model | ", " | ".join(DATASETS), "|"], | ||
["| ----- " * (len(DATASETS) + 1), "|"], | ||
] | ||
for model_id in sorted_model_ids: | ||
row = [f"| {model_id} "] | ||
for dataset in DATASETS: | ||
records = model_scores[model_id][dataset] | ||
if records: | ||
best_record = records[0] | ||
row.append(f"| {best_record.good_percent_value()*100:0.2f}% (+/- {best_record.stddev*100:0.2f}%) ") | ||
else: | ||
row.append(f"| 0 ") | ||
row.append("|") | ||
results.append(row) | ||
|
||
|
||
leaderboard_file = report_dir / LEADERBOARD_FILE | ||
print(f"Updating {leaderboard_file}") | ||
leaderboard_file.write_text("\n".join([ | ||
"".join(row) | ||
for row in results | ||
])) | ||
|
||
return 0 |
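The `ModelRecord(**model_data, ...)` expansion implies that each `reports.yaml` parses into a list of mappings whose keys match the remaining dataclass fields. A sketch of the assumed shape; the values are illustrative, not taken from the repository:

```python
# Assumed structure of a reports.yaml file as consumed by run() above.
# Keys mirror ModelRecord minus dataset/dataset_label; values are made up.
example_report = [
    {"model_id": "gemini-1.5-flash", "good": 73, "total": 80, "good_percent": "91.25%"},
    {"model_id": "gpt-4o-mini", "good": 72, "total": 80, "good_percent": "90.00%"},
]
```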
@@ -0,0 +1,53 @@ (new file: the leaderboard config module)
"""Configuration for the leaderboard.""" | ||
|
||
import logging | ||
import pathlib | ||
from dataclasses import dataclass | ||
from collections.abc import Generator | ||
|
||
_LOGGER = logging.getLogger(__name__) | ||
|
||
REPORT_DIR = "reports" | ||
DATASETS = [ | ||
"assist", | ||
"assist-mini", | ||
"intents", | ||
] | ||
IGNORE_REPORTS = { | ||
"reports/assist/2024.6.0dev-baseline-2024-05-27", | ||
"reports/assist/2024.6.0dev-v1-2024-05-27", | ||
"reports/assist/2024.6.0dev-v2-2024-05-29", | ||
"reports/assist/2024.6.0dev-v3-2024-05-31", | ||
} | ||
REPORT_FILE = "reports.yaml" | ||
|
||
|
||
@dataclass | ||
class EvalReport: | ||
directory: pathlib.Path | ||
dataset: str # e.g. assist-mini | ||
dataset_label: str # e.g. home assistant version | ||
|
||
@property | ||
def report_file(self) -> pathlib.Path: | ||
return self.directory / REPORT_FILE | ||
|
||
|
||
def eval_reports(report_dir: pathlib.Path) -> Generator[EvalReport]: | ||
"""Generate the list of eval reports.""" | ||
for dataset in DATASETS: | ||
dataset_dir = report_dir / dataset | ||
for filename in dataset_dir.iterdir(): | ||
if not filename.is_dir(): | ||
continue | ||
if str(filename) in IGNORE_REPORTS: | ||
_LOGGER.debug("Ignoring report directory %s", filename) | ||
continue | ||
|
||
filename_parts = str(filename).split("/") | ||
assert filename_parts[0] == REPORT_DIR | ||
assert len(filename_parts) >= 3, filename_parts | ||
assert dataset == filename_parts[1], filename_parts | ||
dataset_label = filename_parts[2] | ||
|
||
yield EvalReport(filename, dataset, dataset_label) |
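A short usage sketch for the generator, assuming the default `reports` tree is present on disk:

```python
import pathlib

# Walk reports/<dataset>/<label> directories, skipping IGNORE_REPORTS,
# and print the dataset, label, and reports.yaml path for each one.
for report in eval_reports(pathlib.Path(REPORT_DIR)):
    print(report.dataset, report.dataset_label, report.report_file)
```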
@@ -0,0 +1,64 @@ (new file: the prebuild subcommand)
"""Build all the assist eval reports needed to build the leaderboard. | ||
``` | ||
usage: home-assistant-datasets leaderboard prebuild [-h] [--report-dir REPORT_DIR] | ||
options: | ||
-h, --help show this help message and exit | ||
--report-dir REPORT_DIR | ||
Specifies the report dataset directory created by `eval` commands | ||
``` | ||
""" | ||
|
||
import argparse | ||
import logging | ||
import pathlib | ||
import subprocess | ||
from typing import Any | ||
|
||
import yaml | ||
|
||
from .config import REPORT_DIR, DATASETS, IGNORE_REPORTS, REPORT_FILE, eval_reports | ||
|
||
__all__ = [] | ||
|
||
_LOGGER = logging.getLogger(__name__) | ||
|
||
|
||
|
||
EVAL_CMD = [ | ||
"home-assistant-datasets", | ||
"assist", | ||
"eval", | ||
"--output_type=report", | ||
] | ||
|
||
|
||
def create_arguments(args: argparse.ArgumentParser) -> None: | ||
"""Get parsed passed in arguments.""" | ||
args.add_argument( | ||
"--report-dir", | ||
type=str, | ||
default=REPORT_DIR, | ||
help="Specifies the report dataset directory created by `eval` commands", | ||
) | ||
|
||
|
||
def run(args: argparse.Namespace) -> int: | ||
"""Run the command line action.""" | ||
report_dir = pathlib.Path(args.report_dir) | ||
|
||
for eval_report in eval_reports(report_dir): | ||
print(f"Generating report for outputs in {eval_report.directory}") | ||
cmds = EVAL_CMD + [f"--model_output_dir={eval_report.directory}"] | ||
_LOGGER.debug(cmds) | ||
p = subprocess.Popen(cmds, stdout=subprocess.PIPE) | ||
(report_output, _) = p.communicate() | ||
if p.returncode: | ||
return p.returncode | ||
|
||
output_file = eval_report.report_file | ||
output_file.write_bytes(report_output) | ||
print(f"Writing {output_file}") | ||
|
||
return 0 |
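Taken together with the build subcommand, the flow the docstrings describe is: `home-assistant-datasets leaderboard prebuild` runs the eval report command over every report directory and writes a `reports.yaml` into each, then `home-assistant-datasets leaderboard build` aggregates those files into `leaderboard.md`.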
@@ -0,0 +1,14 @@ (new file: the generated leaderboard.md)
| Model | assist | assist-mini | intents |
| ----- | ----- | ----- | ----- |
| gemini-1.5-flash | 91.25% (+/- 3.16%) | 97.96% (+/- 2.02%) | 97.96% (+/- 2.02%) |
| gpt-4o-mini | 90.00% (+/- 3.35%) | 97.96% (+/- 2.02%) | 97.96% (+/- 2.02%) |
| gpt-4o | 87.50% (+/- 3.70%) | 0 | 0 |
| gpt-3.5 | 75.00% (+/- 4.84%) | 0 | 0 |
| functionary-small-v2.5 | 56.25% (+/- 5.55%) | 63.27% (+/- 6.89%) | 63.27% (+/- 6.89%) |
| llama3.1 | 45.57% (+/- 5.60%) | 83.67% (+/- 5.28%) | 83.67% (+/- 5.28%) |
| home-llm | 45.00% (+/- 5.56%) | 34.69% (+/- 6.80%) | 34.69% (+/- 6.80%) |
| assistant | 37.50% (+/- 5.41%) | 63.27% (+/- 6.89%) | 63.27% (+/- 6.89%) |
| xlam-7b | 25.00% (+/- 4.84%) | 85.71% (+/- 5.00%) | 85.71% (+/- 5.00%) |
| llama3-groq-tool-use | 20.00% (+/- 4.47%) | 51.02% (+/- 7.14%) | 51.02% (+/- 7.14%) |
| mistral-v3 | 3.75% (+/- 2.12%) | 2.04% (+/- 2.02%) | 2.04% (+/- 2.02%) |
| xlam-1b | 0 | 27.08% (+/- 6.41%) | 27.08% (+/- 6.41%) |
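The +/- figures are the standard error computed by `ModelRecord.stddev`. A worked check of the first row, with the sample size inferred from the interval (n = 80 is an assumption, not stated in the diff):

```python
import math

good, total = 73, 80                     # assumed counts giving 91.25%
p = good / total                         # 0.9125
stddev = math.sqrt(p * (1 - p) / total)  # 0.0316
print(f"{p * 100:0.2f}% (+/- {stddev * 100:0.2f}%)")  # 91.25% (+/- 3.16%)
```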