Commit: Build LLM leaderboard

allenporter committed Aug 4, 2024
1 parent cbbf81e commit d8e99cc
Showing 6 changed files with 278 additions and 82 deletions.
9 changes: 7 additions & 2 deletions home_assistant_datasets/tool/__main__.py
@@ -6,7 +6,7 @@
import sys
from pathlib import Path

from . import leaderboard
from .leaderboard import prebuild as leaderboard_prebuild, build as leaderboard_build
from .assist import collect as assist_collect, eval as assist_eval


@@ -27,7 +27,12 @@ def get_base_arg_parser() -> argparse.ArgumentParser:
assist_collect.create_arguments(assist_subparsers.add_parser("collect"))
assist_eval.create_arguments(assist_subparsers.add_parser("eval"))

leaderboard.create_arguments(subparsers.add_parser("leaderboard"))
leaderboard_parser = subparsers.add_parser("leaderboard")
leaderboard_subparsers = leaderboard_parser.add_subparsers(
dest="subaction", help="Sub Action", required=True
)
leaderboard_prebuild.create_arguments(leaderboard_subparsers.add_parser("prebuild"))
leaderboard_build.create_arguments(leaderboard_subparsers.add_parser("build"))

return parser

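For reference, a minimal self-contained sketch of the nested subparser layout this hunk introduces. The top-level `dest="action"` name and the final dispatch to each subcommand's `run()` are not shown in this diff, so those parts are assumptions; only the `leaderboard {prebuild,build}` wiring mirrors the change above.

```python
# Illustration only: how the nested "leaderboard" subparsers parse, assuming a
# top-level dest of "action" (the surrounding __main__.py code is not in this diff).
import argparse

parser = argparse.ArgumentParser(prog="home-assistant-datasets")
subparsers = parser.add_subparsers(dest="action", required=True)

leaderboard_parser = subparsers.add_parser("leaderboard")
leaderboard_subparsers = leaderboard_parser.add_subparsers(
    dest="subaction", help="Sub Action", required=True
)
for name in ("prebuild", "build"):
    # Both subcommands take the same --report-dir flag (see prebuild.py/build.py below).
    leaderboard_subparsers.add_parser(name).add_argument(
        "--report-dir", type=str, default="reports"
    )

args = parser.parse_args(["leaderboard", "prebuild", "--report-dir=reports"])
print(args.action, args.subaction, args.report_dir)  # leaderboard prebuild reports
```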
89 changes: 9 additions & 80 deletions home_assistant_datasets/tool/leaderboard/__init__.py
@@ -1,83 +1,12 @@
"""Build the llm leaderboard based on eval results.
"""Leaderboard subcommand.
"""

import argparse
import logging
import pathlib
import subprocess
from typing import Any

import yaml


__all__ = []

_LOGGER = logging.getLogger(__name__)

REPORT_DIR = "reports"
DATASETS = [
"assist",
"assist-mini",
"intents",
]
IGNORE_REPORTS = {
"reports/assist/2024.6.0dev-baseline-2024-05-27",
"reports/assist/2024.6.0dev-v1-2024-05-27",
"reports/assist/2024.6.0dev-v2-2024-05-29",
"reports/assist/2024.6.0dev-v3-2024-05-31",
}
REPORT_FILE = "reports.yaml"


EVAL_CMD = [
"home-assistant-datasets",
"assist",
"eval",
"--output_type=report",
]
```
usage: home-assistant-datasets leaderboard [-h] {prebuild,build} ...
positional arguments:
{prebuild,build} Sub Action
def create_arguments(args: argparse.ArgumentParser) -> None:
"""Get parsed passed in arguments."""
args.add_argument(
"--report-dir",
type=str,
default=REPORT_DIR,
help="Specifies the report dataset directory created by `eval` commands",
)


def run(args: argparse.Namespace) -> int:
"""Run the command line action."""
report_dir = pathlib.Path(args.report_dir)

for dataset in DATASETS:
dataset_dir = report_dir / dataset
for filename in dataset_dir.iterdir():
if not filename.is_dir():
continue
if str(filename) in IGNORE_REPORTS:
_LOGGER.debug("Ignoring report directory %s", filename)
continue

print(f"Generating report for outputs in {filename}")

filename_parts = str(filename).split("/")
assert filename_parts[0] == REPORT_DIR
assert len(filename_parts) >= 3, filename_parts
assert dataset == filename_parts[1], filename_parts
dataset_label = filename_parts[2]
print(f"Generating report for {dataset} {dataset_label}")

cmds = EVAL_CMD + [f"--model_output_dir={filename}"]
p = subprocess.Popen(cmds, stdout=subprocess.PIPE)
(report_output, _) = p.communicate()
if p.returncode:
return p.returncode

output_file = filename / REPORT_FILE
output_file.write_bytes(report_output)
print(f"Writing {output_file}")

return 0
options:
-h, --help show this help message and exit
```
"""
131 changes: 131 additions & 0 deletions home_assistant_datasets/tool/leaderboard/build.py
@@ -0,0 +1,131 @@
"""Build the llm leaderboard based on the pre-build eval results.
```
usage: home-assistant-datasets leaderboard build [-h] [--report-dir REPORT_DIR]
options:
-h, --help show this help message and exit
--report-dir REPORT_DIR
Specifies the report dataset directory created by `eval` commands
```
"""

import argparse
import logging
from dataclasses import dataclass
import math
import pathlib
import subprocess
from typing import Any

import yaml

from .config import REPORT_DIR, DATASETS, IGNORE_REPORTS, REPORT_FILE, eval_reports, EvalReport


__all__ = []

_LOGGER = logging.getLogger(__name__)

LEADERBOARD_FILE = "leaderboard.md"


def create_arguments(args: argparse.ArgumentParser) -> None:
"""Get parsed passed in arguments."""
args.add_argument(
"--report-dir",
type=str,
default=REPORT_DIR,
help="Specifies the report dataset directory created by `eval` commands",
)


@dataclass
class ModelRecord:
model_id: str
dataset: str
dataset_label: str
good: int
total: int
good_percent: str

def good_percent_value(self) -> float:
return self.good / self.total

@property
def stddev(self) -> float:
"""Compute the stddev of the score."""
p = self.good_percent_value()
return math.sqrt((p * (1 - p)) / self.total)



def run(args: argparse.Namespace) -> int:
"""Run the command line action."""
report_dir = pathlib.Path(args.report_dir)

model_scores: dict[str, dict[str, list[ModelRecord]]] = {}
for eval_report in eval_reports(report_dir):
report_file = eval_report.report_file
        if not report_file.exists():
            raise ValueError(f"Report file {report_file} does not exist, run `prebuild` first")

report = yaml.load(eval_report.report_file.read_text(), Loader=yaml.CSafeLoader)
for model_data in report:
model_id = model_data["model_id"]
if model_id not in model_scores:
model_scores[model_id] = {}
if eval_report.dataset not in model_scores[model_id]:
model_scores[model_id][eval_report.dataset] = []

model_scores[model_id][eval_report.dataset].append(
ModelRecord(
**model_data,
dataset=eval_report.dataset,
dataset_label=eval_report.dataset_label,
)
)


# Sort reports by their best scores
for model_id in model_scores:
for dataset in DATASETS:
if dataset not in model_scores[model_id]:
model_scores[model_id][dataset] = []
records = model_scores[model_id][dataset]
records = sorted(records, key=ModelRecord.good_percent_value, reverse=True)
            model_scores[model_id][dataset] = records

# Build leaderboard sorted by the first dataset score
def best_score(model_id: str) -> float:
records = model_scores[model_id][DATASETS[0]]
return records[0].good_percent_value() if records else 0

sorted_model_ids = sorted(model_scores.keys(), key=best_score, reverse=True)


results = [
["| Model | ", " | ".join(DATASETS), "|"],
["| ----- " * (len(DATASETS) + 1), "|"],
]
for model_id in sorted_model_ids:
row = [f"| {model_id} "]
for dataset in DATASETS:
records = model_scores[model_id][dataset]
if records:
best_record = records[0]
row.append(f"| {best_record.good_percent_value()*100:0.2f}% (+/- {best_record.stddev*100:0.2f}%) ")
else:
row.append(f"| 0 ")
row.append("|")
results.append(row)


leaderboard_file = report_dir / LEADERBOARD_FILE
print(f"Updating {leaderboard_file}")
leaderboard_file.write_text("\n".join([
"".join(row)
for row in results
]))

return 0
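The `(+/- ...)` error bars written into the table come from `ModelRecord.stddev`, the binomial standard error `sqrt(p * (1 - p) / total)`. A quick check, assuming `good=73, total=80` for the assist dataset (these counts are not in this diff, they are simply numbers consistent with the published table):

```python
# Reproduce the "91.25% (+/- 3.16%)" assist cell using the same formula as
# ModelRecord.stddev; good/total below are assumed, not taken from this commit.
import math

good, total = 73, 80
p = good / total                            # 0.9125
stddev = math.sqrt((p * (1 - p)) / total)   # binomial standard error
print(f"{p * 100:0.2f}% (+/- {stddev * 100:0.2f}%)")  # 91.25% (+/- 3.16%)
```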
53 changes: 53 additions & 0 deletions home_assistant_datasets/tool/leaderboard/config.py
@@ -0,0 +1,53 @@
"""Configuration for the leaderboard."""

import logging
import pathlib
from dataclasses import dataclass
from collections.abc import Generator

_LOGGER = logging.getLogger(__name__)

REPORT_DIR = "reports"
DATASETS = [
"assist",
"assist-mini",
"intents",
]
IGNORE_REPORTS = {
"reports/assist/2024.6.0dev-baseline-2024-05-27",
"reports/assist/2024.6.0dev-v1-2024-05-27",
"reports/assist/2024.6.0dev-v2-2024-05-29",
"reports/assist/2024.6.0dev-v3-2024-05-31",
}
REPORT_FILE = "reports.yaml"


@dataclass
class EvalReport:
directory: pathlib.Path
dataset: str # e.g. assist-mini
dataset_label: str # e.g. home assistant version

@property
def report_file(self) -> pathlib.Path:
return self.directory / REPORT_FILE


def eval_reports(report_dir: pathlib.Path) -> Generator[EvalReport]:
"""Generate the list of eval reports."""
for dataset in DATASETS:
dataset_dir = report_dir / dataset
for filename in dataset_dir.iterdir():
if not filename.is_dir():
continue
if str(filename) in IGNORE_REPORTS:
_LOGGER.debug("Ignoring report directory %s", filename)
continue

filename_parts = str(filename).split("/")
assert filename_parts[0] == REPORT_DIR
assert len(filename_parts) >= 3, filename_parts
assert dataset == filename_parts[1], filename_parts
dataset_label = filename_parts[2]

yield EvalReport(filename, dataset, dataset_label)
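A short usage sketch of `eval_reports()`. The report label below is hypothetical; the helper only yields directories two levels under `report_dir` (i.e. `reports/<dataset>/<label>/`), skips anything in `IGNORE_REPORTS`, and the `filename_parts[0] == REPORT_DIR` assert means it expects a relative `reports/...` path:

```python
# Hypothetical layout ("2024.8.0-example" is a made-up label for illustration):
#   reports/assist/2024.8.0-example/   -> dataset="assist", dataset_label="2024.8.0-example"
#   reports/assist-mini/...            -> dataset="assist-mini", ...
import pathlib

from home_assistant_datasets.tool.leaderboard.config import eval_reports

for eval_report in eval_reports(pathlib.Path("reports")):
    # report_file is <directory>/reports.yaml, written by the prebuild step.
    print(eval_report.dataset, eval_report.dataset_label, eval_report.report_file)
```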
64 changes: 64 additions & 0 deletions home_assistant_datasets/tool/leaderboard/prebuild.py
@@ -0,0 +1,64 @@
"""Build all the assist eval reports needed to build the leaderboard.
```
usage: home-assistant-datasets leaderboard prebuild [-h] [--report-dir REPORT_DIR]
options:
-h, --help show this help message and exit
--report-dir REPORT_DIR
Specifies the report dataset directory created by `eval` commands
```
"""

import argparse
import logging
import pathlib
import subprocess
from typing import Any

import yaml

from .config import REPORT_DIR, DATASETS, IGNORE_REPORTS, REPORT_FILE, eval_reports

__all__ = []

_LOGGER = logging.getLogger(__name__)



EVAL_CMD = [
"home-assistant-datasets",
"assist",
"eval",
"--output_type=report",
]


def create_arguments(args: argparse.ArgumentParser) -> None:
"""Get parsed passed in arguments."""
args.add_argument(
"--report-dir",
type=str,
default=REPORT_DIR,
help="Specifies the report dataset directory created by `eval` commands",
)


def run(args: argparse.Namespace) -> int:
"""Run the command line action."""
report_dir = pathlib.Path(args.report_dir)

for eval_report in eval_reports(report_dir):
print(f"Generating report for outputs in {eval_report.directory}")
cmds = EVAL_CMD + [f"--model_output_dir={eval_report.directory}"]
_LOGGER.debug(cmds)
p = subprocess.Popen(cmds, stdout=subprocess.PIPE)
(report_output, _) = p.communicate()
if p.returncode:
return p.returncode

output_file = eval_report.report_file
output_file.write_bytes(report_output)
print(f"Writing {output_file}")

return 0
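Taken together with `build.py`, this gives a two-step flow: `prebuild` shells out to `home-assistant-datasets assist eval --output_type=report` for every report directory and writes its `reports.yaml`, then `build` aggregates those files into `reports/leaderboard.md`. A minimal sketch, assuming the `home-assistant-datasets` entry point named in `EVAL_CMD` is installed:

```python
# Two-step leaderboard refresh (sketch; assumes the CLI entry point is on PATH
# and is run from the repository root so the relative reports/ path resolves).
import subprocess

# 1. Write a reports.yaml into each eval output directory under reports/.
subprocess.run(["home-assistant-datasets", "leaderboard", "prebuild"], check=True)

# 2. Aggregate the reports.yaml files into reports/leaderboard.md.
subprocess.run(["home-assistant-datasets", "leaderboard", "build"], check=True)
```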
14 changes: 14 additions & 0 deletions reports/leaderboard.md
@@ -0,0 +1,14 @@
| Model | assist | assist-mini | intents|
| ----- | ----- | ----- | ----- |
| gemini-1.5-flash | 91.25% (+/- 3.16%) | 97.96% (+/- 2.02%) | 97.96% (+/- 2.02%) |
| gpt-4o-mini | 90.00% (+/- 3.35%) | 97.96% (+/- 2.02%) | 97.96% (+/- 2.02%) |
| gpt-4o | 87.50% (+/- 3.70%) | 0 | 0 |
| gpt-3.5 | 75.00% (+/- 4.84%) | 0 | 0 |
| functionary-small-v2.5 | 56.25% (+/- 5.55%) | 63.27% (+/- 6.89%) | 63.27% (+/- 6.89%) |
| llama3.1 | 45.57% (+/- 5.60%) | 83.67% (+/- 5.28%) | 83.67% (+/- 5.28%) |
| home-llm | 45.00% (+/- 5.56%) | 34.69% (+/- 6.80%) | 34.69% (+/- 6.80%) |
| assistant | 37.50% (+/- 5.41%) | 63.27% (+/- 6.89%) | 63.27% (+/- 6.89%) |
| xlam-7b | 25.00% (+/- 4.84%) | 85.71% (+/- 5.00%) | 85.71% (+/- 5.00%) |
| llama3-groq-tool-use | 20.00% (+/- 4.47%) | 51.02% (+/- 7.14%) | 51.02% (+/- 7.14%) |
| mistral-v3 | 3.75% (+/- 2.12%) | 2.04% (+/- 2.02%) | 2.04% (+/- 2.02%) |
| xlam-1b | 0 | 27.08% (+/- 6.41%) | 27.08% (+/- 6.41%) |
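
In the generated leaderboard, each cell is the best-scoring run `build.py` found for that model and dataset, formatted as `good% (+/- stddev%)` with the binomial error bar described above; a bare `0` means no (non-ignored) eval report exists yet for that model/dataset pair.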
