Commit: Build LLM leaderboard

allenporter committed Aug 4, 2024
1 parent cbbf81e commit d8e99cc
Showing 6 changed files with 278 additions and 82 deletions.
9 changes: 7 additions & 2 deletions home_assistant_datasets/tool/__main__.py
@@ -6,7 +6,7 @@
import sys
from pathlib import Path

from . import leaderboard
from .leaderboard import prebuild as leaderboard_prebuild, build as leaderboard_build
from .assist import collect as assist_collect, eval as assist_eval


@@ -27,7 +27,12 @@ def get_base_arg_parser() -> argparse.ArgumentParser:
assist_collect.create_arguments(assist_subparsers.add_parser("collect"))
assist_eval.create_arguments(assist_subparsers.add_parser("eval"))

leaderboard.create_arguments(subparsers.add_parser("leaderboard"))
leaderboard_parser = subparsers.add_parser("leaderboard")
leaderboard_subparsers = leaderboard_parser.add_subparsers(
dest="subaction", help="Sub Action", required=True
)
leaderboard_prebuild.create_arguments(leaderboard_subparsers.add_parser("prebuild"))
leaderboard_build.create_arguments(leaderboard_subparsers.add_parser("build"))

return parser

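For reference, a minimal self-contained sketch of the nested subparser layout this hunk introduces. The top-level `dest="action"` name and the final dispatch to each subcommand's `run()` are not shown in this diff, so those parts are assumptions; only the `leaderboard {prebuild,build}` wiring mirrors the change above.

```python
# Illustration only: how the nested "leaderboard" subparsers parse, assuming a
# top-level dest of "action" (the surrounding __main__.py code is not in this diff).
import argparse

parser = argparse.ArgumentParser(prog="home-assistant-datasets")
subparsers = parser.add_subparsers(dest="action", required=True)

leaderboard_parser = subparsers.add_parser("leaderboard")
leaderboard_subparsers = leaderboard_parser.add_subparsers(
    dest="subaction", help="Sub Action", required=True
)
for name in ("prebuild", "build"):
    # Both subcommands take the same --report-dir flag (see prebuild.py/build.py below).
    leaderboard_subparsers.add_parser(name).add_argument(
        "--report-dir", type=str, default="reports"
    )

args = parser.parse_args(["leaderboard", "prebuild", "--report-dir=reports"])
print(args.action, args.subaction, args.report_dir)  # leaderboard prebuild reports
```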
89 changes: 9 additions & 80 deletions home_assistant_datasets/tool/leaderboard/__init__.py
@@ -1,83 +1,12 @@
"""Build the llm leaderboard based on eval results.
"""Leaderboard subcommand.
"""

import argparse
import logging
import pathlib
import subprocess
from typing import Any

import yaml


__all__ = []

_LOGGER = logging.getLogger(__name__)

REPORT_DIR = "reports"
DATASETS = [
"assist",
"assist-mini",
"intents",
]
IGNORE_REPORTS = {
"reports/assist/2024.6.0dev-baseline-2024-05-27",
"reports/assist/2024.6.0dev-v1-2024-05-27",
"reports/assist/2024.6.0dev-v2-2024-05-29",
"reports/assist/2024.6.0dev-v3-2024-05-31",
}
REPORT_FILE = "reports.yaml"


EVAL_CMD = [
"home-assistant-datasets",
"assist",
"eval",
"--output_type=report",
]
```
usage: home-assistant-datasets leaderboard [-h] {prebuild,build} ...
positional arguments:
{prebuild,build} Sub Action
def create_arguments(args: argparse.ArgumentParser) -> None:
"""Get parsed passed in arguments."""
args.add_argument(
"--report-dir",
type=str,
default=REPORT_DIR,
help="Specifies the report dataset directory created by `eval` commands",
)


def run(args: argparse.Namespace) -> int:
"""Run the command line action."""
report_dir = pathlib.Path(args.report_dir)

for dataset in DATASETS:
dataset_dir = report_dir / dataset
for filename in dataset_dir.iterdir():
if not filename.is_dir():
continue
if str(filename) in IGNORE_REPORTS:
_LOGGER.debug("Ignoring report directory %s", filename)
continue

print(f"Generating report for outputs in {filename}")

filename_parts = str(filename).split("/")
assert filename_parts[0] == REPORT_DIR
assert len(filename_parts) >= 3, filename_parts
assert dataset == filename_parts[1], filename_parts
dataset_label = filename_parts[2]
print(f"Generating report for {dataset} {dataset_label}")

cmds = EVAL_CMD + [f"--model_output_dir={filename}"]
p = subprocess.Popen(cmds, stdout=subprocess.PIPE)
(report_output, _) = p.communicate()
if p.returncode:
return p.returncode

output_file = filename / REPORT_FILE
output_file.write_bytes(report_output)
print(f"Writing {output_file}")

return 0
options:
-h, --help show this help message and exit
```
"""
131 changes: 131 additions & 0 deletions home_assistant_datasets/tool/leaderboard/build.py
@@ -0,0 +1,131 @@
"""Build the llm leaderboard based on the pre-build eval results.
```
usage: home-assistant-datasets leaderboard build [-h] [--report-dir REPORT_DIR]
options:
-h, --help show this help message and exit
--report-dir REPORT_DIR
Specifies the report dataset directory created by `eval` commands
```
"""

import argparse
import logging
from dataclasses import dataclass
import math
import pathlib
import subprocess
from typing import Any

import yaml

from .config import REPORT_DIR, DATASETS, IGNORE_REPORTS, REPORT_FILE, eval_reports, EvalReport


__all__ = []

_LOGGER = logging.getLogger(__name__)

LEADERBOARD_FILE = "leaderboard.md"


def create_arguments(args: argparse.ArgumentParser) -> None:
"""Get parsed passed in arguments."""
args.add_argument(
"--report-dir",
type=str,
default=REPORT_DIR,
help="Specifies the report dataset directory created by `eval` commands",
)


@dataclass
class ModelRecord:
model_id: str
dataset: str
dataset_label: str
good: int
total: int
good_percent: str

def good_percent_value(self) -> float:
return self.good / self.total

@property
def stddev(self) -> float:
"""Compute the stddev of the score."""
p = self.good_percent_value()
return math.sqrt((p * (1 - p)) / self.total)



def run(args: argparse.Namespace) -> int:
"""Run the command line action."""
report_dir = pathlib.Path(args.report_dir)

model_scores: dict[str, dict[str, list[ModelRecord]]] = {}
for eval_report in eval_reports(report_dir):
report_file = eval_report.report_file
        if not report_file.exists():
            raise ValueError(f"Report file {report_file} does not exist, run `prebuild` first")

report = yaml.load(eval_report.report_file.read_text(), Loader=yaml.CSafeLoader)
for model_data in report:
model_id = model_data["model_id"]
if model_id not in model_scores:
model_scores[model_id] = {}
if eval_report.dataset not in model_scores[model_id]:
model_scores[model_id][eval_report.dataset] = []

model_scores[model_id][eval_report.dataset].append(
ModelRecord(
**model_data,
dataset=eval_report.dataset,
dataset_label=eval_report.dataset_label,
)
)


# Sort reports by their best scores
for model_id in model_scores:
for dataset in DATASETS:
if dataset not in model_scores[model_id]:
model_scores[model_id][dataset] = []
records = model_scores[model_id][dataset]
records = sorted(records, key=ModelRecord.good_percent_value, reverse=True)
            model_scores[model_id][dataset] = records

# Build leaderboard sorted by the first dataset score
def best_score(model_id: str) -> float:
records = model_scores[model_id][DATASETS[0]]
return records[0].good_percent_value() if records else 0

sorted_model_ids = sorted(model_scores.keys(), key=best_score, reverse=True)


results = [
["| Model | ", " | ".join(DATASETS), "|"],
["| ----- " * (len(DATASETS) + 1), "|"],
]
for model_id in sorted_model_ids:
row = [f"| {model_id} "]
for dataset in DATASETS:
records = model_scores[model_id][dataset]
if records:
best_record = records[0]
row.append(f"| {best_record.good_percent_value()*100:0.2f}% (+/- {best_record.stddev*100:0.2f}%) ")
else:
row.append(f"| 0 ")
row.append("|")
results.append(row)


leaderboard_file = report_dir / LEADERBOARD_FILE
print(f"Updating {leaderboard_file}")
leaderboard_file.write_text("\n".join([
"".join(row)
for row in results
]))

return 0
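The `(+/- ...)` error bars written into the table come from `ModelRecord.stddev`, the binomial standard error `sqrt(p * (1 - p) / total)`. A quick check, assuming `good=73, total=80` for the assist dataset (these counts are not in this diff, they are simply numbers consistent with the published table):

```python
# Reproduce the "91.25% (+/- 3.16%)" assist cell using the same formula as
# ModelRecord.stddev; good/total below are assumed, not taken from this commit.
import math

good, total = 73, 80
p = good / total                            # 0.9125
stddev = math.sqrt((p * (1 - p)) / total)   # binomial standard error
print(f"{p * 100:0.2f}% (+/- {stddev * 100:0.2f}%)")  # 91.25% (+/- 3.16%)
```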
53 changes: 53 additions & 0 deletions home_assistant_datasets/tool/leaderboard/config.py
@@ -0,0 +1,53 @@
"""Configuration for the leaderboard."""

import logging
import pathlib
from dataclasses import dataclass
from collections.abc import Generator

_LOGGER = logging.getLogger(__name__)

REPORT_DIR = "reports"
DATASETS = [
"assist",
"assist-mini",
"intents",
]
IGNORE_REPORTS = {
"reports/assist/2024.6.0dev-baseline-2024-05-27",
"reports/assist/2024.6.0dev-v1-2024-05-27",
"reports/assist/2024.6.0dev-v2-2024-05-29",
"reports/assist/2024.6.0dev-v3-2024-05-31",
}
REPORT_FILE = "reports.yaml"


@dataclass
class EvalReport:
directory: pathlib.Path
dataset: str # e.g. assist-mini
dataset_label: str # e.g. home assistant version

@property
def report_file(self) -> pathlib.Path:
return self.directory / REPORT_FILE


def eval_reports(report_dir: pathlib.Path) -> Generator[EvalReport]:
"""Generate the list of eval reports."""
for dataset in DATASETS:
dataset_dir = report_dir / dataset
for filename in dataset_dir.iterdir():
if not filename.is_dir():
continue
if str(filename) in IGNORE_REPORTS:
_LOGGER.debug("Ignoring report directory %s", filename)
continue

filename_parts = str(filename).split("/")
assert filename_parts[0] == REPORT_DIR
assert len(filename_parts) >= 3, filename_parts
assert dataset == filename_parts[1], filename_parts
dataset_label = filename_parts[2]

yield EvalReport(filename, dataset, dataset_label)
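A short usage sketch of `eval_reports()`. The report label below is hypothetical; the helper only yields directories two levels under `report_dir` (i.e. `reports/<dataset>/<label>/`), skips anything in `IGNORE_REPORTS`, and the `filename_parts[0] == REPORT_DIR` assert means it expects a relative `reports/...` path:

```python
# Hypothetical layout ("2024.8.0-example" is a made-up label for illustration):
#   reports/assist/2024.8.0-example/   -> dataset="assist", dataset_label="2024.8.0-example"
#   reports/assist-mini/...            -> dataset="assist-mini", ...
import pathlib

from home_assistant_datasets.tool.leaderboard.config import eval_reports

for eval_report in eval_reports(pathlib.Path("reports")):
    # report_file is <directory>/reports.yaml, written by the prebuild step.
    print(eval_report.dataset, eval_report.dataset_label, eval_report.report_file)
```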
64 changes: 64 additions & 0 deletions home_assistant_datasets/tool/leaderboard/prebuild.py
@@ -0,0 +1,64 @@
"""Build all the assist eval reports needed to build the leaderboard.
```
usage: home-assistant-datasets leaderboard prebuild [-h] [--report-dir REPORT_DIR]
options:
-h, --help show this help message and exit
--report-dir REPORT_DIR
Specifies the report dataset directory created by `eval` commands
```
"""

import argparse
import logging
import pathlib
import subprocess
from typing import Any

import yaml

from .config import REPORT_DIR, DATASETS, IGNORE_REPORTS, REPORT_FILE, eval_reports

__all__ = []

_LOGGER = logging.getLogger(__name__)



EVAL_CMD = [
"home-assistant-datasets",
"assist",
"eval",
"--output_type=report",
]


def create_arguments(args: argparse.ArgumentParser) -> None:
"""Get parsed passed in arguments."""
args.add_argument(
"--report-dir",
type=str,
default=REPORT_DIR,
help="Specifies the report dataset directory created by `eval` commands",
)


def run(args: argparse.Namespace) -> int:
"""Run the command line action."""
report_dir = pathlib.Path(args.report_dir)

for eval_report in eval_reports(report_dir):
print(f"Generating report for outputs in {eval_report.directory}")
cmds = EVAL_CMD + [f"--model_output_dir={eval_report.directory}"]
_LOGGER.debug(cmds)
p = subprocess.Popen(cmds, stdout=subprocess.PIPE)
(report_output, _) = p.communicate()
if p.returncode:
return p.returncode

output_file = eval_report.report_file
output_file.write_bytes(report_output)
print(f"Writing {output_file}")

return 0
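Taken together with `build.py`, this gives a two-step flow: `prebuild` shells out to `home-assistant-datasets assist eval --output_type=report` for every report directory and writes its `reports.yaml`, then `build` aggregates those files into `reports/leaderboard.md`. A minimal sketch, assuming the `home-assistant-datasets` entry point named in `EVAL_CMD` is installed:

```python
# Two-step leaderboard refresh (sketch; assumes the CLI entry point is on PATH
# and is run from the repository root so the relative reports/ path resolves).
import subprocess

# 1. Write a reports.yaml into each eval output directory under reports/.
subprocess.run(["home-assistant-datasets", "leaderboard", "prebuild"], check=True)

# 2. Aggregate the reports.yaml files into reports/leaderboard.md.
subprocess.run(["home-assistant-datasets", "leaderboard", "build"], check=True)
```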
14 changes: 14 additions & 0 deletions reports/leaderboard.md
@@ -0,0 +1,14 @@
| Model | assist | assist-mini | intents|
| ----- | ----- | ----- | ----- |
| gemini-1.5-flash | 91.25% (+/- 3.16%) | 97.96% (+/- 2.02%) | 97.96% (+/- 2.02%) |
| gpt-4o-mini | 90.00% (+/- 3.35%) | 97.96% (+/- 2.02%) | 97.96% (+/- 2.02%) |
| gpt-4o | 87.50% (+/- 3.70%) | 0 | 0 |
| gpt-3.5 | 75.00% (+/- 4.84%) | 0 | 0 |
| functionary-small-v2.5 | 56.25% (+/- 5.55%) | 63.27% (+/- 6.89%) | 63.27% (+/- 6.89%) |
| llama3.1 | 45.57% (+/- 5.60%) | 83.67% (+/- 5.28%) | 83.67% (+/- 5.28%) |
| home-llm | 45.00% (+/- 5.56%) | 34.69% (+/- 6.80%) | 34.69% (+/- 6.80%) |
| assistant | 37.50% (+/- 5.41%) | 63.27% (+/- 6.89%) | 63.27% (+/- 6.89%) |
| xlam-7b | 25.00% (+/- 4.84%) | 85.71% (+/- 5.00%) | 85.71% (+/- 5.00%) |
| llama3-groq-tool-use | 20.00% (+/- 4.47%) | 51.02% (+/- 7.14%) | 51.02% (+/- 7.14%) |
| mistral-v3 | 3.75% (+/- 2.12%) | 2.04% (+/- 2.02%) | 2.04% (+/- 2.02%) |
| xlam-1b | 0 | 27.08% (+/- 6.41%) | 27.08% (+/- 6.41%) |
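
In the generated leaderboard, each cell is the best-scoring run `build.py` found for that model and dataset, formatted as `good% (+/- stddev%)` with the binomial error bar described above; a bare `0` means no (non-ignored) eval report exists yet for that model/dataset pair.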
