From 5602c728fc52b1b9ef54a028ca7522139f21ec8d Mon Sep 17 00:00:00 2001 From: Allen Porter Date: Tue, 6 Aug 2024 05:05:57 +0000 Subject: [PATCH] Update CI format --- .../tool/leaderboard/build.py | 6 +--- reports/README.md | 28 +++++++++---------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/home_assistant_datasets/tool/leaderboard/build.py b/home_assistant_datasets/tool/leaderboard/build.py index a7e076ce..18cc778d 100644 --- a/home_assistant_datasets/tool/leaderboard/build.py +++ b/home_assistant_datasets/tool/leaderboard/build.py @@ -149,19 +149,15 @@ def create_leaderboard_table( assert best_model_scores[first_model_id][dataset] num_samples = best_model_scores[first_model_id][dataset].total cols.append(f"{dataset} (n={num_samples})") - cols.append("95% CI / version") rows = [] for model_id, dataset_scores in best_model_scores.items(): row = [ model_id ] for dataset, best_record in dataset_scores.items(): if best_record.good_percent_value() != 0: - row.append( - f"{best_record.good_percent_value()*100:0.1f}%") ci = 1.96 * best_record.stddev*100 - row.append(f"+/- {ci:0.1f} / {best_record.dataset_label}") + row.append(f"{best_record.good_percent_value()*100:0.1f}%<br>CI: +/- {ci:0.1f} / {best_record.dataset_label}") else: row.append("") - row.append("") rows.append(row) return table.table(cols, rows) diff --git a/reports/README.md b/reports/README.md index d1283f27..681e91e9 100644 --- a/reports/README.md +++ b/reports/README.md @@ -1,18 +1,18 @@ # Home LLM Leaderboard -| Model | assist (n=80) | 95% CI / version | assist-mini (n=49) | 95% CI / version | intents (n=165) | 95% CI / version | -| --- | --- | --- | --- | --- | --- | --- | -| gemini-1.5-flash | 91.2% | +/- 6.2 / 2024.6.3 | 98.0% | +/- 4.0 / 2024.8.0dev | 63.0% | +/- 7.4 / 2024.8.0b | -| gpt-4o-mini | 90.0% | +/- 6.6 / 2024.8.0b | 98.0% | +/- 4.0 / 2024.8.0dev | 63.6% | +/- 7.3 / 2024.8.0b | -| gpt-4o | 87.5% | +/- 7.2 / 2024.6.3 | | | 81.2% | +/- 6.0 / 2024.6.3 | -| gpt-3.5 | 75.0% | +/- 9.5 / 2024.6.3 | | | 67.9% | +/- 7.1 / 2024.6.3 | -| functionary-small-v2.5 | 56.2% | +/- 10.9 / 2024.7.0 | 63.3% | +/- 13.5 / 2024.8.0dev | 37.6% | +/- 7.4 / 2024.6.3 | -| llama3.1 | 45.6% | +/- 11.0 / 2024.8.0b | 83.7% | +/- 10.3 / 2024.8.0b0 | 22.6% | +/- 6.4 / 2024.8.0b | -| home-llm | 45.0% | +/- 10.9 / 2024.6.3 | 34.7% | +/- 13.3 / 2024.8.0dev | 25.5% | +/- 6.6 / 2024.6.3 | -| assistant | 37.5% | +/- 10.6 / 2024.6.3 | 63.3% | +/- 13.5 / 2024.8.0dev | 98.8% | +/- 1.7 / 2024.6.3 | -| xlam-7b | 25.0% | +/- 9.5 / 2024.8.0b | 85.7% | +/- 9.8 / 2024.8.0b0 | | | -| llama3-groq-tool-use | 20.0% | +/- 8.8 / 2024.8.0b | 51.0% | +/- 14.0 / 2024.8.0b0 | 11.5% | +/- 4.9 / 2024.8.0b | -| mistral-v3 | 3.8% | +/- 4.2 / 2024.8.0b | 2.0% | +/- 4.0 / 2024.8.0dev | 10.3% | +/- 4.6 / 2024.8.0b | -| xlam-1b | | | 27.1% | +/- 12.6 / 2024.8.0b0 | | | +| Model | assist (n=80) | assist-mini (n=49) | intents (n=165) | +| --- | --- | --- | --- | +| gemini-1.5-flash | 91.2%<br>CI: +/- 6.2 / 2024.6.3 | 98.0%<br>CI: +/- 4.0 / 2024.8.0dev | 63.0%<br>CI: +/- 7.4 / 2024.8.0b | +| gpt-4o-mini | 90.0%<br>CI: +/- 6.6 / 2024.8.0b | 98.0%<br>CI: +/- 4.0 / 2024.8.0dev | 63.6%<br>CI: +/- 7.3 / 2024.8.0b | +| gpt-4o | 87.5%<br>CI: +/- 7.2 / 2024.6.3 | | 81.2%<br>CI: +/- 6.0 / 2024.6.3 | +| gpt-3.5 | 75.0%<br>CI: +/- 9.5 / 2024.6.3 | | 67.9%<br>CI: +/- 7.1 / 2024.6.3 | +| functionary-small-v2.5 | 56.2%<br>CI: +/- 10.9 / 2024.7.0 | 63.3%<br>CI: +/- 13.5 / 2024.8.0dev | 37.6%<br>CI: +/- 7.4 / 2024.6.3 | +| llama3.1 | 45.6%<br>CI: +/- 11.0 / 2024.8.0b | 83.7%<br>CI: +/- 10.3 / 2024.8.0b0 | 22.6%<br>CI: +/- 6.4 / 2024.8.0b | +| home-llm | 45.0%<br>CI: +/- 10.9 / 2024.6.3 | 34.7%<br>CI: +/- 13.3 / 2024.8.0dev | 25.5%<br>CI: +/- 6.6 / 2024.6.3 | +| assistant | 37.5%<br>CI: +/- 10.6 / 2024.6.3 | 63.3%<br>CI: +/- 13.5 / 2024.8.0dev | 98.8%<br>CI: +/- 1.7 / 2024.6.3 | +| xlam-7b | 25.0%<br>CI: +/- 9.5 / 2024.8.0b | 85.7%<br>CI: +/- 9.8 / 2024.8.0b0 | | +| llama3-groq-tool-use | 20.0%<br>CI: +/- 8.8 / 2024.8.0b | 51.0%<br>CI: +/- 14.0 / 2024.8.0b0 | 11.5%<br>CI: +/- 4.9 / 2024.8.0b | +| mistral-v3 | 3.8%<br>CI: +/- 4.2 / 2024.8.0b | 2.0%<br>CI: +/- 4.0 / 2024.8.0dev | 10.3%<br>CI: +/- 4.6 / 2024.8.0b | +| xlam-1b | | 27.1%<br>CI: +/- 12.6 / 2024.8.0b0 | | Implementation notes: - CI is large given small number of samples in the datasets.