diff --git a/home_assistant_datasets/tool/leaderboard/build.py b/home_assistant_datasets/tool/leaderboard/build.py index 5f214bc7..dd243b78 100644 --- a/home_assistant_datasets/tool/leaderboard/build.py +++ b/home_assistant_datasets/tool/leaderboard/build.py @@ -155,7 +155,7 @@ def create_leaderboard_table( for dataset, best_record in dataset_scores.items(): if best_record.good_percent_value() != 0: ci = 1.96 * best_record.stddev*100 - row.append(f"{best_record.good_percent_value()*100:0.1f}% CI: {ci:0.1f}% {best_record.dataset_label}") + row.append(f"{best_record.good_percent_value()*100:0.1f}% (CI: {ci:0.1f}%, {best_record.dataset_label})") else: row.append("") rows.append(row) diff --git a/reports/README.md b/reports/README.md index 548c17f6..7ddae542 100644 --- a/reports/README.md +++ b/reports/README.md @@ -1,18 +1,18 @@ # Home LLM Leaderboard | Model | assist (n=80) | assist-mini (n=49) | intents (n=165) | | --- | --- | --- | --- | -| gemini-1.5-flash | 91.2% CI: 6.2% 2024.6.3 | 98.0% CI: 4.0% 2024.8.0dev | 63.0% CI: 7.4% 2024.8.0b | -| gpt-4o-mini | 90.0% CI: 6.6% 2024.8.0b | 98.0% CI: 4.0% 2024.8.0dev | 63.6% CI: 7.3% 2024.8.0b | -| gpt-4o | 87.5% CI: 7.2% 2024.6.3 | | 81.2% CI: 6.0% 2024.6.3 | -| gpt-3.5 | 75.0% CI: 9.5% 2024.6.3 | | 67.9% CI: 7.1% 2024.6.3 | -| functionary-small-v2.5 | 56.2% CI: 10.9% 2024.7.0 | 63.3% CI: 13.5% 2024.8.0dev | 37.6% CI: 7.4% 2024.6.3 | -| llama3.1 | 45.6% CI: 11.0% 2024.8.0b | 83.7% CI: 10.3% 2024.8.0b0 | 22.6% CI: 6.4% 2024.8.0b | -| home-llm | 45.0% CI: 10.9% 2024.6.3 | 34.7% CI: 13.3% 2024.8.0dev | 25.5% CI: 6.6% 2024.6.3 | -| assistant | 37.5% CI: 10.6% 2024.6.3 | 63.3% CI: 13.5% 2024.8.0dev | 98.8% CI: 1.7% 2024.6.3 | -| xlam-7b | 25.0% CI: 9.5% 2024.8.0b | 85.7% CI: 9.8% 2024.8.0b0 | | -| llama3-groq-tool-use | 20.0% CI: 8.8% 2024.8.0b | 51.0% CI: 14.0% 2024.8.0b0 | 11.5% CI: 4.9% 2024.8.0b | -| mistral-v3 | 3.8% CI: 4.2% 2024.8.0b | 2.0% CI: 4.0% 2024.8.0dev | 10.3% CI: 4.6% 2024.8.0b | -| xlam-1b | | 27.1% CI: 12.6% 2024.8.0b0 | | +| gemini-1.5-flash | 91.2% (CI: 6.2%, 2024.6.3) | 98.0% (CI: 4.0%, 2024.8.0dev) | 63.0% (CI: 7.4%, 2024.8.0b) | +| gpt-4o-mini | 90.0% (CI: 6.6%, 2024.8.0b) | 98.0% (CI: 4.0%, 2024.8.0dev) | 63.6% (CI: 7.3%, 2024.8.0b) | +| gpt-4o | 87.5% (CI: 7.2%, 2024.6.3) | | 81.2% (CI: 6.0%, 2024.6.3) | +| gpt-3.5 | 75.0% (CI: 9.5%, 2024.6.3) | | 67.9% (CI: 7.1%, 2024.6.3) | +| functionary-small-v2.5 | 56.2% (CI: 10.9%, 2024.7.0) | 63.3% (CI: 13.5%, 2024.8.0dev) | 37.6% (CI: 7.4%, 2024.6.3) | +| llama3.1 | 45.6% (CI: 11.0%, 2024.8.0b) | 83.7% (CI: 10.3%, 2024.8.0b0) | 22.6% (CI: 6.4%, 2024.8.0b) | +| home-llm | 45.0% (CI: 10.9%, 2024.6.3) | 34.7% (CI: 13.3%, 2024.8.0dev) | 25.5% (CI: 6.6%, 2024.6.3) | +| assistant | 37.5% (CI: 10.6%, 2024.6.3) | 63.3% (CI: 13.5%, 2024.8.0dev) | 98.8% (CI: 1.7%, 2024.6.3) | +| xlam-7b | 25.0% (CI: 9.5%, 2024.8.0b) | 85.7% (CI: 9.8%, 2024.8.0b0) | | +| llama3-groq-tool-use | 20.0% (CI: 8.8%, 2024.8.0b) | 51.0% (CI: 14.0%, 2024.8.0b0) | 11.5% (CI: 4.9%, 2024.8.0b) | +| mistral-v3 | 3.8% (CI: 4.2%, 2024.8.0b) | 2.0% (CI: 4.0%, 2024.8.0dev) | 10.3% (CI: 4.6%, 2024.8.0b) | +| xlam-1b | | 27.1% (CI: 12.6%, 2024.8.0b0) | | Implementation notes: - CI is large given small number of samples in the datasets.