diff --git a/home_assistant_datasets/tool/leaderboard/build.py b/home_assistant_datasets/tool/leaderboard/build.py
index 5f214bc7..dd243b78 100644
--- a/home_assistant_datasets/tool/leaderboard/build.py
+++ b/home_assistant_datasets/tool/leaderboard/build.py
@@ -155,7 +155,7 @@ def create_leaderboard_table(
for dataset, best_record in dataset_scores.items():
if best_record.good_percent_value() != 0:
ci = 1.96 * best_record.stddev*100
- row.append(f"{best_record.good_percent_value()*100:0.1f}% CI: {ci:0.1f}% {best_record.dataset_label}")
+ row.append(f"{best_record.good_percent_value()*100:0.1f}% (CI: {ci:0.1f}%, {best_record.dataset_label})")
else:
row.append("")
rows.append(row)
diff --git a/reports/README.md b/reports/README.md
index 548c17f6..7ddae542 100644
--- a/reports/README.md
+++ b/reports/README.md
@@ -1,18 +1,18 @@
# Home LLM Leaderboard
| Model | assist (n=80) | assist-mini (n=49) | intents (n=165) |
| --- | --- | --- | --- |
-| gemini-1.5-flash | 91.2% CI: 6.2% 2024.6.3 | 98.0% CI: 4.0% 2024.8.0dev | 63.0% CI: 7.4% 2024.8.0b |
-| gpt-4o-mini | 90.0% CI: 6.6% 2024.8.0b | 98.0% CI: 4.0% 2024.8.0dev | 63.6% CI: 7.3% 2024.8.0b |
-| gpt-4o | 87.5% CI: 7.2% 2024.6.3 | | 81.2% CI: 6.0% 2024.6.3 |
-| gpt-3.5 | 75.0% CI: 9.5% 2024.6.3 | | 67.9% CI: 7.1% 2024.6.3 |
-| functionary-small-v2.5 | 56.2% CI: 10.9% 2024.7.0 | 63.3% CI: 13.5% 2024.8.0dev | 37.6% CI: 7.4% 2024.6.3 |
-| llama3.1 | 45.6% CI: 11.0% 2024.8.0b | 83.7% CI: 10.3% 2024.8.0b0 | 22.6% CI: 6.4% 2024.8.0b |
-| home-llm | 45.0% CI: 10.9% 2024.6.3 | 34.7% CI: 13.3% 2024.8.0dev | 25.5% CI: 6.6% 2024.6.3 |
-| assistant | 37.5% CI: 10.6% 2024.6.3 | 63.3% CI: 13.5% 2024.8.0dev | 98.8% CI: 1.7% 2024.6.3 |
-| xlam-7b | 25.0% CI: 9.5% 2024.8.0b | 85.7% CI: 9.8% 2024.8.0b0 | |
-| llama3-groq-tool-use | 20.0% CI: 8.8% 2024.8.0b | 51.0% CI: 14.0% 2024.8.0b0 | 11.5% CI: 4.9% 2024.8.0b |
-| mistral-v3 | 3.8% CI: 4.2% 2024.8.0b | 2.0% CI: 4.0% 2024.8.0dev | 10.3% CI: 4.6% 2024.8.0b |
-| xlam-1b | | 27.1% CI: 12.6% 2024.8.0b0 | |
+| gemini-1.5-flash | 91.2% (CI: 6.2%, 2024.6.3) | 98.0% (CI: 4.0%, 2024.8.0dev) | 63.0% (CI: 7.4%, 2024.8.0b) |
+| gpt-4o-mini | 90.0% (CI: 6.6%, 2024.8.0b) | 98.0% (CI: 4.0%, 2024.8.0dev) | 63.6% (CI: 7.3%, 2024.8.0b) |
+| gpt-4o | 87.5% (CI: 7.2%, 2024.6.3) | | 81.2% (CI: 6.0%, 2024.6.3) |
+| gpt-3.5 | 75.0% (CI: 9.5%, 2024.6.3) | | 67.9% (CI: 7.1%, 2024.6.3) |
+| functionary-small-v2.5 | 56.2% (CI: 10.9%, 2024.7.0) | 63.3% (CI: 13.5%, 2024.8.0dev) | 37.6% (CI: 7.4%, 2024.6.3) |
+| llama3.1 | 45.6% (CI: 11.0%, 2024.8.0b) | 83.7% (CI: 10.3%, 2024.8.0b0) | 22.6% (CI: 6.4%, 2024.8.0b) |
+| home-llm | 45.0% (CI: 10.9%, 2024.6.3) | 34.7% (CI: 13.3%, 2024.8.0dev) | 25.5% (CI: 6.6%, 2024.6.3) |
+| assistant | 37.5% (CI: 10.6%, 2024.6.3) | 63.3% (CI: 13.5%, 2024.8.0dev) | 98.8% (CI: 1.7%, 2024.6.3) |
+| xlam-7b | 25.0% (CI: 9.5%, 2024.8.0b) | 85.7% (CI: 9.8%, 2024.8.0b0) | |
+| llama3-groq-tool-use | 20.0% (CI: 8.8%, 2024.8.0b) | 51.0% (CI: 14.0%, 2024.8.0b0) | 11.5% (CI: 4.9%, 2024.8.0b) |
+| mistral-v3 | 3.8% (CI: 4.2%, 2024.8.0b) | 2.0% (CI: 4.0%, 2024.8.0dev) | 10.3% (CI: 4.6%, 2024.8.0b) |
+| xlam-1b | | 27.1% (CI: 12.6%, 2024.8.0b0) | |
Implementation notes:
- CI is large given small number of samples in the datasets.