Update CI format

allenporter · Aug 6, 2024 · 5602c72 · 5602c72
1 parent 2b6804c
commit 5602c72
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 19 deletions.
diff --git a/home_assistant_datasets/tool/leaderboard/build.py b/home_assistant_datasets/tool/leaderboard/build.py
@@ -149,19 +149,15 @@ def create_leaderboard_table(
         assert best_model_scores[first_model_id][dataset]
         num_samples = best_model_scores[first_model_id][dataset].total
         cols.append(f"{dataset} (n={num_samples})")
-        cols.append("95% CI / version")
     rows = []
     for model_id, dataset_scores in best_model_scores.items():
         row = [ model_id ]
         for dataset, best_record in dataset_scores.items():
             if best_record.good_percent_value() != 0:
-                row.append(
-                    f"{best_record.good_percent_value()*100:0.1f}%")
                 ci = 1.96 * best_record.stddev*100
-                row.append(f"+/- {ci:0.1f} / {best_record.dataset_label}")
+                row.append(f"{best_record.good_percent_value()*100:0.1f}%<br>CI: +/- {ci:0.1f} / {best_record.dataset_label}")
             else:
                 row.append("")
-                row.append("")
         rows.append(row)
     return table.table(cols, rows)
 

diff --git a/reports/README.md b/reports/README.md
@@ -1,18 +1,18 @@
 # Home LLM Leaderboard
-| Model | assist (n=80) | 95% CI / version | assist-mini (n=49) | 95% CI / version | intents (n=165) | 95% CI / version |
-| --- | --- | --- | --- | --- | --- | --- |
-| gemini-1.5-flash | 91.2% | +/- 6.2 / 2024.6.3 | 98.0% | +/- 4.0 / 2024.8.0dev | 63.0% | +/- 7.4 / 2024.8.0b |
-| gpt-4o-mini | 90.0% | +/- 6.6 / 2024.8.0b | 98.0% | +/- 4.0 / 2024.8.0dev | 63.6% | +/- 7.3 / 2024.8.0b |
-| gpt-4o | 87.5% | +/- 7.2 / 2024.6.3 |  |  | 81.2% | +/- 6.0 / 2024.6.3 |
-| gpt-3.5 | 75.0% | +/- 9.5 / 2024.6.3 |  |  | 67.9% | +/- 7.1 / 2024.6.3 |
-| functionary-small-v2.5 | 56.2% | +/- 10.9 / 2024.7.0 | 63.3% | +/- 13.5 / 2024.8.0dev | 37.6% | +/- 7.4 / 2024.6.3 |
-| llama3.1 | 45.6% | +/- 11.0 / 2024.8.0b | 83.7% | +/- 10.3 / 2024.8.0b0 | 22.6% | +/- 6.4 / 2024.8.0b |
-| home-llm | 45.0% | +/- 10.9 / 2024.6.3 | 34.7% | +/- 13.3 / 2024.8.0dev | 25.5% | +/- 6.6 / 2024.6.3 |
-| assistant | 37.5% | +/- 10.6 / 2024.6.3 | 63.3% | +/- 13.5 / 2024.8.0dev | 98.8% | +/- 1.7 / 2024.6.3 |
-| xlam-7b | 25.0% | +/- 9.5 / 2024.8.0b | 85.7% | +/- 9.8 / 2024.8.0b0 |  |  |
-| llama3-groq-tool-use | 20.0% | +/- 8.8 / 2024.8.0b | 51.0% | +/- 14.0 / 2024.8.0b0 | 11.5% | +/- 4.9 / 2024.8.0b |
-| mistral-v3 | 3.8% | +/- 4.2 / 2024.8.0b | 2.0% | +/- 4.0 / 2024.8.0dev | 10.3% | +/- 4.6 / 2024.8.0b |
-| xlam-1b |  |  | 27.1% | +/- 12.6 / 2024.8.0b0 |  |  |
+| Model | assist (n=80) | assist-mini (n=49) | intents (n=165) |
+| --- | --- | --- | --- |
+| gemini-1.5-flash | 91.2%<br>CI: +/- 6.2 / 2024.6.3 | 98.0%<br>CI: +/- 4.0 / 2024.8.0dev | 63.0%<br>CI: +/- 7.4 / 2024.8.0b |
+| gpt-4o-mini | 90.0%<br>CI: +/- 6.6 / 2024.8.0b | 98.0%<br>CI: +/- 4.0 / 2024.8.0dev | 63.6%<br>CI: +/- 7.3 / 2024.8.0b |
+| gpt-4o | 87.5%<br>CI: +/- 7.2 / 2024.6.3 |  | 81.2%<br>CI: +/- 6.0 / 2024.6.3 |
+| gpt-3.5 | 75.0%<br>CI: +/- 9.5 / 2024.6.3 |  | 67.9%<br>CI: +/- 7.1 / 2024.6.3 |
+| functionary-small-v2.5 | 56.2%<br>CI: +/- 10.9 / 2024.7.0 | 63.3%<br>CI: +/- 13.5 / 2024.8.0dev | 37.6%<br>CI: +/- 7.4 / 2024.6.3 |
+| llama3.1 | 45.6%<br>CI: +/- 11.0 / 2024.8.0b | 83.7%<br>CI: +/- 10.3 / 2024.8.0b0 | 22.6%<br>CI: +/- 6.4 / 2024.8.0b |
+| home-llm | 45.0%<br>CI: +/- 10.9 / 2024.6.3 | 34.7%<br>CI: +/- 13.3 / 2024.8.0dev | 25.5%<br>CI: +/- 6.6 / 2024.6.3 |
+| assistant | 37.5%<br>CI: +/- 10.6 / 2024.6.3 | 63.3%<br>CI: +/- 13.5 / 2024.8.0dev | 98.8%<br>CI: +/- 1.7 / 2024.6.3 |
+| xlam-7b | 25.0%<br>CI: +/- 9.5 / 2024.8.0b | 85.7%<br>CI: +/- 9.8 / 2024.8.0b0 |  |
+| llama3-groq-tool-use | 20.0%<br>CI: +/- 8.8 / 2024.8.0b | 51.0%<br>CI: +/- 14.0 / 2024.8.0b0 | 11.5%<br>CI: +/- 4.9 / 2024.8.0b |
+| mistral-v3 | 3.8%<br>CI: +/- 4.2 / 2024.8.0b | 2.0%<br>CI: +/- 4.0 / 2024.8.0dev | 10.3%<br>CI: +/- 4.6 / 2024.8.0b |
+| xlam-1b |  | 27.1%<br>CI: +/- 12.6 / 2024.8.0b0 |  |
 
 Implementation notes:
 - The CI is large given the small number of samples in the datasets.