Better UI for small datasets, or datasets with no variation. Defaulting to 0 wasn't sound.
Kiln-AI · Feb 28, 2025 · 9e31b8c · 9e31b8c
1 parent 7f19ffe
commit 9e31b8c
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 34 deletions.
diff --git a/app/desktop/studio_server/correlation_calculator.py b/app/desktop/studio_server/correlation_calculator.py
@@ -19,9 +19,9 @@ class CorrelationResult:
     mean_normalized_absolute_error: float
     mean_squared_error: float
     mean_normalized_squared_error: float
-    spearman_correlation: float
-    pearson_correlation: float
-    kendalltau_correlation: float
+    spearman_correlation: float | None
+    pearson_correlation: float | None
+    kendalltau_correlation: float | None
 
 
 class CorrelationCalculator:
@@ -71,40 +71,40 @@ def calculate_mean_normalized_squared_error(self) -> float:
         )
         return total_normalized_squared_error / len(self.scores)
 
-    def calculate_spearman_correlation(self) -> float:
+    def calculate_spearman_correlation(self) -> float | None:
         if len(self.scores) < 2:
-            # If there is only one pair, return 0 = no correlation
-            return 0
+            # Fewer than two pairs: correlation is undefined
+            return None
         x = [score.measured_score for score in self.scores]
         y = [score.human_score for score in self.scores]
         result = stats.spearmanr(x, y)
         # library doesn't support proper types
         correlation = result.__getattribute__("correlation")
         if math.isnan(correlation) or not isinstance(correlation, float):
             # Very small samples may have a NaN result (unknown correlation)
-            return 0
+            return None
         return correlation
 
-    def calculate_pearson_correlation(self) -> float:
+    def calculate_pearson_correlation(self) -> float | None:
         if len(self.scores) < 2:
-            # If there is only one pair, return 0 = no correlation
-            return 0
+            # Fewer than two pairs: correlation is undefined
+            return None
         x = [score.measured_score for score in self.scores]
         y = [score.human_score for score in self.scores]
         result = stats.pearsonr(x, y)
         if math.isnan(result.correlation):
             # Very small samples may have a NaN result (unknown correlation)
-            return 0
+            return None
         return result.correlation
 
-    def calculate_kendalltau_correlation(self) -> float:
+    def calculate_kendalltau_correlation(self) -> float | None:
         if len(self.scores) < 2:
-            # If there is only one pair, return 0 = no correlation
-            return 0
+            # Fewer than two pairs: correlation is undefined
+            return None
         x = [score.measured_score for score in self.scores]
         y = [score.human_score for score in self.scores]
         result = stats.kendalltau(x, y)
         if math.isnan(result.correlation):
             # Very small samples may have a NaN result (unknown correlation)
-            return 0
+            return None
         return result.correlation
diff --git a/app/desktop/studio_server/test_correlation_calculator.py b/app/desktop/studio_server/test_correlation_calculator.py
@@ -194,9 +194,9 @@ def test_single_data_point(self, single_data_point):
         assert result.mean_normalized_absolute_error == 0.0
         assert result.mean_squared_error == 0.0
         assert result.mean_normalized_squared_error == 0.0
-        assert result.spearman_correlation == 0.0
-        assert result.pearson_correlation == 0.0
-        assert result.kendalltau_correlation == 0.0
+        assert result.spearman_correlation is None
+        assert result.pearson_correlation is None
+        assert result.kendalltau_correlation is None
 
     def test_two_data_points(self, two_data_points):
         """Test correlation calculations with two data points"""

diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py
@@ -925,18 +925,18 @@ class EvalCondigSummaryTestData:
             "mean_absolute_error": 4.0,  # error 4.0
             "mean_normalized_squared_error": 1,  # max error: 1 v 5
             "mean_normalized_absolute_error": 1,  # max error: 1 v 5
-            "spearman_correlation": 0,  # default value for 1 pair
-            "pearson_correlation": 0,
-            "kendalltau_correlation": 0,
+            "spearman_correlation": None,  # Not enough data
+            "pearson_correlation": None,
+            "kendalltau_correlation": None,
         },
         "score1": {
             "mean_squared_error": 2.25,  # error (3.5-5.0)^2
             "mean_absolute_error": 1.5,  # error 1.5
             "mean_normalized_squared_error": 0.140625,  # hand calc
             "mean_normalized_absolute_error": 0.375,  # 1.5/4
-            "spearman_correlation": 0,  # default value for 1 pair
-            "pearson_correlation": 0,
-            "kendalltau_correlation": 0,
+            "spearman_correlation": None,  # Not enough data
+            "pearson_correlation": None,  # Not enough data
+            "kendalltau_correlation": None,  # Not enough data
         },
     }
     # 1 of total_in_dataset eval configs are in ec1 test
@@ -949,9 +949,9 @@ class EvalCondigSummaryTestData:
             "mean_absolute_error": 1.5,  # (1+2)/2
             "mean_normalized_squared_error": 0.15625,  # (0.25^2 + 0.5^2) / 2
             "mean_normalized_absolute_error": 0.375,  # (0.25 + 0.5) / 2
-            "spearman_correlation": 0,
-            "pearson_correlation": 0,
-            "kendalltau_correlation": 0,
+            "spearman_correlation": None,
+            "pearson_correlation": None,
+            "kendalltau_correlation": None,
         },
         "score1": {
             "mean_squared_error": 2.5,  # (1^2+2^2)/2
@@ -973,9 +973,9 @@ class EvalCondigSummaryTestData:
             "mean_absolute_error": 2,
             "mean_normalized_squared_error": 0.25,
             "mean_normalized_absolute_error": 0.5,
-            "spearman_correlation": 0,
-            "pearson_correlation": 0,
-            "kendalltau_correlation": 0,
+            "spearman_correlation": None,
+            "pearson_correlation": None,
+            "kendalltau_correlation": None,
         },
     }
     # 2 of total_in_dataset eval configs are in ec2 test

diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte
@@ -518,11 +518,17 @@
                         {:else if score_type === "norm_mae"}
                           {scores.mean_normalized_absolute_error.toFixed(3)}
                         {:else if score_type === "spearman"}
-                          {scores.spearman_correlation.toFixed(3)}
+                          {scores.spearman_correlation
+                            ? scores.spearman_correlation.toFixed(3)
+                            : "N/A"}
                         {:else if score_type === "pearson"}
-                          {scores.pearson_correlation.toFixed(3)}
+                          {scores.pearson_correlation
+                            ? scores.pearson_correlation.toFixed(3)
+                            : "N/A"}
                         {:else if score_type === "kendalltau"}
-                          {scores.kendalltau_correlation.toFixed(3)}
+                          {scores.kendalltau_correlation
+                            ? scores.kendalltau_correlation.toFixed(3)
+                            : "N/A"}
                         {/if}
                       {:else}
                         unknown
@@ -593,7 +599,8 @@
     These are three scientific correlation coefficients. For all three, the
     value tends to be high (close to 1) for samples with a strongly positive
     correlation, low (close to -1) for samples with a strongly negative
-    correlation, and close to zero for samples with weak correlation.
+    correlation, and close to zero for samples with weak correlation. Scores may
+    be 'N/A' if there are too few samples or not enough variation in scores.
   </div>
   <ul class="list-disc text-sm text-gray-500 pl-5 pt-2">
     <li>