Working version of error stats on trial index view.

empirical-org · dandrabik · Feb 5, 2025 · Jan 8, 2025 · Jan 10, 2025 · Jan 10, 2025
commit fc2ac784702d2d27dbe6f41d32120dce3c09bce3
diff --git a/...p/bundles/Staff/components/evidence/llmPromptDatasets/individualDataset/trialsSection.tsx b/...p/bundles/Staff/components/evidence/llmPromptDatasets/individualDataset/trialsSection.tsx
@@ -104,6 +104,14 @@ const TrialsSection = ({ trials, datasetPath, }: { trials: TrialInterface[], dat
       rowSectionClassName: 'center-content allow-wrap',
       noTooltip: true
     },
+    {
+      name: 'Feedback Error Rate',
+      attribute: 'feedbackErrorRate',
+      width: '64px',
+      headerClassName: 'center-content',
+      rowSectionClassName: 'center-content allow-wrap',
+      noTooltip: true
+    },
     {
       name: 'LLM',
       attribute: 'llmVersion',
@@ -151,7 +159,7 @@ const TrialsSection = ({ trials, datasetPath, }: { trials: TrialInterface[], dat
   ]
 
   const rows = () => trials.map(trial => {
-    const { number, created_at, temperature, optimal_correct, optimal_count, suboptimal_correct, suboptimal_count, average_g_eval_score, status, id, notes, llm_version, llm_prompt, } = trial
+    const { number, created_at, temperature, optimal_correct, optimal_count, suboptimal_correct, suboptimal_count, average_g_eval_score, status, id, notes, llm_version, llm_prompt, evaluator_failure_count, evaluator_total_count } = trial
     const { name, optimal_examples_count, suboptimal_examples_count, guidelines_count, } = llm_prompt
 
     let compareCheckbox = <button aria-label="Unchecked checkbox" className="quill-checkbox unselected" onClick={() => toggleTrialSelection(id)} type="button" />
@@ -173,6 +181,7 @@ const TrialsSection = ({ trials, datasetPath, }: { trials: TrialInterface[], dat
       guidelinesCount: guidelines_count,
       optimalAccuracy: percentAccuracy(optimal_correct, optimal_count),
       suboptimalAccuracy: percentAccuracy(suboptimal_correct, suboptimal_count),
+      feedbackErrorRate: evaluator_failure_count == null ? null : percentAccuracy(evaluator_failure_count, evaluator_total_count), // `== null` also covers undefined: the field is optional on TrialInterface
       llmVersion: llm_version,
       averageGEvalScore: average_g_eval_score,
       notes,

diff --git a/services/QuillLMS/client/app/bundles/Staff/interfaces/evidenceInterfaces.ts b/services/QuillLMS/client/app/bundles/Staff/interfaces/evidenceInterfaces.ts
@@ -211,6 +211,8 @@ export interface TrialInterface {
   optimal_count: number;
   suboptimal_correct: number;
   suboptimal_count: number;
+  evaluator_failure_count?: number;
+  evaluator_total_count?: number;
   average_g_eval_score: number;
   llm_version: number;
   llm_prompt?: LLMPromptInterface;

diff --git a/services/QuillLMS/engines/evidence/app/models/evidence/research/gen_ai/trial.rb b/services/QuillLMS/engines/evidence/app/models/evidence/research/gen_ai/trial.rb
@@ -85,7 +85,7 @@ def serializable_hash(options = nil)
           options ||= {}
           super(options.reverse_merge(
             include: [:llm_prompt],
-            methods: [:average_g_eval_score, :optimal_correct, :optimal_count, :suboptimal_correct, :suboptimal_count, :llm_version, :vendor, :test_examples_count]
+            methods: [:average_g_eval_score, :optimal_correct, :optimal_count, :suboptimal_correct, :suboptimal_count, :llm_version, :vendor, :test_examples_count, :evaluator_failure_count, :evaluator_total_count]
           ))
         end
 

diff --git a/services/QuillLMS/engines/evidence/app/views/evidence/research/gen_ai/datasets/show.html.erb b/services/QuillLMS/engines/evidence/app/views/evidence/research/gen_ai/datasets/show.html.erb
@@ -88,6 +88,7 @@
             <th>Optimal Accuracy</th>
             <th>Suboptimal Accuracy</th>
             <th>Weighted Accuracy</th>
+            <th>Feedback Error Rate</th>
             <th>Model</th>
             <% if @dataset.generative? %>
               <th>GEval Average</th>
@@ -119,6 +120,7 @@
               <td><%= percent_accuracy(trial.optimal_correct, trial.optimal_count) %></td>
               <td><%= percent_accuracy(trial.suboptimal_correct, trial.suboptimal_count) %></td>
               <td><%= trial.weighted_accuracy&.round(5) %></td>
+              <td><%= percent_accuracy(trial.evaluator_failure_count, trial.evaluator_total_count) if trial.evaluator_failure_count %></td>
               <td><%= trial.llm.version %></td>
               <% if @dataset.generative? %>
                 <td><%= trial.average_g_eval_score %></td>