Evidence - FeedbackEvaluator Storage & Basic UI #12758

Open. Wants to merge 41 commits into base branch develop from feedback_evaluator_storage.

Commits (41)
55bd451  wip (dandrabik, Jan 8, 2025)
c391c58  Wip (dandrabik, Jan 10, 2025)
b16de96  Working version of rule trial. (dandrabik, Jan 10, 2025)
58c4c2f  Extract examples out of markdown and into code to make easier to edit. (dandrabik, Jan 10, 2025)
4d6ab0a  Add String extensions to count questions. (dandrabik, Jan 13, 2025)
5d2c9a6  WIP for regex version of prompt checker. (dandrabik, Jan 13, 2025)
6c62072  Add some initial regex checks. (dandrabik, Jan 15, 2025)
971d6a8  wip (dandrabik, Jan 17, 2025)
a75efd8  Prompt that flags a good number of examples. (dandrabik, Jan 17, 2025)
071fbd3  More iteration. (dandrabik, Jan 17, 2025)
96b6148  Remove unused code for now. (dandrabik, Jan 17, 2025)
48109ad  Extract concern from base class. (dandrabik, Jan 17, 2025)
611e922  Move more out of the base class. (dandrabik, Jan 17, 2025)
724178c  Clean up unused files. (dandrabik, Jan 19, 2025)
c11e6fc  Add new data files. (dandrabik, Jan 22, 2025)
3b0db29  Update Verbose checker to be looser. (dandrabik, Jan 22, 2025)
9f24bcd  Add more datasets (dandrabik, Jan 22, 2025)
804a6ec  Spec wip. (dandrabik, Jan 22, 2025)
9959c1f  Fix specs, update Scalpel code for more edge cases. (dandrabik, Jan 23, 2025)
64d12a1  Add some basic tests. (dandrabik, Jan 23, 2025)
9c6b146  Lint (dandrabik, Jan 23, 2025)
cd41170  Lint and small refactors. (dandrabik, Jan 23, 2025)
2d8be04  Rename Scalpel to better name since I’ve edited it a bunch already. (dandrabik, Jan 23, 2025)
0647b7e  Lint. (dandrabik, Jan 23, 2025)
26e5123  Don’t modify String from an Engine (rethinking this is bad form). (dandrabik, Jan 23, 2025)
9fbbe43  Lint. (dandrabik, Jan 23, 2025)
fae954f  Fix script, delete unused file, lint. (dandrabik, Jan 23, 2025)
574f4b5  Code Cleanup. (dandrabik, Jan 23, 2025)
1d9e902  Initial working version of Feedback Evaluation. (dandrabik, Jan 30, 2025)
79a72e3  Update controller endpoints for frontend use. (dandrabik, Jan 30, 2025)
77183a5  Merge branch 'develop' into feedback_evaluator_storage (dandrabik, Jan 30, 2025)
e209a02  Whitespace to trigger build. (dandrabik, Jan 31, 2025)
9ec3bb7  Add creator spec. (dandrabik, Jan 31, 2025)
2c0dc02  Add backstop tests. (dandrabik, Jan 31, 2025)
d2f6308  Self review cleanup. (dandrabik, Jan 31, 2025)
a896d1e  Clean up schema. (dandrabik, Jan 31, 2025)
4121cf2  Rename method. (dandrabik, Jan 31, 2025)
1841341  Merge branch 'develop' into feedback_evaluator_storage (dandrabik, Jan 31, 2025)
fc2ac78  Working version of error stats on trial index view. (dandrabik, Jan 31, 2025)
b930449  Working version of showing errors. (dandrabik, Jan 31, 2025)
3854f19  Add basic flag to llm_example. (dandrabik, Jan 31, 2025)
--- TrialsSection component ---
@@ -104,6 +104,14 @@ const TrialsSection = ({ trials, datasetPath, }: { trials: TrialInterface[], dat
rowSectionClassName: 'center-content allow-wrap',
noTooltip: true
},
{
name: 'Feedback Error Rate',
attribute: 'feedbackErrorRate',
width: '64px',
headerClassName: 'center-content',
rowSectionClassName: 'center-content allow-wrap',
noTooltip: true
},
{
name: 'LLM',
attribute: 'llmVersion',
@@ -151,7 +159,7 @@ const TrialsSection = ({ trials, datasetPath, }: { trials: TrialInterface[], dat
]

const rows = () => trials.map(trial => {
const { number, created_at, temperature, optimal_correct, optimal_count, suboptimal_correct, suboptimal_count, average_g_eval_score, status, id, notes, llm_version, llm_prompt, } = trial
const { number, created_at, temperature, optimal_correct, optimal_count, suboptimal_correct, suboptimal_count, average_g_eval_score, status, id, notes, llm_version, llm_prompt, evaluator_failure_count, evaluator_total_count } = trial
const { name, optimal_examples_count, suboptimal_examples_count, guidelines_count, } = llm_prompt

let compareCheckbox = <button aria-label="Unchecked checkbox" className="quill-checkbox unselected" onClick={() => toggleTrialSelection(id)} type="button" />
@@ -173,6 +181,7 @@ const TrialsSection = ({ trials, datasetPath, }: { trials: TrialInterface[], dat
guidelinesCount: guidelines_count,
optimalAccuracy: percentAccuracy(optimal_correct, optimal_count),
suboptimalAccuracy: percentAccuracy(suboptimal_correct, suboptimal_count),
feedbackErrorRate: evaluator_failure_count === null ? null : percentAccuracy(evaluator_failure_count, evaluator_total_count),
llmVersion: llm_version,
averageGEvalScore: average_g_eval_score,
notes,
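Note on the new column above: the frontend reuses percentAccuracy to turn the evaluator counts into a rate, and the null guard keeps the cell empty for trials that predate feedback evaluation. A minimal Ruby sketch of the same computation, assuming percentAccuracy is simply numerator over denominator expressed as a percentage (the helper's definition is not part of this diff):

# Hypothetical mirror of the frontend's feedbackErrorRate logic.
def feedback_error_rate(failure_count, total_count)
  return nil if failure_count.nil?      # trial ran before evaluations existed
  return nil if total_count.to_i.zero?  # avoid division by zero

  (failure_count.to_f / total_count * 100).round(1)
end

feedback_error_rate(nil, 50) # => nil (column renders empty)
feedback_error_rate(7, 50)   # => 14.0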
--- EvaluationSection component ---
@@ -3,7 +3,7 @@ import * as React from "react";
import { DataTable, Spinner, DropdownInput, LightButtonLoadingSpinner, } from '../../../../../Shared/index';
import { useQuery, useQueryClient } from 'react-query';

import { llmExampleMatchColor, matched, ALL, MATCHED, NOT_MATCHED, CHECKED, filterOptions, } from '../shared/sharedFunctions'
import { llmExampleMatchColor, matched, ALL, MATCHED, NOT_MATCHED, CHECKED, DARK_RED, filterOptions, } from '../shared/sharedFunctions'
import { LLMExampleInterface, DatasetInterface, TrialInterface, } from '../../../../interfaces/evidenceInterfaces';
import { CLASSIFICATION, } from '../../../../../../constants/evidence'
import { fetchLLMExamplesForTrial, createDataSubset, } from '../../../../utils/evidence/genAIAPIs';
@@ -16,6 +16,7 @@ const CURRICULUM_FEEDBACK = 'curriculumFeedback'
const LLM_FEEDBACK = 'llmFeedback'
const CURRICULUM_LABEL = 'curriculumLabel'
const LLM_LABEL = 'llmLabel'
const FEEDBACK_ERRORS = 'feedbackErrors'

const attributeToWidth = {
[MATCH]: '40px',
@@ -25,6 +26,8 @@ const attributeToWidth = {
[STUDENT_RESPONSE]: '300px',
[CURRICULUM_LABEL]: '300px',
[LLM_LABEL]: '300px',
[FEEDBACK_ERRORS]: '100px',

}

const matchHeader = {
@@ -81,6 +84,13 @@ const generativeTableHeaders = [
rowSectionClassName: 'allow-wrap',
noTooltip: true
},
{
name: 'Feedback Errors',
attribute: FEEDBACK_ERRORS,
width: attributeToWidth[FEEDBACK_ERRORS],
rowSectionClassName: 'allow-wrap',
noTooltip: true
},
]

// we are passing the datasetId and trialId props as required, and the dataset and trial as optional, because we want to start loading the `LLMExamples` at the same time as the records for the trial itself
@@ -175,7 +185,7 @@ const EvaluationSection = ({ dataset, trial, datasetId, trialId, }: EvaluationSe
function handleSetFilter(e) { setFilter(e.value) }

const llmExampleRows = () => filteredLLMExamples().map((llmExample, index) => {
const { test_example, test_example_id, llm_feedback, rag_label, } = llmExample
const { test_example, test_example_id, llm_feedback, rag_label, feedback_errors, } = llmExample
const { student_response, curriculum_proposed_feedback, } = test_example

const gEval = trial.results.g_evals && Object.keys(trial.results.g_evals).map((gEvalId) => parseInt(trial.results.g_evals[gEvalId][index]) || '')
@@ -186,6 +196,7 @@ const EvaluationSection = ({ dataset, trial, datasetId, trialId, }: EvaluationSe
gEval,
curriculumFeedback: curriculum_proposed_feedback,
llmFeedback: llm_feedback,
feedbackErrors: <div className={DARK_RED}>{feedback_errors}</div>,
studentResponse: student_response,
curriculumLabel: test_example.rag_label,
llmLabel: rag_label,
--- evidenceInterfaces (TrialInterface) ---
@@ -211,6 +211,8 @@ export interface TrialInterface {
optimal_count: number;
suboptimal_correct: number;
suboptimal_count: number;
evaluator_failure_count?: number;
evaluator_total_count?: number;
average_g_eval_score: number;
llm_version: number;
llm_prompt?: LLMPromptInterface;
--- New migration: CreateResearchGenAIFeedbackEvaluations ---
@@ -0,0 +1,19 @@
# frozen_string_literal: true

# This migration comes from evidence (originally 20250123202558)
class CreateResearchGenAIFeedbackEvaluations < ActiveRecord::Migration[7.1]
def change
create_table :evidence_research_gen_ai_feedback_evaluations do |t|
t.integer :trial_id, null: false
t.integer :version, null: false
t.integer :llm_example_id, null: false
t.boolean :optimal, null: false
t.string :errored_checks, array: true, default: []

t.timestamps
end

add_index :evidence_research_gen_ai_feedback_evaluations, :trial_id
add_index :evidence_research_gen_ai_feedback_evaluations, :llm_example_id
end
end
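Since errored_checks is a Postgres string array with a default of [], each evaluation row can record zero or more failed check names without a join table. A rough usage sketch, assuming a trial and llm_example are in scope; the check names here are invented for illustration:

evaluation = Evidence::Research::GenAI::FeedbackEvaluation.create!(
  trial_id: trial.id,
  llm_example_id: llm_example.id,
  version: 1,
  optimal: false,
  errored_checks: ['too_many_questions', 'verbose'] # hypothetical check names
)

evaluation.errored_checks # => ["too_many_questions", "verbose"]
Evidence::Research::GenAI::FeedbackEvaluation.new.errored_checks # => [] via the column default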
--- services/QuillLMS/db/structure.sql (108 additions, 7 deletions) ---
@@ -145,7 +145,7 @@ CREATE FUNCTION public.timespent_question(act_sess integer, question character v
item timestamp;
BEGIN
SELECT created_at INTO as_created_at FROM activity_sessions WHERE id = act_sess;

-- backward compatibility block
IF as_created_at IS NULL OR as_created_at < timestamp '2013-08-25 00:00:00.000000' THEN
SELECT SUM(
@@ -160,11 +160,11 @@ CREATE FUNCTION public.timespent_question(act_sess integer, question character v
'epoch' FROM (activity_sessions.completed_at - activity_sessions.started_at)
)
END) INTO time_spent FROM activity_sessions WHERE id = act_sess AND state='finished';

RETURN COALESCE(time_spent,0);
END IF;


first_item := NULL;
last_item := NULL;
max_item := NULL;
@@ -188,11 +188,11 @@ CREATE FUNCTION public.timespent_question(act_sess integer, question character v

END IF;
END LOOP;

IF max_item IS NOT NULL AND first_item IS NOT NULL THEN
time_spent := time_spent + EXTRACT( EPOCH FROM max_item - first_item );
END IF;

RETURN time_spent;
END;
$$;
@@ -207,7 +207,7 @@ CREATE FUNCTION public.timespent_student(student integer) RETURNS bigint
AS $$
SELECT COALESCE(SUM(time_spent),0) FROM (
SELECT id,timespent_activity_session(id) AS time_spent FROM activity_sessions
WHERE activity_sessions.user_id = student
WHERE activity_sessions.user_id = student
GROUP BY id) as as_ids;

$$;
@@ -2791,6 +2791,41 @@ CREATE SEQUENCE public.evidence_automl_models_id_seq
ALTER SEQUENCE public.evidence_automl_models_id_seq OWNED BY public.evidence_automl_models.id;


--
-- Name: evidence_gen_ai_feedback_evaluation; Type: TABLE; Schema: public; Owner: -
--

CREATE TABLE public.evidence_gen_ai_feedback_evaluation (
id bigint NOT NULL,
trial_id integer NOT NULL,
version integer NOT NULL,
llm_example_id integer NOT NULL,
optimal boolean NOT NULL,
rules character varying[] DEFAULT '{}'::character varying[],
created_at timestamp(6) without time zone NOT NULL,
updated_at timestamp(6) without time zone NOT NULL
);


--
-- Name: evidence_gen_ai_feedback_evaluation_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

CREATE SEQUENCE public.evidence_gen_ai_feedback_evaluation_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;


--
-- Name: evidence_gen_ai_feedback_evaluation_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--

ALTER SEQUENCE public.evidence_gen_ai_feedback_evaluation_id_seq OWNED BY public.evidence_gen_ai_feedback_evaluation.id;


--
-- Name: evidence_gen_ai_highlight_groups; Type: TABLE; Schema: public; Owner: -
--
@@ -3243,6 +3278,41 @@ CREATE SEQUENCE public.evidence_research_gen_ai_datasets_id_seq
ALTER SEQUENCE public.evidence_research_gen_ai_datasets_id_seq OWNED BY public.evidence_research_gen_ai_datasets.id;


--
-- Name: evidence_research_gen_ai_feedback_evaluations; Type: TABLE; Schema: public; Owner: -
--

CREATE TABLE public.evidence_research_gen_ai_feedback_evaluations (
id bigint NOT NULL,
trial_id integer NOT NULL,
version integer NOT NULL,
llm_example_id integer NOT NULL,
optimal boolean NOT NULL,
errored_checks character varying[] DEFAULT '{}'::character varying[],
created_at timestamp(6) without time zone NOT NULL,
updated_at timestamp(6) without time zone NOT NULL
);


--
-- Name: evidence_research_gen_ai_feedback_evaluations_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

CREATE SEQUENCE public.evidence_research_gen_ai_feedback_evaluations_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;


--
-- Name: evidence_research_gen_ai_feedback_evaluations_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--

ALTER SEQUENCE public.evidence_research_gen_ai_feedback_evaluations_id_seq OWNED BY public.evidence_research_gen_ai_feedback_evaluations.id;


--
-- Name: evidence_research_gen_ai_g_eval_scores; Type: TABLE; Schema: public; Owner: -
--
@@ -7253,6 +7323,13 @@ ALTER TABLE ONLY public.evidence_research_gen_ai_dataset_relevant_texts ALTER CO
ALTER TABLE ONLY public.evidence_research_gen_ai_datasets ALTER COLUMN id SET DEFAULT nextval('public.evidence_research_gen_ai_datasets_id_seq'::regclass);


--
-- Name: evidence_research_gen_ai_feedback_evaluations id; Type: DEFAULT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evidence_research_gen_ai_feedback_evaluations ALTER COLUMN id SET DEFAULT nextval('public.evidence_research_gen_ai_feedback_evaluations_id_seq'::regclass);


--
-- Name: evidence_research_gen_ai_g_eval_scores id; Type: DEFAULT; Schema: public; Owner: -
--
@@ -8619,6 +8696,14 @@ ALTER TABLE ONLY public.evidence_research_gen_ai_datasets
ADD CONSTRAINT evidence_research_gen_ai_datasets_pkey PRIMARY KEY (id);


--
-- Name: evidence_research_gen_ai_feedback_evaluations evidence_research_gen_ai_feedback_evaluations_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evidence_research_gen_ai_feedback_evaluations
ADD CONSTRAINT evidence_research_gen_ai_feedback_evaluations_pkey PRIMARY KEY (id);


--
-- Name: evidence_research_gen_ai_g_eval_scores evidence_research_gen_ai_g_eval_scores_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--
@@ -9468,6 +9553,13 @@ CREATE UNIQUE INDEX feedback_history_ratings_uniqueness ON public.feedback_histo
CREATE INDEX idx_on_classroom_unit_id_8502333889 ON public.student_learning_sequence_activities USING btree (classroom_unit_id);


--
-- Name: idx_on_llm_example_id_774518eafd; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_on_llm_example_id_774518eafd ON public.evidence_research_gen_ai_feedback_evaluations USING btree (llm_example_id);


--
-- Name: idx_on_student_learning_sequence_id_63827699e9; Type: INDEX; Schema: public; Owner: -
--
@@ -9482,6 +9574,13 @@ CREATE INDEX idx_on_student_learning_sequence_id_63827699e9 ON public.student_le
CREATE UNIQUE INDEX idx_on_student_learning_sequence_id_classroom_unit__84e420e79d ON public.student_learning_sequence_activities USING btree (student_learning_sequence_id, classroom_unit_id, activity_id);


--
-- Name: idx_on_trial_id_eee55ce93c; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_on_trial_id_eee55ce93c ON public.evidence_research_gen_ai_feedback_evaluations USING btree (trial_id);


--
-- Name: idx_on_user_id_initial_classroom_unit_id_initial_ac_17357537ed; Type: INDEX; Schema: public; Owner: -
--
@@ -11976,6 +12075,8 @@ ALTER TABLE ONLY public.learn_worlds_account_course_events
SET search_path TO "$user", public;

INSERT INTO "schema_migrations" (version) VALUES
('20250130191426'),
('20250109151117'),
('20250109202521'),
('20241211155124'),
('20241211152410'),
--- LLMExamplesController ---
@@ -5,7 +5,7 @@ module Research
module GenAI
class LLMExamplesController < ApplicationController
def index
render json: { llm_examples: trial.llm_examples.order(:id) }
render json: { llm_examples: trial.llm_examples.includes(:feedback_evaluation).order(:id) }
end

private def trial = @trial ||= Trial.find(params[:trial_id])
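The includes(:feedback_evaluation) matters because the serializer below now embeds each example's evaluation; without the preload, rendering N examples would issue N extra queries. An illustrative before/after:

# N+1: one query for the examples, then one more per example for its evaluation
trial.llm_examples.order(:id).map { |example| example.feedback_evaluation }

# Preloaded: one query for the examples plus one batched query for all evaluations
trial.llm_examples.includes(:feedback_evaluation).order(:id)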
--- New model: Evidence::Research::GenAI::FeedbackEvaluation ---
@@ -0,0 +1,44 @@
# frozen_string_literal: true

# == Schema Information
#
# Table name: evidence_research_gen_ai_feedback_evaluations
#
# id :bigint not null, primary key
# errored_checks :string default([]), is an Array
# optimal :boolean not null
# version :integer not null
# created_at :datetime not null
# updated_at :datetime not null
# llm_example_id :integer not null
# trial_id :integer not null
#
# Indexes
#
# idx_on_llm_example_id_774518eafd (llm_example_id)
# idx_on_trial_id_eee55ce93c (trial_id)
#
module Evidence
module Research
module GenAI
class FeedbackEvaluation < ApplicationRecord
belongs_to :trial
belongs_to :llm_example

attr_readonly :trial_id, :version, :llm_example_id, :optimal, :errored_checks

validates :trial_id, :llm_example_id, presence: true
validates :optimal, inclusion: { in: [true, false] }

scope :suboptimal, -> { where(optimal: false) }

def serializable_hash(options = nil)
options ||= {}
super(options.reverse_merge(
only: [:optimal, :errored_checks, :version]
))
end
end
end
end
end
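The serializable_hash override whitelists just the three fields the UI needs, so ids and timestamps never leak into the JSON payload. A sketch of the resulting shape (values are illustrative):

evaluation = Evidence::Research::GenAI::FeedbackEvaluation.suboptimal.last
evaluation.serializable_hash
# => { "optimal" => false, "errored_checks" => ["some_check"], "version" => 1 }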
--- LLMExample model ---
@@ -23,6 +23,9 @@ class LLMExample < ApplicationRecord
belongs_to :trial
belongs_to :test_example

has_many :feedback_evaluations
has_one :feedback_evaluation, -> { order(created_at: :desc) }, class_name: 'FeedbackEvaluation'

validates :raw_text, presence: true
validates :llm_feedback, presence: true
validates :test_example_id, presence: true
@@ -36,8 +39,8 @@
def serializable_hash(options = nil)
options ||= {}
super(options.reverse_merge(
include: [:test_example],
methods: [:optimal_or_suboptimal_match?, :optimal?, :suboptimal?, :rag_label]
include: [:test_example, :feedback_evaluation],
methods: [:optimal_or_suboptimal_match?, :optimal?, :suboptimal?, :rag_label, :feedback_errors]
))
end

@@ -51,6 +54,8 @@ def suboptimal_match? = test_suboptimal? && suboptimal?
def test_optimal? = test_example.optimal?
def test_suboptimal? = test_example.suboptimal?

def feedback_errors = feedback_evaluation&.errored_checks&.join(', ')

def rag_label
return llm_feedback if llm_feedback.start_with?('Label_')
return 'Optimal' if llm_feedback.start_with?('Optimal')
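Because the has_one scope orders by created_at descending, feedback_evaluation resolves to the newest evaluation for the example, and feedback_errors flattens its errored_checks into a display string; the safe navigation means examples that were never evaluated serialize with nil rather than raising. A small sketch (values are illustrative):

llm_example = Evidence::Research::GenAI::LLMExample.last
llm_example.feedback_evaluation # => most recent FeedbackEvaluation, or nil
llm_example.feedback_errors     # => "too_many_questions, verbose"

Evidence::Research::GenAI::LLMExample.new.feedback_errors # => nil, not an exception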