From 57ecce3c0ff40ce2a9a7c1f4dbdcc23213acacc8 Mon Sep 17 00:00:00 2001
From: brendanshean
Date: Thu, 26 Sep 2024 13:19:57 -0700
Subject: [PATCH] Add LabeledEntry model (#12422)

* Add neighbor gem to main app

* Add LabeledEntry model

* Dan suggestions
---
 services/QuillLMS/Gemfile                     |   1 +
 services/QuillLMS/Gemfile.lock                |   1 +
 ...5185730_create_labeled_entries.evidence.rb |  22 ++
 services/QuillLMS/db/structure.sql            |  67 ++++++
 .../app/models/evidence/labeled_entry.rb      |  89 ++++++++
 .../20240925184213_create_labeled_entries.rb  |  21 ++
 .../evidence/spec/dummy/db/structure.sql      |  67 ++++++
 .../factories/evidence/labeled_entries.rb     |  33 +++
 .../models/evidence/labeled_entry_spec.rb     | 194 ++++++++++++++++++
 9 files changed, 495 insertions(+)
 create mode 100644 services/QuillLMS/db/migrate/20240925185730_create_labeled_entries.evidence.rb
 create mode 100644 services/QuillLMS/engines/evidence/app/models/evidence/labeled_entry.rb
 create mode 100644 services/QuillLMS/engines/evidence/db/migrate/20240925184213_create_labeled_entries.rb
 create mode 100644 services/QuillLMS/engines/evidence/spec/factories/evidence/labeled_entries.rb
 create mode 100644 services/QuillLMS/engines/evidence/spec/models/evidence/labeled_entry_spec.rb

diff --git a/services/QuillLMS/Gemfile b/services/QuillLMS/Gemfile
index 82f1118fe2..ece7feada1 100644
--- a/services/QuillLMS/Gemfile
+++ b/services/QuillLMS/Gemfile
@@ -15,6 +15,7 @@ gem 'dotenv-rails', '~> 2.6'
 gem 'ancestry', '~> 3.0.5'
 gem 'atomic_arrays', '~> 1.1.0'
 gem 'bulk_insert', '~> 1.7'
+gem 'neighbor'
 gem 'pg', '1.4.2'
 gem 'rails_admin', '~> 3.1.2'
 gem 'ranked-model', '~> 0.4.3'
diff --git a/services/QuillLMS/Gemfile.lock b/services/QuillLMS/Gemfile.lock
index a2093ac367..c6f665aa9f 100644
--- a/services/QuillLMS/Gemfile.lock
+++ b/services/QuillLMS/Gemfile.lock
@@ -943,6 +943,7 @@ DEPENDENCIES
   maxminddb
   memory_profiler
   mini_racer (= 0.8.0)
+  neighbor
   newrelic_rpm (~> 9.3.1)
   nokogiri (>= 1.13.2)
   omniauth
diff --git a/services/QuillLMS/db/migrate/20240925185730_create_labeled_entries.evidence.rb b/services/QuillLMS/db/migrate/20240925185730_create_labeled_entries.evidence.rb
new file mode 100644
index 0000000000..777d6870b7
--- /dev/null
+++ b/services/QuillLMS/db/migrate/20240925185730_create_labeled_entries.evidence.rb
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+# This migration comes from evidence (originally 20240925184213)
+require 'neighbor'
+class CreateLabeledEntries < ActiveRecord::Migration[7.1]
+  def change
+    create_table :evidence_labeled_entries do |t|
+      t.boolean :approved
+      t.text :entry, null: false
+      t.text :label, null: false
+      t.text :label_transformed, null: false
+      t.jsonb :metadata
+      t.integer :prompt_id, null: false
+      t.vector :embedding, limit: 1536, null: false
+
+      t.timestamps
+    end
+
+    add_index :evidence_labeled_entries, :prompt_id
+    add_index :evidence_labeled_entries, [:prompt_id, :entry], unique: true
+  end
+end
diff --git a/services/QuillLMS/db/structure.sql b/services/QuillLMS/db/structure.sql
index 9ae6236145..0cb8b30674 100644
--- a/services/QuillLMS/db/structure.sql
+++ b/services/QuillLMS/db/structure.sql
@@ -2826,6 +2826,43 @@ CREATE SEQUENCE public.evidence_hints_id_seq
 ALTER SEQUENCE public.evidence_hints_id_seq OWNED BY public.evidence_hints.id;
 
 
+--
+-- Name: evidence_labeled_entries; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.evidence_labeled_entries (
+    id bigint NOT NULL,
+    approved boolean,
+    entry text NOT NULL,
+    label text NOT NULL,
+    label_transformed text NOT NULL,
+    metadata jsonb,
+    prompt_id integer NOT NULL,
+    embedding public.vector(1536) NOT NULL,
+    created_at timestamp(6) without time zone NOT NULL,
+    updated_at timestamp(6) without time zone NOT NULL
+);
+
+
+--
+-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE; Schema: public; Owner: -
+--
+
+CREATE SEQUENCE public.evidence_labeled_entries_id_seq
+    START WITH 1
+    INCREMENT BY 1
+    NO MINVALUE
+    NO MAXVALUE
+    CACHE 1;
+
+
+--
+-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
+--
+
+ALTER SEQUENCE public.evidence_labeled_entries_id_seq OWNED BY public.evidence_labeled_entries.id;
+
+
 --
 -- Name: evidence_prompt_healths; Type: TABLE; Schema: public; Owner: -
 --
@@ -6998,6 +7035,13 @@ ALTER TABLE ONLY public.evidence_automl_models ALTER COLUMN id SET DEFAULT nextv
 ALTER TABLE ONLY public.evidence_hints ALTER COLUMN id SET DEFAULT nextval('public.evidence_hints_id_seq'::regclass);
 
 
+--
+-- Name: evidence_labeled_entries id; Type: DEFAULT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.evidence_labeled_entries ALTER COLUMN id SET DEFAULT nextval('public.evidence_labeled_entries_id_seq'::regclass);
+
+
 --
 -- Name: evidence_prompt_healths id; Type: DEFAULT; Schema: public; Owner: -
 --
@@ -8317,6 +8361,14 @@ ALTER TABLE ONLY public.evidence_hints
     ADD CONSTRAINT evidence_hints_pkey PRIMARY KEY (id);
 
 
+--
+-- Name: evidence_labeled_entries evidence_labeled_entries_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.evidence_labeled_entries
+    ADD CONSTRAINT evidence_labeled_entries_pkey PRIMARY KEY (id);
+
+
 --
 -- Name: evidence_prompt_healths evidence_prompt_healths_pkey; Type: CONSTRAINT; Schema: public; Owner: -
 --
@@ -9922,6 +9974,20 @@ CREATE INDEX index_evidence_automl_models_on_prompt_id ON public.evidence_automl
 CREATE INDEX index_evidence_hints_on_rule_id ON public.evidence_hints USING btree (rule_id);
 
 
+--
+-- Name: index_evidence_labeled_entries_on_prompt_id; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX index_evidence_labeled_entries_on_prompt_id ON public.evidence_labeled_entries USING btree (prompt_id);
+
+
+--
+-- Name: index_evidence_labeled_entries_on_prompt_id_and_entry; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE UNIQUE INDEX index_evidence_labeled_entries_on_prompt_id_and_entry ON public.evidence_labeled_entries USING btree (prompt_id, entry);
+
+
 --
 -- Name: index_evidence_prompt_healths_on_evidence_activity_health_id; Type: INDEX; Schema: public; Owner: -
 --
@@ -11660,6 +11726,7 @@ ALTER TABLE ONLY public.learn_worlds_account_course_events
 SET search_path TO "$user", public;
 
 INSERT INTO "schema_migrations" (version) VALUES
+('20240925185730'),
 ('20240924151321'),
 ('20240924151311'),
 ('20240918144926'),
diff --git a/services/QuillLMS/engines/evidence/app/models/evidence/labeled_entry.rb b/services/QuillLMS/engines/evidence/app/models/evidence/labeled_entry.rb
new file mode 100644
index 0000000000..30aab6f9a2
--- /dev/null
+++ b/services/QuillLMS/engines/evidence/app/models/evidence/labeled_entry.rb
@@ -0,0 +1,89 @@
+# frozen_string_literal: true
+
+# == Schema Information
+#
+# Table name: evidence_labeled_entries
+#
+#  id                :bigint           not null, primary key
+#  approved          :boolean
+#  embedding         :vector(1536)     not null
+#  entry             :text             not null
+#  label             :text             not null
+#  label_transformed :text             not null
+#  metadata          :jsonb
+#  created_at        :datetime         not null
+#  updated_at        :datetime         not null
+#  prompt_id         :integer          not null
+#
+# Indexes
+#
+#  index_evidence_labeled_entries_on_prompt_id            (prompt_id)
+#  index_evidence_labeled_entries_on_prompt_id_and_entry  (prompt_id,entry) UNIQUE
+#
+
+require 'neighbor'
+
+module Evidence
+  # A labeled text entry for a prompt, stored with an embedding vector so the
+  # closest other entry for the same prompt can be looked up.
+  class LabeledEntry < ApplicationRecord
+    # Dimension and model are coupled: https://platform.openai.com/docs/guides/embeddings
+    DIMENSION = 1536
+    MODEL = 'text-embedding-3-small'
+
+    DISTANCE_METRIC = 'cosine'
+    COLLAPSED_OPTIMAL_LABEL = 'Optimal'
+
+    belongs_to :prompt
+
+    has_neighbors :embedding
+
+    validates :embedding, presence: true
+    validates :label, presence: true
+    validates :label_transformed, presence: true
+    validates :prompt, presence: true
+    validates :entry, presence: true
+
+    # Order matters: strip the entry first so the embedding is fetched for the
+    # same text that gets stored.
+    before_validation :set_entry, :set_embedding, :set_transformed_label
+
+    # Returns the nearest record by DISTANCE_METRIC among those with the same
+    # prompt_id, or nil when there is none.
+    def nearest_neighbor
+      nearest_neighbors(:embedding, distance: DISTANCE_METRIC)
+        .where(prompt_id:)
+        .first
+    end
+
+    # Returns { distance:, label: } taken from nearest_neighbor; both values
+    # are nil when there is no neighbor.
+    def nearest_label
+      val = nearest_neighbor
+
+      { distance: val&.neighbor_distance, label: val&.label }
+    end
+
+    # Strips leading and trailing whitespace from a present entry.
+    private def set_entry
+      self.entry = entry.strip if entry.present?
+    end
+
+    # Fetches an embedding for the entry unless one is already set.
+    private def set_embedding
+      return if entry.blank? || embedding.present?
+
+      self.embedding = Evidence::OpenAI::EmbeddingFetcher.run(dimension: DIMENSION, input: entry, model: MODEL)
+    end
+
+    # Maps labels of the form "Optimal_<digits>" to COLLAPSED_OPTIMAL_LABEL;
+    # any other label is copied as is.
+    private def set_transformed_label
+      if label.present? && label.match?(/\AOptimal_\d+\z/)
+        self.label_transformed = COLLAPSED_OPTIMAL_LABEL
+      else
+        self.label_transformed = label
+      end
+    end
+  end
+end
diff --git a/services/QuillLMS/engines/evidence/db/migrate/20240925184213_create_labeled_entries.rb b/services/QuillLMS/engines/evidence/db/migrate/20240925184213_create_labeled_entries.rb
new file mode 100644
index 0000000000..df656f6dc2
--- /dev/null
+++ b/services/QuillLMS/engines/evidence/db/migrate/20240925184213_create_labeled_entries.rb
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+require 'neighbor'
+class CreateLabeledEntries < ActiveRecord::Migration[7.1]
+  def change
+    create_table :evidence_labeled_entries do |t|
+      t.boolean :approved
+      t.text :entry, null: false
+      t.text :label, null: false
+      t.text :label_transformed, null: false
+      t.jsonb :metadata
+      t.integer :prompt_id, null: false
+      t.vector :embedding, limit: 1536, null: false
+
+      t.timestamps
+    end
+
+    add_index :evidence_labeled_entries, :prompt_id
+    add_index :evidence_labeled_entries, [:prompt_id, :entry], unique: true
+  end
+end
diff --git a/services/QuillLMS/engines/evidence/spec/dummy/db/structure.sql b/services/QuillLMS/engines/evidence/spec/dummy/db/structure.sql
index 3a7d98a852..ffec790e1b 100644
--- a/services/QuillLMS/engines/evidence/spec/dummy/db/structure.sql
+++ b/services/QuillLMS/engines/evidence/spec/dummy/db/structure.sql
@@ -718,6 +718,43 @@ CREATE SEQUENCE public.evidence_hints_id_seq
 ALTER SEQUENCE public.evidence_hints_id_seq OWNED BY public.evidence_hints.id;
 
 
+--
+-- Name: evidence_labeled_entries; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.evidence_labeled_entries (
+    id bigint NOT NULL,
+    approved boolean,
+    entry text NOT NULL,
+    label text NOT NULL,
+    label_transformed text NOT NULL,
+    metadata jsonb,
+    prompt_id integer NOT NULL,
+    embedding public.vector(1536) NOT NULL,
+    created_at timestamp(6) without time zone NOT NULL,
+    updated_at timestamp(6) without time zone NOT NULL
+);
+
+
+--
+-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE; Schema: public; Owner: -
+--
+
+CREATE SEQUENCE public.evidence_labeled_entries_id_seq
+    START WITH 1
+    INCREMENT BY 1
+    NO MINVALUE
+    NO MAXVALUE
+    CACHE 1;
+
+
+--
+-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
+--
+
+ALTER SEQUENCE public.evidence_labeled_entries_id_seq OWNED BY public.evidence_labeled_entries.id;
+
+
 --
 -- Name: evidence_prompt_healths; Type: TABLE; Schema: public; Owner: -
 --
@@ -1760,6 +1797,13 @@ ALTER TABLE ONLY public.evidence_automl_models ALTER COLUMN id SET DEFAULT nextv
 ALTER TABLE ONLY public.evidence_hints ALTER COLUMN id SET DEFAULT nextval('public.evidence_hints_id_seq'::regclass);
 
 
+--
+-- Name: evidence_labeled_entries id; Type: DEFAULT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.evidence_labeled_entries ALTER COLUMN id SET DEFAULT nextval('public.evidence_labeled_entries_id_seq'::regclass);
+
+
 --
 -- Name: evidence_prompt_healths id; Type: DEFAULT; Schema: public; Owner: -
 --
@@ -2102,6 +2146,14 @@ ALTER TABLE ONLY public.evidence_hints
     ADD CONSTRAINT evidence_hints_pkey PRIMARY KEY (id);
 
 
+--
+-- Name: evidence_labeled_entries evidence_labeled_entries_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.evidence_labeled_entries
+    ADD CONSTRAINT evidence_labeled_entries_pkey PRIMARY KEY (id);
+
+
 --
 -- Name: evidence_prompt_healths evidence_prompt_healths_pkey; Type: CONSTRAINT; Schema: public; Owner: -
 --
@@ -2444,6 +2496,20 @@ CREATE INDEX index_evidence_automl_models_on_prompt_id ON public.evidence_automl
 CREATE INDEX index_evidence_hints_on_rule_id ON public.evidence_hints USING btree (rule_id);
 
 
+--
+-- Name: index_evidence_labeled_entries_on_prompt_id; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX index_evidence_labeled_entries_on_prompt_id ON public.evidence_labeled_entries USING btree (prompt_id);
+
+
+--
+-- Name: index_evidence_labeled_entries_on_prompt_id_and_entry; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE UNIQUE INDEX index_evidence_labeled_entries_on_prompt_id_and_entry ON public.evidence_labeled_entries USING btree (prompt_id, entry);
+
+
 --
 -- Name: index_evidence_prompt_healths_on_evidence_activity_health_id; Type: INDEX; Schema: public; Owner: -
 --
@@ -2514,6 +2580,7 @@ ALTER TABLE ONLY public.comprehension_regex_rules
 SET search_path TO "$user", public;
 
 INSERT INTO "schema_migrations" (version) VALUES
+('20240925184213'),
 ('20240918144745'),
 ('20240828221309'),
 ('20240823204315'),
diff --git a/services/QuillLMS/engines/evidence/spec/factories/evidence/labeled_entries.rb b/services/QuillLMS/engines/evidence/spec/factories/evidence/labeled_entries.rb
new file mode 100644
index 0000000000..9054e1a2cb
--- /dev/null
+++ b/services/QuillLMS/engines/evidence/spec/factories/evidence/labeled_entries.rb
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+# == Schema Information
+#
+# Table name: evidence_labeled_entries
+#
+#  id                :bigint           not null, primary key
+#  approved          :boolean
+#  embedding         :vector(1536)     not null
+#  entry             :text             not null
+#  label             :text             not null
+#  label_transformed :text             not null
+#  metadata          :jsonb
+#  created_at        :datetime         not null
+#  updated_at        :datetime         not null
+#  prompt_id         :integer          not null
+#
+# Indexes
+#
+#  index_evidence_labeled_entries_on_prompt_id            (prompt_id)
+#  index_evidence_labeled_entries_on_prompt_id_and_entry  (prompt_id,entry) UNIQUE
+#
+
+FactoryBot.define do
+  factory :evidence_labeled_entry, class: 'Evidence::LabeledEntry' do
+    entry { Faker::Lorem.sentence }
+    embedding { Array.new(Evidence::LabeledEntry::DIMENSION) { rand(-1.0..1.0) } }
+    label { "Label_#{rand(0..10)}" }
+    label_transformed { label }
+
+    association :prompt, factory: :evidence_prompt
+  end
+end
diff --git a/services/QuillLMS/engines/evidence/spec/models/evidence/labeled_entry_spec.rb b/services/QuillLMS/engines/evidence/spec/models/evidence/labeled_entry_spec.rb
new file mode 100644
index 0000000000..4e3d6cb0bc
--- /dev/null
+++ b/services/QuillLMS/engines/evidence/spec/models/evidence/labeled_entry_spec.rb
@@ -0,0 +1,194 @@
+# frozen_string_literal: true
+
+# == Schema Information
+#
+# Table name: evidence_labeled_entries
+#
+#  id                :bigint           not null, primary key
+#  approved          :boolean
+#  embedding         :vector(1536)     not null
+#  entry             :text             not null
+#  label             :text             not null
+#  label_transformed :text             not null
+#  metadata          :jsonb
+#  created_at        :datetime         not null
+#  updated_at        :datetime         not null
+#  prompt_id         :integer          not null
+#
+# Indexes
+#
+#  index_evidence_labeled_entries_on_prompt_id            (prompt_id)
+#  index_evidence_labeled_entries_on_prompt_id_and_entry  (prompt_id,entry) UNIQUE
+#
+
+require 'rails_helper'
+
+module Evidence
+  RSpec.describe LabeledEntry do
+    let(:factory) { described_class.model_name.singular.to_sym }
+    let(:dimension) { described_class::DIMENSION }
+    let(:embedding) { Array.new(dimension) { rand(-1.0..1.0) } }
+
+    it { is_expected.to validate_presence_of(:prompt) }
+    it { is_expected.to validate_presence_of(:entry) }
+    it { is_expected.to validate_presence_of(:embedding) }
+    it { is_expected.to validate_presence_of(:label) }
+    it { is_expected.to validate_presence_of(:label_transformed) }
+
+    context 'validations' do
+      context 'label_transformed' do
+        subject { create(factory, label:).label_transformed }
+
+        context 'when label matches "Optimal_1"' do
+          let(:label) { 'Optimal_1' }
+
+          it { is_expected.to eq described_class::COLLAPSED_OPTIMAL_LABEL }
+        end
+
+        context 'when label matches "Optimal_10"' do
+          let(:label) { 'Optimal_10' }
+
+          it { is_expected.to eq described_class::COLLAPSED_OPTIMAL_LABEL }
+        end
+
+        context 'when label does not match "Optimal_n"' do
+          let(:label) { 'Label_5' }
+
+          it { is_expected.to eq label }
+        end
+
+        context 'when label is nil' do
+          let(:label) { nil }
+
+          it { expect { subject }.to raise_error(ActiveRecord::RecordInvalid) }
+        end
+      end
+
+      context 'entry' do
+        subject { labeled_entry.entry }
+
+        let(:entry) { ' some spaces before and after ' }
+        let(:labeled_entry) { create(factory, entry:) }
+
+        it { is_expected.to eq entry.strip }
+      end
+    end
+
+    context 'with stubbed embedding' do
+      subject { build(factory, entry:, embedding: initial_embedding) }
+
+      let(:entry) { 'sample text' }
+      let(:initial_embedding) { nil }
+      let(:fetcher_class) { Evidence::OpenAI::EmbeddingFetcher }
+
+      before { allow(fetcher_class).to receive(:run).and_return(embedding) }
+
+      context 'when text is present and embedding is nil' do
+        it 'sets the embedding' do
+          subject.validate
+          expect(subject.embedding).to eq embedding
+        end
+      end
+
+      context 'when text is present and embedding is already set' do
+        let(:initial_embedding) { embedding }
+
+        it 'does not change the existing embedding' do
+          subject.validate
+          expect(subject.embedding).to eq initial_embedding
+          expect(fetcher_class).not_to have_received(:run)
+        end
+      end
+
+      context 'when text is nil' do
+        let(:entry) { nil }
+
+        it 'does not set the embedding' do
+          subject.validate
+          expect(subject.embedding).to be_nil
+          expect(fetcher_class).not_to have_received(:run)
+        end
+      end
+    end
+
+    context '#nearest_neighbor' do
+      subject { labeled_entry.nearest_neighbor }
+
+      let(:epsilon) { 0.01 }
+      let(:embedding_plus_epsilon) { embedding.map { |value| value + epsilon } }
+      let(:embedding_plus_two_epsilon) { embedding.map { |value| value + (2 * epsilon) } }
+
+      context 'with no other LabeledEntry records' do
+        let(:labeled_entry) { create(factory) }
+
+        it { is_expected.to eq nil }
+      end
+
+      context 'with other LabeledEntry records but for different prompt' do
+        let(:labeled_entry) { create(factory) }
+
+        before { create(factory) }
+
+        it { is_expected.to eq nil }
+      end
+
+      context 'with one other LabeledEntry' do
+        let!(:labeled_entry1) { create(factory, prompt:, embedding: embedding_plus_epsilon) }
+
+        let(:prompt) { create(:evidence_prompt) }
+        let(:labeled_entry) { create(factory, prompt:, embedding:) }
+
+        it { is_expected.to eq labeled_entry1 }
+      end
+
+      context 'with multiple other LabeledEntry records' do
+        let!(:labeled_entry1) { create(factory, prompt:, embedding: embedding_plus_epsilon) }
+        let!(:labeled_entry2) { create(factory, prompt:, embedding: embedding_plus_two_epsilon) }
+
+        let(:prompt) { create(:evidence_prompt) }
+        let(:labeled_entry) { create(factory, prompt:, embedding:) }
+
+        it { is_expected.to eq labeled_entry1 }
+      end
+    end
+
+    context '#nearest_label' do
+      subject { labeled_entry.nearest_label }
+
+      let(:prompt) { create(:evidence_prompt) }
+      let(:labeled_entry) { create(factory, prompt:) }
+
+      context 'when no neighbors' do
+        it { is_expected.to eq(distance: nil, label: nil) }
+      end
+
+      context 'with other LabeledEntry records' do
+        let!(:existing_prompt_response_label) { create(factory, prompt:) }
+
+        it { expect(subject[:distance]).to be_a(Float) }
+        it { expect(subject[:label]).to eq existing_prompt_response_label.label }
+      end
+    end
+
+    context 'benchmarking', :benchmarking do
+      let(:num_iterations) { 1000 }
+      let(:labeled_entries) { create_list(factory, num_iterations) }
+
+      it 'checks for performance' do
+        [].tap do |times|
+          labeled_entries.each do |labeled_entry|
+            times << Benchmark.realtime { labeled_entry.nearest_neighbors(:embedding, distance: :cosine).first(5) }
+          end
+
+          mean_time = times.reduce(:+) / times.size
+          stddev = Math.sqrt(times.map { |time| (time - mean_time)**2 }.reduce(:+) / times.size)
+
+          puts "\nBenchmarking for nearest neighbor cosine similarity"
+          puts "Model: #{described_class::MODEL}, Dimension: #{described_class::DIMENSION}, num_iterations: #{num_iterations}"
+          puts "Average response time: #{mean_time} seconds"
+          puts "Standard deviation: #{stddev} seconds"
+        end
+      end
+    end
+  end
+end