Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LabeledEntry model (#12422) #12427

Merged
merged 1 commit into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions services/QuillLMS/Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ gem 'dotenv-rails', '~> 2.6'
gem 'ancestry', '~> 3.0.5'
gem 'atomic_arrays', '~> 1.1.0'
gem 'bulk_insert', '~> 1.7'
# Nearest-neighbor search over vector columns for ActiveRecord; required by the
# Evidence::LabeledEntry model (`has_neighbors`) and by the migration that declares its `t.vector` column.
# NOTE(review): unlike the surrounding gems this one carries no version constraint — confirm that is intentional.
gem 'neighbor'
gem 'pg', '1.4.2'
gem 'rails_admin', '~> 3.1.2'
gem 'ranked-model', '~> 0.4.3'
Expand Down
1 change: 1 addition & 0 deletions services/QuillLMS/Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,7 @@ DEPENDENCIES
maxminddb
memory_profiler
mini_racer (= 0.8.0)
neighbor
newrelic_rpm (~> 9.3.1)
nokogiri (>= 1.13.2)
omniauth
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# frozen_string_literal: true

# This migration comes from evidence (originally 20240925184213)
require 'neighbor'
class CreateLabeledEntries < ActiveRecord::Migration[7.1]
  # Creates the evidence_labeled_entries table together with its two indexes:
  # a plain lookup index on prompt_id and a unique index on (prompt_id, entry).
  def change
    create_table :evidence_labeled_entries do |t|
      t.boolean :approved
      t.text :entry, null: false
      t.text :label, null: false
      t.text :label_transformed, null: false
      t.jsonb :metadata
      t.integer :prompt_id, null: false
      # Vector column type comes from the neighbor gem required at the top of this file.
      t.vector :embedding, limit: 1536, null: false

      t.timestamps

      # Declared inside the table definition; the generated index names are the same
      # as with separate add_index calls.
      t.index :prompt_id
      t.index %i[prompt_id entry], unique: true
    end
  end
end
67 changes: 67 additions & 0 deletions services/QuillLMS/db/structure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2826,6 +2826,43 @@ CREATE SEQUENCE public.evidence_hints_id_seq
ALTER SEQUENCE public.evidence_hints_id_seq OWNED BY public.evidence_hints.id;


--
-- Name: evidence_labeled_entries; Type: TABLE; Schema: public; Owner: -
--

-- Backing table for the Evidence::LabeledEntry model.
-- A unique index on (prompt_id, entry) is defined further down in this file.
CREATE TABLE public.evidence_labeled_entries (
id bigint NOT NULL,
-- Nullable, no default.
approved boolean,
entry text NOT NULL,
label text NOT NULL,
-- Set by the model before validation: labels of the form Optimal_<digits> are stored as 'Optimal',
-- every other label is copied through unchanged.
label_transformed text NOT NULL,
metadata jsonb,
prompt_id integer NOT NULL,
-- 1536 dimensions, matching Evidence::LabeledEntry::DIMENSION.
embedding public.vector(1536) NOT NULL,
created_at timestamp(6) without time zone NOT NULL,
updated_at timestamp(6) without time zone NOT NULL
);


--
-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

CREATE SEQUENCE public.evidence_labeled_entries_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;


--
-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--

ALTER SEQUENCE public.evidence_labeled_entries_id_seq OWNED BY public.evidence_labeled_entries.id;


--
-- Name: evidence_prompt_healths; Type: TABLE; Schema: public; Owner: -
--
Expand Down Expand Up @@ -6998,6 +7035,13 @@ ALTER TABLE ONLY public.evidence_automl_models ALTER COLUMN id SET DEFAULT nextv
ALTER TABLE ONLY public.evidence_hints ALTER COLUMN id SET DEFAULT nextval('public.evidence_hints_id_seq'::regclass);


--
-- Name: evidence_labeled_entries id; Type: DEFAULT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evidence_labeled_entries ALTER COLUMN id SET DEFAULT nextval('public.evidence_labeled_entries_id_seq'::regclass);


--
-- Name: evidence_prompt_healths id; Type: DEFAULT; Schema: public; Owner: -
--
Expand Down Expand Up @@ -8317,6 +8361,14 @@ ALTER TABLE ONLY public.evidence_hints
ADD CONSTRAINT evidence_hints_pkey PRIMARY KEY (id);


--
-- Name: evidence_labeled_entries evidence_labeled_entries_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evidence_labeled_entries
ADD CONSTRAINT evidence_labeled_entries_pkey PRIMARY KEY (id);


--
-- Name: evidence_prompt_healths evidence_prompt_healths_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--
Expand Down Expand Up @@ -9922,6 +9974,20 @@ CREATE INDEX index_evidence_automl_models_on_prompt_id ON public.evidence_automl
CREATE INDEX index_evidence_hints_on_rule_id ON public.evidence_hints USING btree (rule_id);


--
-- Name: index_evidence_labeled_entries_on_prompt_id; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX index_evidence_labeled_entries_on_prompt_id ON public.evidence_labeled_entries USING btree (prompt_id);


--
-- Name: index_evidence_labeled_entries_on_prompt_id_and_entry; Type: INDEX; Schema: public; Owner: -
--

CREATE UNIQUE INDEX index_evidence_labeled_entries_on_prompt_id_and_entry ON public.evidence_labeled_entries USING btree (prompt_id, entry);


--
-- Name: index_evidence_prompt_healths_on_evidence_activity_health_id; Type: INDEX; Schema: public; Owner: -
--
Expand Down Expand Up @@ -11660,6 +11726,7 @@ ALTER TABLE ONLY public.learn_worlds_account_course_events
SET search_path TO "$user", public;

INSERT INTO "schema_migrations" (version) VALUES
('20240925185730'),
('20240924151321'),
('20240924151311'),
('20240918144926'),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# frozen_string_literal: true

# == Schema Information
#
# Table name: evidence_labeled_entries
#
# id :bigint not null, primary key
# approved :boolean
# embedding :vector(1536) not null
# entry :text not null
# label :text not null
# label_transformed :text not null
# metadata :jsonb
# created_at :datetime not null
# updated_at :datetime not null
# prompt_id :integer not null
#
# Indexes
#
# index_evidence_labeled_entries_on_prompt_id (prompt_id)
# index_evidence_labeled_entries_on_prompt_id_and_entry (prompt_id,entry) UNIQUE
#

require 'neighbor'

module Evidence
  # A labeled entry belonging to a prompt, stored together with an embedding vector so the
  # closest already-labeled entry for the same prompt can be looked up by vector distance.
  class LabeledEntry < ApplicationRecord
    # Dimension and model are coupled: https://platform.openai.com/docs/guides/embeddings
    # DIMENSION also has to match the size of the `embedding` vector column (1536).
    DIMENSION = 1536
    MODEL = 'text-embedding-3-small'

    # Distance function handed to nearest_neighbors.
    DISTANCE_METRIC = 'cosine'
    # Value stored in label_transformed for every label of the form "Optimal_<digits>".
    COLLAPSED_OPTIMAL_LABEL = 'Optimal'

    belongs_to :prompt

    has_neighbors :embedding

    validates :embedding, presence: true
    validates :label, presence: true
    validates :label_transformed, presence: true
    validates :prompt, presence: true
    validates :entry, presence: true

    # Order matters: the entry is stripped first so that the embedding is computed from exactly
    # the text that is persisted, rather than from the unstripped input.
    before_validation :set_entry, :set_embedding, :set_transformed_label

    # Closest entry, by DISTANCE_METRIC over `embedding`, among entries with the same prompt_id.
    #
    # @return [Evidence::LabeledEntry, nil] nil when the scoped query yields no record
    def nearest_neighbor
      nearest_neighbors(:embedding, distance: DISTANCE_METRIC)
        .where(prompt_id:)
        .first
    end

    # Distance and label of the nearest neighbor.
    #
    # @return [Hash] `{ distance:, label: }`; both values are nil when there is no neighbor
    def nearest_label
      val = nearest_neighbor

      # neighbor_distance is read from the record returned by the nearest_neighbors query.
      { distance: val&.neighbor_distance, label: val&.label }
    end

    # Removes leading/trailing whitespace from entry; blank or nil entries are left untouched
    # so the presence validation reports them.
    private def set_entry
      self.entry = entry.strip if entry.present?
    end

    # Fetches an embedding for entry unless entry is blank or an embedding is already set.
    # NOTE(review): an existing embedding is never refreshed, even if entry is changed later —
    # confirm entries are treated as immutable after creation.
    private def set_embedding
      return if entry.blank? || embedding.present?

      # presumably returns a DIMENSION-sized numeric vector — confirm against EmbeddingFetcher
      self.embedding = Evidence::OpenAI::EmbeddingFetcher.run(dimension: DIMENSION, input: entry, model: MODEL)
    end

    # Derives label_transformed from label: "Optimal_<digits>" collapses to COLLAPSED_OPTIMAL_LABEL,
    # anything else (including nil/blank) is copied through unchanged.
    private def set_transformed_label
      if label.present? && label.match?(/\AOptimal_\d+\z/)
        self.label_transformed = COLLAPSED_OPTIMAL_LABEL
      else
        self.label_transformed = label
      end
    end
  end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

require 'neighbor'
class CreateLabeledEntries < ActiveRecord::Migration[7.1]
  # Creates the evidence_labeled_entries table.
  #
  # Matches the copy of this migration installed in the host application, which declares
  # the nullable `approved` flag and a unique (prompt_id, entry) index; both are also listed
  # in the schema annotations of the engine's model and factory.
  def change
    create_table :evidence_labeled_entries do |t|
      t.boolean :approved
      t.text :entry, null: false
      t.text :label, null: false
      t.text :label_transformed, null: false
      t.jsonb :metadata
      t.integer :prompt_id, null: false
      # Vector column type comes from the neighbor gem required at the top of this file.
      t.vector :embedding, limit: 1536, null: false

      t.timestamps
    end

    add_index :evidence_labeled_entries, :prompt_id
    # At most one row per entry text within a prompt.
    add_index :evidence_labeled_entries, [:prompt_id, :entry], unique: true
  end
end
59 changes: 59 additions & 0 deletions services/QuillLMS/engines/evidence/spec/dummy/db/structure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,42 @@ CREATE SEQUENCE public.evidence_hints_id_seq
ALTER SEQUENCE public.evidence_hints_id_seq OWNED BY public.evidence_hints.id;


--
-- Name: evidence_labeled_entries; Type: TABLE; Schema: public; Owner: -
--

-- Backing table for the Evidence::LabeledEntry model in the engine's dummy app.
-- NOTE(review): the host application's version of this table additionally has an `approved boolean`
-- column and a unique index on (prompt_id, entry), and the engine model's schema annotation lists both —
-- confirm whether this dump is stale and should be regenerated.
CREATE TABLE public.evidence_labeled_entries (
id bigint NOT NULL,
entry text NOT NULL,
label text NOT NULL,
-- Set by the model before validation: labels of the form Optimal_<digits> are stored as 'Optimal',
-- every other label is copied through unchanged.
label_transformed text NOT NULL,
metadata jsonb,
prompt_id integer NOT NULL,
-- 1536 dimensions, matching Evidence::LabeledEntry::DIMENSION.
embedding public.vector(1536) NOT NULL,
created_at timestamp(6) without time zone NOT NULL,
updated_at timestamp(6) without time zone NOT NULL
);


--
-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

CREATE SEQUENCE public.evidence_labeled_entries_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;


--
-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--

ALTER SEQUENCE public.evidence_labeled_entries_id_seq OWNED BY public.evidence_labeled_entries.id;


--
-- Name: evidence_prompt_healths; Type: TABLE; Schema: public; Owner: -
--
Expand Down Expand Up @@ -1760,6 +1796,13 @@ ALTER TABLE ONLY public.evidence_automl_models ALTER COLUMN id SET DEFAULT nextv
ALTER TABLE ONLY public.evidence_hints ALTER COLUMN id SET DEFAULT nextval('public.evidence_hints_id_seq'::regclass);


--
-- Name: evidence_labeled_entries id; Type: DEFAULT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evidence_labeled_entries ALTER COLUMN id SET DEFAULT nextval('public.evidence_labeled_entries_id_seq'::regclass);


--
-- Name: evidence_prompt_healths id; Type: DEFAULT; Schema: public; Owner: -
--
Expand Down Expand Up @@ -2102,6 +2145,14 @@ ALTER TABLE ONLY public.evidence_hints
ADD CONSTRAINT evidence_hints_pkey PRIMARY KEY (id);


--
-- Name: evidence_labeled_entries evidence_labeled_entries_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evidence_labeled_entries
ADD CONSTRAINT evidence_labeled_entries_pkey PRIMARY KEY (id);


--
-- Name: evidence_prompt_healths evidence_prompt_healths_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--
Expand Down Expand Up @@ -2444,6 +2495,13 @@ CREATE INDEX index_evidence_automl_models_on_prompt_id ON public.evidence_automl
CREATE INDEX index_evidence_hints_on_rule_id ON public.evidence_hints USING btree (rule_id);


--
-- Name: index_evidence_labeled_entries_on_prompt_id; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX index_evidence_labeled_entries_on_prompt_id ON public.evidence_labeled_entries USING btree (prompt_id);


--
-- Name: index_evidence_prompt_healths_on_evidence_activity_health_id; Type: INDEX; Schema: public; Owner: -
--
Expand Down Expand Up @@ -2514,6 +2572,7 @@ ALTER TABLE ONLY public.comprehension_regex_rules
SET search_path TO "$user", public;

INSERT INTO "schema_migrations" (version) VALUES
('20240925184213'),
('20240918144745'),
('20240828221309'),
('20240823204315'),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# frozen_string_literal: true

# == Schema Information
#
# Table name: evidence_labeled_entries
#
# id :bigint not null, primary key
# approved :boolean
# embedding :vector(1536) not null
# entry :text not null
# label :text not null
# label_transformed :text not null
# metadata :jsonb
# created_at :datetime not null
# updated_at :datetime not null
# prompt_id :integer not null
#
# Indexes
#
# index_evidence_labeled_entries_on_prompt_id (prompt_id)
# index_evidence_labeled_entries_on_prompt_id_and_entry (prompt_id,entry) UNIQUE
#

FactoryBot.define do
  # Builds an Evidence::LabeledEntry with a random, correctly sized embedding already filled in,
  # so the model's set_embedding callback returns early instead of fetching one.
  factory :evidence_labeled_entry, class: 'Evidence::LabeledEntry' do
    entry { Faker::Lorem.sentence }
    # One random component in -1.0..1.0 per embedding dimension.
    embedding { Evidence::LabeledEntry::DIMENSION.times.map { rand(-1.0..1.0) } }
    label { format('Label_%d', rand(0..10)) }
    label_transformed { label }

    prompt factory: :evidence_prompt
  end
end
Loading