Skip to content

Commit

Permalink
Merge pull request #12427 from empirical-org/develop
Browse files Browse the repository at this point in the history
Add LabeledEntry model (#12422)
  • Loading branch information
brendanshean authored Sep 26, 2024
2 parents 90eb2ae + 57ecce3 commit b4d8a42
Show file tree
Hide file tree
Showing 9 changed files with 473 additions and 0 deletions.
1 change: 1 addition & 0 deletions services/QuillLMS/Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ gem 'dotenv-rails', '~> 2.6'
gem 'ancestry', '~> 3.0.5'
gem 'atomic_arrays', '~> 1.1.0'
gem 'bulk_insert', '~> 1.7'
gem 'neighbor'
gem 'pg', '1.4.2'
gem 'rails_admin', '~> 3.1.2'
gem 'ranked-model', '~> 0.4.3'
Expand Down
1 change: 1 addition & 0 deletions services/QuillLMS/Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,7 @@ DEPENDENCIES
maxminddb
memory_profiler
mini_racer (= 0.8.0)
neighbor
newrelic_rpm (~> 9.3.1)
nokogiri (>= 1.13.2)
omniauth
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# frozen_string_literal: true

# This migration comes from evidence (originally 20240925184213)
require 'neighbor'
class CreateLabeledEntries < ActiveRecord::Migration[7.1]
  # Creates the evidence_labeled_entries table together with its two indexes.
  # Written as a single reversible `change`, so rolling back drops the table
  # (and its indexes with it).
  def change
    create_table :evidence_labeled_entries do |t|
      t.boolean :approved
      t.text :entry, null: false
      t.text :label, null: false
      t.text :label_transformed, null: false
      t.jsonb :metadata
      t.integer :prompt_id, null: false
      # The `vector` column type is not core Rails; this file requires the
      # neighbor gem before the class is defined.
      t.vector :embedding, limit: 1536, null: false

      t.timestamps

      # Plain lookup index on the owning prompt.
      t.index :prompt_id
      # An entry's text may appear at most once per prompt.
      t.index %i[prompt_id entry], unique: true
    end
  end
end
67 changes: 67 additions & 0 deletions services/QuillLMS/db/structure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2826,6 +2826,43 @@ CREATE SEQUENCE public.evidence_hints_id_seq
ALTER SEQUENCE public.evidence_hints_id_seq OWNED BY public.evidence_hints.id;


--
-- Name: evidence_labeled_entries; Type: TABLE; Schema: public; Owner: -
--

-- Backing table for the Evidence::LabeledEntry model: one labeled text entry
-- per row, tied to a prompt and stored with its vector embedding.
CREATE TABLE public.evidence_labeled_entries (
id bigint NOT NULL, -- default is set further down from evidence_labeled_entries_id_seq
approved boolean, -- nullable, no default
entry text NOT NULL, -- unique together with prompt_id (see the unique index on (prompt_id, entry))
label text NOT NULL,
label_transformed text NOT NULL, -- filled in by the model before validation from label
metadata jsonb,
prompt_id integer NOT NULL,
embedding public.vector(1536) NOT NULL, -- 1536 matches Evidence::LabeledEntry::DIMENSION
created_at timestamp(6) without time zone NOT NULL,
updated_at timestamp(6) without time zone NOT NULL
);


--
-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

CREATE SEQUENCE public.evidence_labeled_entries_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;


--
-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--

ALTER SEQUENCE public.evidence_labeled_entries_id_seq OWNED BY public.evidence_labeled_entries.id;


--
-- Name: evidence_prompt_healths; Type: TABLE; Schema: public; Owner: -
--
Expand Down Expand Up @@ -6998,6 +7035,13 @@ ALTER TABLE ONLY public.evidence_automl_models ALTER COLUMN id SET DEFAULT nextv
ALTER TABLE ONLY public.evidence_hints ALTER COLUMN id SET DEFAULT nextval('public.evidence_hints_id_seq'::regclass);


--
-- Name: evidence_labeled_entries id; Type: DEFAULT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evidence_labeled_entries ALTER COLUMN id SET DEFAULT nextval('public.evidence_labeled_entries_id_seq'::regclass);


--
-- Name: evidence_prompt_healths id; Type: DEFAULT; Schema: public; Owner: -
--
Expand Down Expand Up @@ -8317,6 +8361,14 @@ ALTER TABLE ONLY public.evidence_hints
ADD CONSTRAINT evidence_hints_pkey PRIMARY KEY (id);


--
-- Name: evidence_labeled_entries evidence_labeled_entries_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evidence_labeled_entries
ADD CONSTRAINT evidence_labeled_entries_pkey PRIMARY KEY (id);


--
-- Name: evidence_prompt_healths evidence_prompt_healths_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--
Expand Down Expand Up @@ -9922,6 +9974,20 @@ CREATE INDEX index_evidence_automl_models_on_prompt_id ON public.evidence_automl
CREATE INDEX index_evidence_hints_on_rule_id ON public.evidence_hints USING btree (rule_id);


--
-- Name: index_evidence_labeled_entries_on_prompt_id; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX index_evidence_labeled_entries_on_prompt_id ON public.evidence_labeled_entries USING btree (prompt_id);


--
-- Name: index_evidence_labeled_entries_on_prompt_id_and_entry; Type: INDEX; Schema: public; Owner: -
--

CREATE UNIQUE INDEX index_evidence_labeled_entries_on_prompt_id_and_entry ON public.evidence_labeled_entries USING btree (prompt_id, entry);


--
-- Name: index_evidence_prompt_healths_on_evidence_activity_health_id; Type: INDEX; Schema: public; Owner: -
--
Expand Down Expand Up @@ -11660,6 +11726,7 @@ ALTER TABLE ONLY public.learn_worlds_account_course_events
SET search_path TO "$user", public;

INSERT INTO "schema_migrations" (version) VALUES
('20240925185730'),
('20240924151321'),
('20240924151311'),
('20240918144926'),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# frozen_string_literal: true

# == Schema Information
#
# Table name: evidence_labeled_entries
#
# id :bigint not null, primary key
# approved :boolean
# embedding :vector(1536) not null
# entry :text not null
# label :text not null
# label_transformed :text not null
# metadata :jsonb
# created_at :datetime not null
# updated_at :datetime not null
# prompt_id :integer not null
#
# Indexes
#
# index_evidence_labeled_entries_on_prompt_id (prompt_id)
# index_evidence_labeled_entries_on_prompt_id_and_entry (prompt_id,entry) UNIQUE
#

require 'neighbor'

module Evidence
  # A labeled text entry belonging to a prompt, stored alongside a vector
  # embedding of the entry text so that the closest other labeled entry for
  # the same prompt can be looked up by vector distance.
  class LabeledEntry < ApplicationRecord
    # Dimension and model are coupled: https://platform.openai.com/docs/guides/embeddings
    DIMENSION = 1536
    MODEL = 'text-embedding-3-small'

    DISTANCE_METRIC = 'cosine'
    COLLAPSED_OPTIMAL_LABEL = 'Optimal'

    belongs_to :prompt

    has_neighbors :embedding

    validates :embedding, presence: true
    validates :label, presence: true
    validates :label_transformed, presence: true
    validates :prompt, presence: true
    validates :entry, presence: true

    # Order matters: the entry must be stripped *before* the embedding is
    # fetched, otherwise the stored embedding describes the un-stripped text
    # while the stored entry is the stripped text.
    before_validation :set_entry, :set_embedding, :set_transformed_label

    # Returns the first record from the neighbor gem's `nearest_neighbors`
    # query on `embedding` (using DISTANCE_METRIC), restricted to records with
    # the same prompt_id. Returns nil when the query yields no record.
    def nearest_neighbor
      nearest_neighbors(:embedding, distance: DISTANCE_METRIC)
        .where(prompt_id:)
        .first
    end

    # Returns a hash of the form { distance:, label: } describing the nearest
    # neighbor. Both values are nil when there is no neighbor.
    def nearest_label
      neighbor = nearest_neighbor

      { distance: neighbor&.neighbor_distance, label: neighbor&.label }
    end

    # Strips surrounding whitespace from the entry. Blank entries are left
    # untouched so the presence validation reports them.
    private def set_entry
      self.entry = entry.strip if entry.present?
    end

    # Fetches an embedding for the entry unless the entry is blank or an
    # embedding has already been assigned.
    # NOTE(review): an existing embedding is never refreshed, so changing
    # `entry` on a record that already has an embedding keeps the old vector --
    # confirm this is intended.
    private def set_embedding
      return if entry.blank? || embedding.present?

      self.embedding = Evidence::OpenAI::EmbeddingFetcher.run(dimension: DIMENSION, input: entry, model: MODEL)
    end

    # Derives label_transformed from label: labels of the form "Optimal_<digits>"
    # collapse to COLLAPSED_OPTIMAL_LABEL; any other label is copied as is.
    private def set_transformed_label
      self.label_transformed =
        if label.present? && label.match?(/\AOptimal_\d+\z/)
          COLLAPSED_OPTIMAL_LABEL
        else
          label
        end
    end
  end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

require 'neighbor'
class CreateLabeledEntries < ActiveRecord::Migration[7.1]
  # Creates the evidence_labeled_entries table and its indexes.
  #
  # The column list and indexes match the schema annotation on
  # Evidence::LabeledEntry and the host application's copy of this migration,
  # both of which include the nullable `approved` flag and the unique
  # (prompt_id, entry) index.
  def change
    create_table :evidence_labeled_entries do |t|
      t.boolean :approved
      t.text :entry, null: false
      t.text :label, null: false
      t.text :label_transformed, null: false
      t.jsonb :metadata
      t.integer :prompt_id, null: false
      # The `vector` column type is not core Rails; this file requires the
      # neighbor gem before the class is defined.
      t.vector :embedding, limit: 1536, null: false

      t.timestamps
    end

    add_index :evidence_labeled_entries, :prompt_id
    # An entry's text may appear at most once per prompt.
    add_index :evidence_labeled_entries, [:prompt_id, :entry], unique: true
  end
end
59 changes: 59 additions & 0 deletions services/QuillLMS/engines/evidence/spec/dummy/db/structure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,42 @@ CREATE SEQUENCE public.evidence_hints_id_seq
ALTER SEQUENCE public.evidence_hints_id_seq OWNED BY public.evidence_hints.id;


--
-- Name: evidence_labeled_entries; Type: TABLE; Schema: public; Owner: -
--

-- Backing table for the Evidence::LabeledEntry model in the engine's dummy app.
-- NOTE(review): the schema annotations on the model and its factory list an
-- `approved` boolean column and a unique (prompt_id, entry) index; this dump
-- has neither -- confirm whether the engine migration should add them.
CREATE TABLE public.evidence_labeled_entries (
id bigint NOT NULL, -- default is set further down from evidence_labeled_entries_id_seq
entry text NOT NULL,
label text NOT NULL,
label_transformed text NOT NULL, -- filled in by the model before validation from label
metadata jsonb,
prompt_id integer NOT NULL,
embedding public.vector(1536) NOT NULL, -- 1536 matches Evidence::LabeledEntry::DIMENSION
created_at timestamp(6) without time zone NOT NULL,
updated_at timestamp(6) without time zone NOT NULL
);


--
-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

CREATE SEQUENCE public.evidence_labeled_entries_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;


--
-- Name: evidence_labeled_entries_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--

ALTER SEQUENCE public.evidence_labeled_entries_id_seq OWNED BY public.evidence_labeled_entries.id;


--
-- Name: evidence_prompt_healths; Type: TABLE; Schema: public; Owner: -
--
Expand Down Expand Up @@ -1760,6 +1796,13 @@ ALTER TABLE ONLY public.evidence_automl_models ALTER COLUMN id SET DEFAULT nextv
ALTER TABLE ONLY public.evidence_hints ALTER COLUMN id SET DEFAULT nextval('public.evidence_hints_id_seq'::regclass);


--
-- Name: evidence_labeled_entries id; Type: DEFAULT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evidence_labeled_entries ALTER COLUMN id SET DEFAULT nextval('public.evidence_labeled_entries_id_seq'::regclass);


--
-- Name: evidence_prompt_healths id; Type: DEFAULT; Schema: public; Owner: -
--
Expand Down Expand Up @@ -2102,6 +2145,14 @@ ALTER TABLE ONLY public.evidence_hints
ADD CONSTRAINT evidence_hints_pkey PRIMARY KEY (id);


--
-- Name: evidence_labeled_entries evidence_labeled_entries_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evidence_labeled_entries
ADD CONSTRAINT evidence_labeled_entries_pkey PRIMARY KEY (id);


--
-- Name: evidence_prompt_healths evidence_prompt_healths_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--
Expand Down Expand Up @@ -2444,6 +2495,13 @@ CREATE INDEX index_evidence_automl_models_on_prompt_id ON public.evidence_automl
CREATE INDEX index_evidence_hints_on_rule_id ON public.evidence_hints USING btree (rule_id);


--
-- Name: index_evidence_labeled_entries_on_prompt_id; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX index_evidence_labeled_entries_on_prompt_id ON public.evidence_labeled_entries USING btree (prompt_id);


--
-- Name: index_evidence_prompt_healths_on_evidence_activity_health_id; Type: INDEX; Schema: public; Owner: -
--
Expand Down Expand Up @@ -2514,6 +2572,7 @@ ALTER TABLE ONLY public.comprehension_regex_rules
SET search_path TO "$user", public;

INSERT INTO "schema_migrations" (version) VALUES
('20240925184213'),
('20240918144745'),
('20240828221309'),
('20240823204315'),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# frozen_string_literal: true

# == Schema Information
#
# Table name: evidence_labeled_entries
#
# id :bigint not null, primary key
# approved :boolean
# embedding :vector(1536) not null
# entry :text not null
# label :text not null
# label_transformed :text not null
# metadata :jsonb
# created_at :datetime not null
# updated_at :datetime not null
# prompt_id :integer not null
#
# Indexes
#
# index_evidence_labeled_entries_on_prompt_id (prompt_id)
# index_evidence_labeled_entries_on_prompt_id_and_entry (prompt_id,entry) UNIQUE
#

FactoryBot.define do
  # Builds an Evidence::LabeledEntry with a random sentence as its entry, a
  # random embedding of the model's dimension and a random "Label_<n>" label.
  # Because an embedding is supplied up front, the model's set_embedding
  # callback returns early and makes no embedding request.
  factory :evidence_labeled_entry, class: 'Evidence::LabeledEntry' do
    entry { Faker::Lorem.sentence }
    embedding do
      # One random component in -1.0..1.0 per embedding dimension.
      Evidence::LabeledEntry::DIMENSION.times.map { rand(-1.0..1.0) }
    end
    label { "Label_#{rand(0..10)}" }
    label_transformed { label }

    association :prompt, factory: :evidence_prompt
  end
end
Loading

0 comments on commit b4d8a42

Please sign in to comment.